In [5]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [26]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [8]:
# '[.]'을 '.'으로 복구
train['URL'] = train['URL'].str.replace(r'\[\.\]', '.', regex=True)
test['URL'] = test['URL'].str.replace(r'\[\.\]', '.', regex=True)

In [9]:
## 새로운 변수 생성
# URL 길이
train['length'] = train['URL'].str.len()
test['length'] = test['URL'].str.len()

# 서브도메인 개수
train['subdomain_count'] = train['URL'].str.split('.').apply(lambda x: len(x) - 2)
test['subdomain_count'] = test['URL'].str.split('.').apply(lambda x: len(x) - 2)

# 특수 문자('-', '_', '/') 개수
train['special_char_count'] = train['URL'].apply(lambda x: sum(1 for c in x if c in '-_/'))
test['special_char_count'] = test['URL'].apply(lambda x: sum(1 for c in x if c in '-_/'))

# 디지털 문자 관련
train['digit_count'] = train['URL'].str.count(r'\d')
test['digit_count'] = test['URL'].str.count(r'\d')
train['digit_ratio'] = train['digit_count'] / train['length']
test['digit_ratio'] = test['digit_count'] / test['length']

# 대문자 관련
train['uppercase_count'] = train['URL'].str.count(r'[A-Z]')
test['uppercase_count'] = test['URL'].str.count(r'[A-Z]')
train['uppercase_ratio'] = train['uppercase_count'] / train['length']
test['uppercase_ratio'] = test['uppercase_count'] / test['length']

# 추가 특수문자
train['abnormal_chars'] = train['URL'].str.count(r'[^a-zA-Z0-9\-\./_]')
test['abnormal_chars'] = test['URL'].str.count(r'[^a-zA-Z0-9\-\./_]')
train['dots_count'] = train['URL'].str.count(r'\.')
test['dots_count'] = test['URL'].str.count(r'\.')

# URL 구조 관련
train['path_length'] = train['URL'].apply(lambda x: len(x.split('/')[-1]) if '/' in x else 0)
test['path_length'] = test['URL'].apply(lambda x: len(x.split('/')[-1]) if '/' in x else 0)

train['query_count'] = train['URL'].str.count(r'\?')
test['query_count'] = test['URL'].str.count(r'\?')

train['and_count'] = train['URL'].str.count(r'\&')
test['and_count'] = test['URL'].str.count(r'\&')


In [19]:
# 상관계수 계산
feature_cols = ['length', 'subdomain_count', 'special_char_count',
           'digit_count', 'digit_ratio', 'uppercase_count', 'uppercase_ratio',
           'abnormal_chars', 'dots_count', 'path_length', 'query_count', 'and_count']

correlation_matrix = train[feature_cols + ['label']].corr()

# label과의 상관관계 확인
label_corr = correlation_matrix['label'].abs().sort_values(ascending=False)
print("\n특성과 label의 상관관계 (절대값 기준 내림차순):")
print(label_corr)

# 상관계수 0.3 이상인 특성 선택
selected_features = label_corr[label_corr >= 0.3].index.tolist()
selected_features.remove('label')  # label 제외
print("\n선택된 특성:", selected_features)

# 최종 데이터셋 생성
final_train = train[['ID', 'URL', 'label'] + selected_features]
final_test = test[['ID', 'URL'] + selected_features]

print("\n최종 학습 데이터 shape:", final_train.shape)
print("최종 테스트 데이터 shape:", final_test.shape)


특성과 label의 상관관계 (절대값 기준 내림차순):
label                 1.000000
digit_ratio           0.363634
special_char_count    0.356997
dots_count            0.350023
subdomain_count       0.350023
length                0.326656
digit_count           0.251722
uppercase_ratio       0.241069
path_length           0.230710
query_count           0.216031
and_count             0.119879
uppercase_count       0.113967
abnormal_chars        0.040008
Name: label, dtype: float64

선택된 특성: ['digit_ratio', 'special_char_count', 'dots_count', 'subdomain_count', 'length']

최종 학습 데이터 shape: (6995056, 8)
최종 테스트 데이터 shape: (1747689, 7)


In [22]:
final_train.to_csv('../data/preprocessed_data/final_train.csv', index=False)
final_test.to_csv('../data/preprocessed_data/final_test.csv', index=False)

In [27]:
train

Unnamed: 0,ID,URL,label
0,TRAIN_0000000,poznan[.]wuoz[.]gov[.]pl,0
1,TRAIN_0000001,vill[.]okawa[.]kochi[.]jp,0
2,TRAIN_0000002,nationalfinance[.]co[.]om,0
3,TRAIN_0000003,town[.]ozora[.]hokkaido[.]jp,0
4,TRAIN_0000004,open24[.]ie-news[.]irish/online/Login,1
...,...,...,...
6995051,TRAIN_6995051,ddht[.]co[.]kr,0
6995052,TRAIN_6995052,www[.]upstartepoxy[.]com,0
6995053,TRAIN_6995053,employeesalaryschedule70[.]000webhostapp[.]com...,1
6995054,TRAIN_6995054,dekalbtool[.]com,0


In [28]:
test

Unnamed: 0,ID,URL
0,TEST_0000000,niquelarte[.]blogspot[.]com[.]es
1,TEST_0000001,northernmetalproducts[.]com
2,TEST_0000002,ga[.]de
3,TEST_0000003,florawww[.]eeb[.]uconn[.]edu/199300089[.]html
4,TEST_0000004,activecity[.]moscow
...,...,...
1747684,TEST_1747684,culliganlaredo[.]com
1747685,TEST_1747685,tenzidetailer[.]pl
1747686,TEST_1747686,club-hippique-neubourg[.]fr
1747687,TEST_1747687,smbcyt[.]com/


In [31]:
final_train

Unnamed: 0,ID,URL,label,digit_ratio,special_char_count,dots_count,subdomain_count,length
0,TRAIN_0000000,poznan.wuoz.gov.pl,0,0.000000,0,3,2,18
1,TRAIN_0000001,vill.okawa.kochi.jp,0,0.000000,0,3,2,19
2,TRAIN_0000002,nationalfinance.co.om,0,0.000000,0,2,1,21
3,TRAIN_0000003,town.ozora.hokkaido.jp,0,0.000000,0,3,2,22
4,TRAIN_0000004,open24.ie-news.irish/online/Login,1,0.060606,3,2,1,33
...,...,...,...,...,...,...,...,...
6995051,TRAIN_6995051,ddht.co.kr,0,0.000000,0,2,1,10
6995052,TRAIN_6995052,www.upstartepoxy.com,0,0.000000,0,2,1,20
6995053,TRAIN_6995053,employeesalaryschedule70.000webhostapp.com/adb...,1,0.100000,2,2,1,50
6995054,TRAIN_6995054,dekalbtool.com,0,0.000000,0,1,0,14
