In [243]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as spst

In [244]:
train = pd.read_csv("C:/Users/User/Desktop/mini2/train.csv")
test = pd.read_csv("C:/Users/User/Desktop/mini2/test.csv")
sample = pd.read_csv("C:/Users/User/Desktop/mini2/sample_submission.csv")

In [245]:
train.dropna(axis=0, inplace = True)

In [246]:
train.columns

Index(['url_len', 'url_num_hyphens_dom', 'url_path_len', 'url_domain_len',
       'url_hostname_len', 'url_num_dots', 'url_num_underscores',
       'url_query_len', 'url_num_query_para', 'url_ip_present', 'url_entropy',
       'url_chinese_present', 'url_port', 'html_num_tags('iframe')',
       'html_num_tags('script')', 'html_num_tags('embed')',
       'html_num_tags('object')', 'html_num_tags('div')',
       'html_num_tags('head')', 'html_num_tags('body')',
       'html_num_tags('form')', 'html_num_tags('a')',
       'html_num_tags('applet')', 'label'],
      dtype='object')

In [247]:
train = train.drop([ 'url_port', 'url_chinese_present', "html_num_tags('applet')"], axis = 1)

In [248]:
train['label'] = train['label'].map({'malicious': 1, 'benign' : 0})

In [249]:
from sklearn.model_selection import train_test_split

# Target 확인
target = 'label'

# 데이터 분리
x = train.drop(target, axis=1)
y = train.loc[:, target]

# 학습용, 평가용 데이터 8:2으로 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=1)

In [250]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 선언하기
model = DecisionTreeClassifier(random_state=1)

# 성능검증
cv_score = cross_val_score(model, x_train, y_train, cv=10)

# 확인
print('성능:', cv_score.round(2))
print('평균:', cv_score.mean().round(2))

성능: [0.94 0.93 0.93 0.93 0.94 0.93 0.92 0.95 0.94 0.93]
평균: 0.93


In [251]:
# 불러오기
from sklearn.metrics import confusion_matrix, classification_report

# 선언하기
model = DecisionTreeClassifier(max_depth=5, random_state=1)

# 학습하기
model.fit(x_train, y_train)

# 예측하기
y_pred = model.predict(x_test)

# 평가하기
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[258  16]
 [ 25 251]]
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       274
           1       0.94      0.91      0.92       276

    accuracy                           0.93       550
   macro avg       0.93      0.93      0.93       550
weighted avg       0.93      0.93      0.93       550



In [219]:
# 기본 모델 선언
model_dt = DecisionTreeClassifier(random_state=1)

# 파라미터 선언
params = {'max_depth': range(1, 51)}

# 모델 선언
model = GridSearchCV(model_dt,
                     params,
                     cv=5, 
                     scoring='f1')   

In [220]:
model.fit(x_train, y_train)

In [221]:
# 예측 결과 확인
print(model.best_params_)
print(model.best_score_) 

{'max_depth': 12}
0.9398145994757512


In [252]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

In [253]:
# 선언하기
model = RandomForestClassifier()

# 학습하기
model.fit(x_train, y_train)

# 예측하기
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       274
           1       0.98      0.96      0.97       276

    accuracy                           0.97       550
   macro avg       0.97      0.97      0.97       550
weighted avg       0.97      0.97      0.97       550



In [254]:
# 기본 모델 선언
model_dt = RandomForestClassifier(random_state=1)

# 파라미터 선언
params = {'max_depth': range(1, 51)}

# 모델 선언
model = GridSearchCV(model_dt,
                     params,
                     cv=5, 
                     scoring='f1_micro')   

In [255]:
model.fit(x_train, y_train)

In [256]:
# 예측 결과 확인
print(model.best_params_)
print(model.best_score_) 

{'max_depth': 15}
0.9620816193813774


In [227]:
columns = x_train.columns

In [228]:
test2 = test[columns]

In [229]:
test2.isna().sum()

url_len                      0
url_num_hyphens_dom          0
url_path_len               466
url_domain_len             466
url_hostname_len           463
url_num_dots                 0
url_num_underscores          0
url_query_len                0
url_num_query_para           0
url_ip_present               0
url_entropy                  0
html_num_tags('iframe')      0
html_num_tags('script')    467
html_num_tags('embed')       0
html_num_tags('object')      0
html_num_tags('div')         0
html_num_tags('head')        0
html_num_tags('body')        0
html_num_tags('form')      467
html_num_tags('a')         461
dtype: int64

In [230]:
test2["html_num_tags('a')"] = test2["html_num_tags('a')"].fillna(test2["html_num_tags('a')"].mean())
test2["url_path_len"] = test2["url_path_len"].fillna(test2["url_path_len"].mean())
test2["url_domain_len"] = test2["url_domain_len"].fillna(test2["url_domain_len"].mean())
test2["url_hostname_len"] = test2["url_hostname_len"].fillna(test2["url_hostname_len"].mean())
test2["html_num_tags('script')"] = test2["html_num_tags('script')"].fillna(test2["html_num_tags('script')"].mean())
test2["html_num_tags('form')"] = test2["html_num_tags('form')"].fillna(test2["html_num_tags('form')"].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test2["html_num_tags('a')"] = test2["html_num_tags('a')"].fillna(test2["html_num_tags('a')"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test2["url_path_len"] = test2["url_path_len"].fillna(test2["url_path_len"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test2["url_domain_len

In [231]:
test2.isna().sum()

url_len                    0
url_num_hyphens_dom        0
url_path_len               0
url_domain_len             0
url_hostname_len           0
url_num_dots               0
url_num_underscores        0
url_query_len              0
url_num_query_para         0
url_ip_present             0
url_entropy                0
html_num_tags('iframe')    0
html_num_tags('script')    0
html_num_tags('embed')     0
html_num_tags('object')    0
html_num_tags('div')       0
html_num_tags('head')      0
html_num_tags('body')      0
html_num_tags('form')      0
html_num_tags('a')         0
dtype: int64

In [232]:
X_test = test2

In [233]:
y_pred = model.predict(X_test)

In [236]:
sub_label = pd.Series(y_pred).map({1 : 'malicious', 0 : 'benign'})

In [239]:
sample = pd.read_csv("C:/Users/User/Desktop/mini2/sample_submission.csv")
sample['label'] = sub_label
sample = sample.set_index('id')

In [240]:
sample.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
0,benign
1,benign
2,benign
3,malicious
4,benign


In [242]:
sample.to_csv("C:/Users/User/Desktop/mini2/submission3.csv")

1