## 기본적인 세팅

In [46]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
import joblib

## 데이터셋 불러오기

In [47]:
#csv 파일 읽어오기
data = pd.read_csv('./one_to_one.csv')
data.drop('url', axis=1,inplace=True)
data.drop('status_code', axis=1,inplace=True)

#XGBoost 내에서 ]와 [를 못 읽는다. 이에 해당을 =+ +=로 각각 수정
data.rename(columns={']':'=+'},inplace=True)
data.rename(columns={'[':'+='},inplace=True)

In [48]:
#X에 모든 피처들에 대한 값을 저장
X = data.drop('type', axis=1)
#Y에 각 피처에 대한 결과값을 저장
y = data['type']

# 8:2로 train과 test로 구분
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(598, 27) (150, 27) (598,) (150,)


## XGBoost 적용하기

In [49]:
#XGBoost에서 처리하기 위한 데이터 세트
dtrain = xgb.DMatrix(data = X_train, label = y_train)
dtest = xgb.DMatrix(data = X_test, label = y_test)

In [50]:
#하이퍼 파라미터 세팅
params = {'max_depth': 3,
          'eta': 0.1,
          'eval_metric':'logloss',
          }
num_rounds = 400

wlist = [(dtrain,'train'),(dtest,'test')]

In [51]:
xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = num_rounds, early_stopping_rounds=25, evals = wlist)

[0]	train-logloss:0.59965	test-logloss:0.60296
[1]	train-logloss:0.52244	test-logloss:0.52878
[2]	train-logloss:0.45777	test-logloss:0.46692
[3]	train-logloss:0.40297	test-logloss:0.41478
[4]	train-logloss:0.35615	test-logloss:0.37046
[5]	train-logloss:0.31584	test-logloss:0.33255
[6]	train-logloss:0.28094	test-logloss:0.29995
[7]	train-logloss:0.25057	test-logloss:0.27180
[8]	train-logloss:0.22404	test-logloss:0.24743
[9]	train-logloss:0.20079	test-logloss:0.22626
[10]	train-logloss:0.18035	test-logloss:0.20786
[11]	train-logloss:0.16233	test-logloss:0.19183
[12]	train-logloss:0.14642	test-logloss:0.17787
[13]	train-logloss:0.13235	test-logloss:0.16570
[14]	train-logloss:0.11988	test-logloss:0.15510
[15]	train-logloss:0.10882	test-logloss:0.14588
[16]	train-logloss:0.09900	test-logloss:0.13786
[17]	train-logloss:0.09028	test-logloss:0.13090
[18]	train-logloss:0.08247	test-logloss:0.12497
[19]	train-logloss:0.07554	test-logloss:0.11978
[20]	train-logloss:0.06931	test-logloss:0.11541
[2

In [40]:
pred_probs = xgb_model.predict(dtest)
print(np.round(pred_probs[:10],3))
preds = [1 if x> 0.5 else 0 for x in pred_probs]
print("예측값 10개만 표시 : ", preds[:10])
print("본래의 값",dtest.get_label()[:10])

[ 1.001e+00  0.000e+00  0.000e+00  5.000e-03  4.000e-03  1.001e+00
 -1.000e-03  2.900e-02  9.920e-01  1.001e+00]
예측값 10개만 표시 :  [1, 0, 0, 0, 0, 1, 0, 0, 1, 1]
본래의 값 [1. 0. 0. 0. 0. 1. 0. 0. 1. 1.]


In [52]:
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

In [53]:
get_clf_eval(y_test, preds)

오차행렬:
 [[81  0]
 [ 3 66]]

정확도: 0.9800
정밀도: 1.0000
재현율: 0.9565
F1: 0.9778
AUC: 0.9783


In [54]:
len(X_train.columns)

27

In [55]:
df = xgb.DMatrix(pd.DataFrame([np.zeros(27)],columns = X_train.columns))

In [56]:
#판단
w_pred = xgb_model.predict(df)
w_pred

array([0.00461719], dtype=float32)

In [57]:
joblib.dump(xgb_model,"./XGBoost_model_2.pkl")

['./XGBoost_model_2.pkl']

In [23]:
X_test.columns

Index(['http', 'https', 'www', 'IP', 'short_url', '!', '*', ''', '(', ')', ';',
       ':', '@', '&', '=', '+', '$', '"', ',', '/', '?', '%', '#', '+=', '=+',
       'total_len', 'tld_len'],
      dtype='object')