### 기본 세팅

In [43]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold

In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
# 임베딩 배열을 데이터프레임으로 변환
def make_df(data):

    array = data[0]     # 첫번째 array
    error = []          # 에러 인덱스 리스트


    for index in range(len(data)-1):
        plus_array = data[index+1]     # 추가할 array
        try:
            array = np.concatenate((array, plus_array), axis=0)
        except:
            print(index+1, '번 인덱스 오류 처리 완료')
            error.append(index+1)     # 에러 인덱스 저장

        # 201건부터는 모두 보이스피싱 텍스트 임베딩값
        # 보이스피싱 여부를 붙이고자 phishing_check로 인덱스 지정
        if (index+1) == 199:
            phishing_check = array.shape[0] + 1


    # 데이터프레임 변환 및 레이블 지정
    df = pd.DataFrame(array)
    df['phishing'] = 0                        # 상담 텍스트 임베딩값
    df.loc[phishing_check:,'phishing'] = 1    # 보이스파싱 텍스트 임베딩값
    return df

### train, validation 분할

In [46]:
def data_split(df):
    feature = df.iloc[:,:-1]      # 피처 칼럼: 예측에 사용되는 데이터
    target  = df['phishing']      # 타겟 칼럼: 예측(분류) 대상 데이터
    x_train, x_valid, y_train, y_valid = train_test_split(feature, target, test_size=0.2,
                                                          shuffle=True,
                                                          stratify=target,
                                                          random_state=1234)
    return x_train, x_valid, y_train, y_valid, feature, target

### 분류 모델 세팅

In [47]:
# 분류 score 계산
def score(pred, y_test):
    print(' accuracy  : ', accuracy_score(y_test, pred))
    print(' f1-score  : ', f1_score(y_test, pred))
    print(' recall    : ', recall_score(y_test, pred))
    print(' precision : ', precision_score(y_test, pred))
    return accuracy_score(y_test, pred), f1_score(y_test, pred), recall_score(y_test, pred), precision_score(y_test, pred)

In [48]:
# score result에 결과 추가
def result_append(score_result, scores):
    score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)
    return score_result

In [49]:
# 최종 결과 리스트
score_result = pd.DataFrame(columns=['logistic_Acc', 'logistic_F1', 'logistic_Rec', 'logistic_Pre',
                                     'naivebayes_Acc', 'naivebayes_F1', 'naivebayes_Rec', 'naivebayes_Pre',
                                     'randomforest_Acc', 'logistic_F1', 'naivebayes_Rec', 'randomforest_Pre',
                                     'logistic_5fold_Acc', 'logistic_5fold_F1', 'logistic_5fold_Rec', 'logistic_5fold_Pre',
                                     'naivebayes_5Fold_Acc', 'naivebayes_5Fold_F1', 'naivebayes_5Fold_Rec', 'naivebayes_5Fold_Pre',
                                     'randomforest_5Fold_Acc', 'randomforest_5Fold_F1', 'randomforest_5Fold_Rec', 'randomforest_5Fold_Pre'])

### Logistic Regression

In [50]:
def logistic_reg(x_train, y_train, x_valid, y_valid):

    # 모델 분류 수행
    reg = LogisticRegression(random_state=0, max_iter=500)
    reg.fit(x_train, y_train)
    pred = reg.predict(x_valid)


    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### Naive Bayes Classification

In [51]:
def naivebayes_clf(x_train, y_train, x_valid, y_valid):

    # 모델 분류 수행
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    pred = gnb.predict(x_valid)


    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### RandomForest Classification

In [52]:
def randomforest_clf(x_train, y_train, x_valid, y_valid):

    # 모델 분류 수행
    clf = RandomForestClassifier(max_depth=16, random_state=0)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_valid)


    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### K-Fold Cross Validation

In [53]:
def kfold_clf(model, feature, target):

    # 5개 폴드세트를 분리하는 kFold 객체와
    # 폴드세트별 score 값을 담을 리스트
    kfold = KFold(n_splits=5, random_state=0, shuffle=True)
    cv_accuracy = []
    cv_f1score = []
    cv_precision = []
    cv_recall = []


    n_iter = 0
    for train_index, validate_index in kfold.split(feature):
        # 데이터셋 지정
        X_train, X_valid = feature.iloc[train_index,:], feature.iloc[validate_index,:]
        y_train, y_valid = target[train_index], target[validate_index]


        # 학습 및 예측
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        n_iter += 1


        # 데이터 size 확인
        train_size = X_train.shape[0]
        test_size = X_valid.shape[0]
        print(f' ========= {n_iter} =========')


        # iteration마다 score 측정
        accuracy,f1score,precision, recall = score(pred, y_valid)


        # iteration 별 score 값 저장
        cv_accuracy.append(accuracy)
        cv_f1score.append(f1score)
        cv_precision.append(precision)
        cv_recall.append(recall)


    # 개별 iteration별 정확도를 합한 평균 계산
    print(' ======== 최종 ========')
    print(' 평균검증 정확도   : ', np.mean(cv_accuracy))
    print(' 평균검증 f1-score : ', np.mean(cv_f1score))
    print(' 평균검증 정밀도   : ', np.mean(cv_precision))
    print(' 평균검증 재현율   : ', np.mean(cv_recall))
    print(' \n')
    return np.mean(cv_accuracy), np.mean(cv_f1score), np.mean(cv_precision), np.mean(cv_recall)

### 전체 분류 모델 통합

In [54]:
def total_model(x_train,y_train,test_x,test_y):

    
    score_list = []
    print(' 데이터 가공 완료\n')


    # Logistic Regression
    print('      ----------    Logistic Regression Result    ----------      ')
    logi1, logi2, logi3, logi4 = logistic_reg(x_train, y_train, test_x, test_y)
    score_list += [logi1, logi2, logi3, logi4]


    # Naive Bayes Classification
    print('      ---------- Naive Bayes Classification Result ----------      ')
    naive1, naive2, naive3, naive4 = naivebayes_clf(x_train, y_train, test_x, test_y)
    score_list += [naive1, naive2, naive3, naive4]


    # RandomForest Classification
    print('      ---------- RandomForest Classifcation Result ----------      ')
    rf1, rf2, rf3, rf4 = randomforest_clf(x_train, y_train, test_x, test_y)
    score_list += [rf1, rf2, rf3, rf4]


    # 5-Fold Logistic Regression
    print(' \n')
    print('   ----------    [5-Fold] Logistic Regression Result    ----------   ')
    lgb = LogisticRegression(random_state=0, max_iter=500)
    foldlogi1, foldlogi2, foldlogi3, foldlogi4 = kfold_clf(lgb, test_x, test_y)
    score_list += [foldlogi1, foldlogi2, foldlogi3, foldlogi4]


    # 5-Fold NaiveBayes
    print('   ---------- [5-Fold] Naive Bayes Classification Result ----------   ')
    gnb = GaussianNB()
    foldnb1, foldnb2, foldnb3, foldnb4 = kfold_clf(gnb, test_x, test_y)
    score_list += [foldnb1, foldnb2, foldnb3, foldnb4]


    # 5-Fold RandomForest Classification
    print('   ---------- [5-Fold] RandomForest Classifcation Result ----------   ')
    clf = RandomForestClassifier(max_depth=16, random_state=0)
    foldrf1, foldrf2, foldrf3, foldrf4 = kfold_clf(clf, test_x, test_y)
    score_list += [foldrf1, foldrf2, foldrf3, foldrf4]
    return score_list

### 모델 이용 데이터셋 분류: original

In [55]:
# 데이터 로드
origin_2gram = np.load('./path/original_2gram_embedding.npy', allow_pickle=True)
origin_3gram = np.load('./path/original_3gram_embedding.npy', allow_pickle=True)
origin_4gram = np.load('./path/original_4gram_embedding.npy', allow_pickle=True)

In [56]:
# original_2gram
ori_2g = total_model(origin_2gram)
score_result = result_append(score_result, ori_2g)

104 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.8026895432413633
 f1-score  :  0.7186776859504133
 recall    :  0.7160737812911726
 precision :  0.721300597213006
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.6760955251565036
 f1-score  :  0.6190346332151624
 recall    :  0.7476943346508564
 precision :  0.5281526291298279
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.7651286807326687
 f1-score  :  0.6243974786800148
 recall    :  0.5546772068511199
 precision :  0.714164546225615
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.790864827266404
 f1-score  :  0.7050359712230215
 recall    :  0.7050359712230215
 precision :  0.7050359712230215
 accuracy  :  0.8080222582888941
 f1-score  :  0.7288801571709234
 recall    :  0.7395348837209302
 precision :  0.718528082633957

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [57]:
# original_3gram
ori_3g = total_model(origin_3gram)
score_result = result_append(score_result, ori_3g)

64 번 인덱스 오류 처리 완료
104 번 인덱스 오류 처리 완료
116 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
134 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.83843617522374
 f1-score  :  0.7792792792792793
 recall    :  0.812751677852349
 precision :  0.7484548825710754
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7105511069241639
 f1-score  :  0.6525303929884083
 recall    :  0.774496644295302
 precision :  0.563751831949194
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.796514366462553
 f1-score  :  0.6914285714285714
 recall    :  0.6496644295302013
 precision :  0.7389312977099237
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.8280734809232219
 f1-score  :  0.7533783783783783
 recall    :  0.7652711050102952
 precision :  0.7418496340652029
 accuracy  :  0.8228921337729628
 f1-score  :  0.758974358974359
 recall    :

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [58]:
# original_4gram
ori_4g = total_model(origin_4gram)
score_result = result_append(score_result, ori_4g)

64 번 인덱스 오류 처리 완료
91 번 인덱스 오류 처리 완료
104 번 인덱스 오류 처리 완료
116 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
131 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
134 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.8454175640105288
 f1-score  :  0.7827841291190317
 recall    :  0.7967145790554415
 precision :  0.7693324520819563
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7202680067001676
 f1-score  :  0.6596797671033477
 recall    :  0.7754962354551677
 precision :  0.5739614994934144
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8224455611390284
 f1-score  :  0.7309644670050761
 recall    :  0.6899383983572895
 precision :  0.7771781033153431
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.8492462311557789
 f1-score  :  0.783356258596974
 recall    :  0.8021126760563381
 precision :  0.7654569892473119
 accuracy  :  0.8415888968652788
 f

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [59]:
# 나온 결과 확인
score_result

Unnamed: 0,logistic_Acc,logistic_F1,logistic_Rec,logistic_Pre,naivebayes_Acc,naivebayes_F1,naivebayes_Rec,naivebayes_Pre,randomforest_Acc,logistic_F1.1,...,logistic_5fold_Rec,logistic_5fold_Pre,naivebayes_5Fold_Acc,naivebayes_5Fold_F1,naivebayes_5Fold_Rec,naivebayes_5Fold_Pre,randomforest_5Fold_Acc,randomforest_5Fold_F1,randomforest_5Fold_Rec,randomforest_5Fold_Pre
0,0.80269,0.718678,0.716074,0.721301,0.676096,0.619035,0.747694,0.528153,0.765129,0.624397,...,0.732543,0.714478,0.677116,0.618273,0.743248,0.529307,0.770971,0.637507,0.572475,0.719289
1,0.838436,0.779279,0.812752,0.748455,0.710551,0.65253,0.774497,0.563752,0.796514,0.691429,...,0.781084,0.74603,0.707684,0.648572,0.769117,0.560772,0.796957,0.689493,0.642843,0.74352
2,0.845418,0.782784,0.796715,0.769332,0.720268,0.65968,0.775496,0.573961,0.822446,0.730964,...,0.808992,0.76853,0.731237,0.672366,0.788459,0.58613,0.825244,0.738567,0.705796,0.774678


### 모델 이용 데이터셋 분류: eng

In [60]:
# 데이터 로드
eng_ratio5_2gram = np.load('./path/eng_ratio5_2gram_embedding.npy', allow_pickle=True)
eng_ratio5_3gram = np.load('./path/eng_ratio5_3gram_embedding.npy', allow_pickle=True)
eng_ratio5_4gram = np.load('./path/eng_ratio5_4gram_embedding.npy', allow_pickle=True)

eng_ratio10_2gram = np.load('./path/eng_ratio10_2gram_embedding.npy', allow_pickle=True)
eng_ratio10_3gram = np.load('./path/eng_ratio10_3gram_embedding.npy', allow_pickle=True)
eng_ratio10_4gram = np.load('./path/eng_ratio10_4gram_embedding.npy', allow_pickle=True)

In [61]:
# eng_ratio5_2gram
eng_r5_2g = total_model(eng_ratio5_2gram)
score_result = result_append(score_result, eng_r5_2g)

304 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9230769230769231
 f1-score  :  0.9239230064161321
 recall    :  0.9218106995884774
 precision :  0.9260450160771704
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7511584800741428
 f1-score  :  0.7468175388967467
 recall    :  0.7242798353909465
 precision :  0.7708029197080292
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8635310472659871
 f1-score  :  0.8658008658008658
 recall    :  0.8687700045724737
 precision :  0.8628519527702089
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9219184430027804
 f1-score  :  0.9225821272685505
 recall    :  0.9177330895795247
 precision :  0.9274826789838337
 accuracy  :  0.9230769230769231
 f1-score  :  0.9246139872842869
 recall    :  0.9275626423690205
 precision :  0.921684019918

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [62]:
# eng_ratio5_3gram
eng_r5_3g = total_model(eng_ratio5_3gram)
score_result = result_append(score_result, eng_r5_3g)

264 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9548128971522711
 f1-score  :  0.9555966697502313
 recall    :  0.9564814814814815
 precision :  0.9547134935304991
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7905389503412568
 f1-score  :  0.7925407925407926
 recall    :  0.7870370370370371
 precision :  0.7981220657276995
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8985643680866087
 f1-score  :  0.9009878244888583
 recall    :  0.9078703703703703
 precision :  0.8942088463292294
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9416333254883502
 f1-score  :  0.9422180801491147
 recall    :  0.9461862423958821
 precision :  0.9382830626450116
 accuracy  :  0.9510473052482937
 f1-score  :  0.9523809523809523
 rec

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [63]:
# eng_ratio5_4gram
eng_r5_4g = total_model(eng_ratio5_4gram)
score_result = result_append(score_result, eng_r5_4g)

264 번 인덱스 오류 처리 완료
291 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
331 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9720229555236729
 f1-score  :  0.9726443768996961
 recall    :  0.9751523675574308
 precision :  0.9701492537313433
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.8137254901960784
 f1-score  :  0.814920408648135
 recall    :  0.804031879981247
 precision :  0.8261078998073218
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.9167862266857962
 f1-score  :  0.9194817214252661
 recall    :  0.9315518049695265
 precision :  0.9077204202832344
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.966284074605452
 f1-score  :  0.966579758236549
 recall    :  0.9672675521821632
 precision :  0.9658929417337755
 accuracy  :  0.9751315160210425
 f1

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [64]:
# eng_ratio10_2gram
eng_r10_2g = total_model(eng_ratio10_2gram)
score_result = result_append(score_result, eng_r10_2g)

304 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9230769230769231
 f1-score  :  0.9238532110091744
 recall    :  0.9208962048468221
 precision :  0.926829268292683
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7493049119555144
 f1-score  :  0.7466042154566745
 recall    :  0.7288523090992227
 precision :  0.7652424387902065
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8591288229842446
 f1-score  :  0.8615034168564921
 recall    :  0.8646547782350251
 precision :  0.8583749432591921
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9212233549582948
 f1-score  :  0.9219108865411115
 recall    :  0.9172760511882998
 precision :  0.9265927977839336
 accuracy  :  0.9198331788693235
 f1-score  :  0.9213636363636364
 recall    :  0.9234624145785877
 precision :  0.9192743764172

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [65]:
# eng_ratio10_3gram
eng_r10_3g = total_model(eng_ratio10_3gram)
score_result = result_append(score_result, eng_r10_3g)

264 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.953165450694281
 f1-score  :  0.9539031735001158
 recall    :  0.9532407407407407
 precision :  0.9545665275846082
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7841845140032949
 f1-score  :  0.7877806063411248
 recall    :  0.787962962962963
 precision :  0.7875983341045812
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8938573782066369
 f1-score  :  0.8966307586523035
 recall    :  0.9055555555555556
 precision :  0.887880163413527
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9456342668863262
 f1-score  :  0.9459901800327332
 recall    :  0.9466541881141788
 precision :  0.9453271028037383
 accuracy  :  0.9510473052482937
 f1-score  :  0.9524680073126143
 recall

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [66]:
# eng_ratio10_4gram
eng_r10_4g = total_model(eng_ratio10_4gram)
score_result = result_append(score_result, eng_r10_4g)

264 번 인덱스 오류 처리 완료
291 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
331 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9681970349115255
 f1-score  :  0.9689034369885433
 recall    :  0.9714017815283638
 precision :  0.9664179104477612
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.810856049736968
 f1-score  :  0.8137508829762186
 recall    :  0.810126582278481
 precision :  0.8174077578051088
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.912960306073649
 f1-score  :  0.9156626506024096
 recall    :  0.9263947491795593
 precision :  0.9051763628034815
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9672405547584888
 f1-score  :  0.9675124496087266
 recall    :  0.967741935483871
 precision :  0.9672830725462305
 accuracy  :  0.9734576757532282
 f1

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [67]:
# 나온 결과 확인
score_result

Unnamed: 0,logistic_Acc,logistic_F1,logistic_Rec,logistic_Pre,naivebayes_Acc,naivebayes_F1,naivebayes_Rec,naivebayes_Pre,randomforest_Acc,logistic_F1.1,...,logistic_5fold_Rec,logistic_5fold_Pre,naivebayes_5Fold_Acc,naivebayes_5Fold_F1,naivebayes_5Fold_Rec,naivebayes_5Fold_Pre,randomforest_5Fold_Acc,randomforest_5Fold_F1,randomforest_5Fold_Rec,randomforest_5Fold_Pre
0,0.80269,0.718678,0.716074,0.721301,0.676096,0.619035,0.747694,0.528153,0.765129,0.624397,...,0.732543,0.714478,0.677116,0.618273,0.743248,0.529307,0.770971,0.637507,0.572475,0.719289
1,0.838436,0.779279,0.812752,0.748455,0.710551,0.65253,0.774497,0.563752,0.796514,0.691429,...,0.781084,0.74603,0.707684,0.648572,0.769117,0.560772,0.796957,0.689493,0.642843,0.74352
2,0.845418,0.782784,0.796715,0.769332,0.720268,0.65968,0.775496,0.573961,0.822446,0.730964,...,0.808992,0.76853,0.731237,0.672366,0.788459,0.58613,0.825244,0.738567,0.705796,0.774678
3,0.923077,0.923923,0.921811,0.926045,0.751158,0.746818,0.72428,0.770803,0.863531,0.865801,...,0.923836,0.923268,0.753383,0.751539,0.736221,0.767551,0.866311,0.868395,0.870649,0.866235
4,0.954813,0.955597,0.956481,0.954713,0.790539,0.792541,0.787037,0.798122,0.898564,0.900988,...,0.953203,0.952394,0.791828,0.792048,0.780076,0.804419,0.894831,0.897044,0.901727,0.892472
5,0.972023,0.972644,0.975152,0.970149,0.813725,0.81492,0.804032,0.826108,0.916786,0.919482,...,0.972385,0.970658,0.811498,0.812991,0.803568,0.822752,0.917683,0.919768,0.925595,0.914058
6,0.923077,0.923853,0.920896,0.926829,0.749305,0.746604,0.728852,0.765242,0.859129,0.861503,...,0.922839,0.920756,0.750278,0.750443,0.741169,0.759997,0.863763,0.866321,0.871358,0.861384
7,0.953165,0.953903,0.953241,0.954567,0.784185,0.787781,0.787963,0.787598,0.893857,0.896631,...,0.953122,0.953771,0.787497,0.789211,0.782777,0.795773,0.892336,0.894646,0.89976,0.889607
8,0.968197,0.968903,0.971402,0.966418,0.810856,0.813751,0.810127,0.817408,0.91296,0.915663,...,0.969127,0.970931,0.807528,0.810124,0.805279,0.815089,0.915865,0.918117,0.925211,0.911151


### 모델 이용 데이터셋 분류: kor

In [69]:
# 데이터 로드
kor_ratio5_2gram = np.load('./path/kor_ratio5_2gram_embedding.npy', allow_pickle=True)
kor_ratio5_3gram = np.load('./path/kor_ratio5_3gram_embedding.npy', allow_pickle=True)
kor_ratio5_4gram = np.load('./path/kor_ratio5_4gram_embedding.npy', allow_pickle=True)

kor_ratio10_2gram = np.load('./path/kor_ratio10_2gram_embedding.npy', allow_pickle=True)
kor_ratio10_3gram = np.load('./path/kor_ratio10_3gram_embedding.npy', allow_pickle=True)
kor_ratio10_4gram = np.load('./path/kor_ratio10_4gram_embedding.npy', allow_pickle=True)

In [70]:
# kor_ratio5_2gram
kor_r5_2g = total_model(kor_ratio5_2gram)
score_result = result_append(score_result, kor_r5_2g)

104 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.7992116856016693
 f1-score  :  0.7143799472295513
 recall    :  0.7134387351778656
 precision :  0.7153236459709379
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.6886158126594019
 f1-score  :  0.6317521250342748
 recall    :  0.758893280632411
 precision :  0.5410991075622358
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.7667516809645258
 f1-score  :  0.6333819241982508
 recall    :  0.572463768115942
 precision :  0.7088091353996737
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.7941108277301182
 f1-score  :  0.7111255692908264
 recall    :  0.7148463047743623
 precision :  0.7074433656957929
 accuracy  :  0.8003709714815673
 f1-score  :  0.7179823124795283
 recall    :  0.7282392026578073
 precision :  0.70801033591731

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [71]:
# kor_ratio5_3gram
kor_r5_3g = total_model(kor_ratio5_3gram)
score_result = result_append(score_result, kor_r5_3g)

64 번 인덱스 오류 처리 완료
104 번 인덱스 오류 처리 완료
116 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
134 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.833254828073481
 f1-score  :  0.7717601547388782
 recall    :  0.8033557046979866
 precision :  0.7425558312655087
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7183231276495525
 f1-score  :  0.6625282167042889
 recall    :  0.7879194630872484
 precision :  0.5715676728334956
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.796514366462553
 f1-score  :  0.700831024930748
 recall    :  0.6791946308724832
 precision :  0.7238912732474965
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.826189354686764
 f1-score  :  0.7501692620176031
 recall    :  0.7604667124227865
 precision :  0.7401469605878424
 accuracy  :  0.8257183231276496
 f1-score  :  0.7637292464878672
 recall  

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [72]:
# kor_ratio5_4gram
kor_r5_4g = total_model(kor_ratio5_4gram)
score_result = result_append(score_result, kor_r5_4g)

64 번 인덱스 오류 처리 완료
91 번 인덱스 오류 처리 완료
104 번 인덱스 오류 처리 완료
116 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
131 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
134 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.8396745632926538
 f1-score  :  0.7733423545331529
 recall    :  0.7823408624229979
 precision :  0.7645484949832776
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7284039243838238
 f1-score  :  0.6686131386861314
 recall    :  0.783709787816564
 precision :  0.5829938900203666
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8332136874850443
 f1-score  :  0.7534488857446056
 recall    :  0.728952772073922
 precision :  0.7796486090775988
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.8499641062455133
 f1-score  :  0.7841652323580035
 recall    :  0.8021126760563381
 precision :  0.767003367003367
 accuracy  :  0.8406317300789663
 f1-

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [73]:
# kor_ratio10_2gram
kor_r10_2g = total_model(kor_ratio10_2gram)
score_result = result_append(score_result, kor_r10_2g)

104 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.7978205425457918
 f1-score  :  0.7140983606557377
 recall    :  0.717391304347826
 precision :  0.7108355091383812
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.6862972408996059
 f1-score  :  0.6296194908294553
 recall    :  0.7575757575757576
 precision :  0.5386416861826698
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.7723162531880362
 f1-score  :  0.6444605358435916
 recall    :  0.5862977602108037
 precision :  0.7154340836012861
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.7894736842105263
 f1-score  :  0.7057679844458847
 recall    :  0.7122302158273381
 precision :  0.6994219653179191
 accuracy  :  0.7992116856016693
 f1-score  :  0.7143799472295514
 recall    :  0.7196013289036545
 precision :  0.7092337917485

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [74]:
# kor_ratio10_3gram
kor_r10_3g = total_model(kor_ratio10_3gram)
score_result = result_append(score_result, kor_r10_3g)

64 번 인덱스 오류 처리 완료
104 번 인덱스 오류 처리 완료
116 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
134 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.833254828073481
 f1-score  :  0.7714654615881213
 recall    :  0.802013422818792
 precision :  0.7431592039800995
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7164390014130947
 f1-score  :  0.6598870056497175
 recall    :  0.7838926174496644
 precision :  0.5697560975609756
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8052284503061705
 f1-score  :  0.7129468934397779
 recall    :  0.689261744966443
 precision :  0.7383177570093458
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.8219500706547339
 f1-score  :  0.7437288135593221
 recall    :  0.7529169526424159
 precision :  0.7347622237106497
 accuracy  :  0.82312764955252
 f1-score  :  0.75968
 recall    :  0.77989

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


In [75]:
# kor_ratio10_4gram
kor_r10_4g = total_model(kor_ratio10_4gram)
score_result = result_append(score_result, kor_r10_4g)

64 번 인덱스 오류 처리 완료
91 번 인덱스 오류 처리 완료
104 번 인덱스 오류 처리 완료
116 번 인덱스 오류 처리 완료
130 번 인덱스 오류 처리 완료
131 번 인덱스 오류 처리 완료
132 번 인덱스 오류 처리 완료
134 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.8423067719550131
 f1-score  :  0.7775902801214984
 recall    :  0.7885010266940452
 precision :  0.7669773635153129
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7293610911701364
 f1-score  :  0.670933954029677
 recall    :  0.7891854893908282
 precision :  0.583502024291498
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8262742282842785
 f1-score  :  0.742735648476258
 recall    :  0.7173169062286106
 precision :  0.7700220426157237
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.8427853553481695
 f1-score  :  0.7753846153846154
 recall    :  0.7985915492957747
 precision :  0.7534883720930232
 accuracy  :  0.8403924383823881
 f1-

  score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)


### score 파일 저장

In [76]:
score_result

Unnamed: 0,logistic_Acc,logistic_F1,logistic_Rec,logistic_Pre,naivebayes_Acc,naivebayes_F1,naivebayes_Rec,naivebayes_Pre,randomforest_Acc,logistic_F1.1,...,logistic_5fold_Rec,logistic_5fold_Pre,naivebayes_5Fold_Acc,naivebayes_5Fold_F1,naivebayes_5Fold_Rec,naivebayes_5Fold_Pre,randomforest_5Fold_Acc,randomforest_5Fold_F1,randomforest_5Fold_Rec,randomforest_5Fold_Pre
0,0.80269,0.718678,0.716074,0.721301,0.676096,0.619035,0.747694,0.528153,0.765129,0.624397,...,0.732543,0.714478,0.677116,0.618273,0.743248,0.529307,0.770971,0.637507,0.572475,0.719289
1,0.838436,0.779279,0.812752,0.748455,0.710551,0.65253,0.774497,0.563752,0.796514,0.691429,...,0.781084,0.74603,0.707684,0.648572,0.769117,0.560772,0.796957,0.689493,0.642843,0.74352
2,0.845418,0.782784,0.796715,0.769332,0.720268,0.65968,0.775496,0.573961,0.822446,0.730964,...,0.808992,0.76853,0.731237,0.672366,0.788459,0.58613,0.825244,0.738567,0.705796,0.774678
3,0.923077,0.923923,0.921811,0.926045,0.751158,0.746818,0.72428,0.770803,0.863531,0.865801,...,0.923836,0.923268,0.753383,0.751539,0.736221,0.767551,0.866311,0.868395,0.870649,0.866235
4,0.954813,0.955597,0.956481,0.954713,0.790539,0.792541,0.787037,0.798122,0.898564,0.900988,...,0.953203,0.952394,0.791828,0.792048,0.780076,0.804419,0.894831,0.897044,0.901727,0.892472
5,0.972023,0.972644,0.975152,0.970149,0.813725,0.81492,0.804032,0.826108,0.916786,0.919482,...,0.972385,0.970658,0.811498,0.812991,0.803568,0.822752,0.917683,0.919768,0.925595,0.914058
6,0.923077,0.923853,0.920896,0.926829,0.749305,0.746604,0.728852,0.765242,0.859129,0.861503,...,0.922839,0.920756,0.750278,0.750443,0.741169,0.759997,0.863763,0.866321,0.871358,0.861384
7,0.953165,0.953903,0.953241,0.954567,0.784185,0.787781,0.787963,0.787598,0.893857,0.896631,...,0.953122,0.953771,0.787497,0.789211,0.782777,0.795773,0.892336,0.894646,0.89976,0.889607
8,0.968197,0.968903,0.971402,0.966418,0.810856,0.813751,0.810127,0.817408,0.91296,0.915663,...,0.969127,0.970931,0.807528,0.810124,0.805279,0.815089,0.915865,0.918117,0.925211,0.911151
9,0.799212,0.71438,0.713439,0.715324,0.688616,0.631752,0.758893,0.541099,0.766752,0.633382,...,0.733303,0.71369,0.686622,0.627065,0.748799,0.539429,0.770137,0.643051,0.588716,0.708571


In [77]:
# 나온 결과 저장
score_result.to_csv('result_final.csv')