### 기본 세팅

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression    
from sklearn.naive_bayes import GaussianNB          
from sklearn.ensemble import RandomForestClassifier    

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold

In [2]:
# 임베딩 배열을 데이터프레임으로 변환
def make_df(data):
    
    array = data[0]     # 첫번째 array
    error = []          # 에러 인덱스 리스트
    

    for index in range(len(data)-1):
        plus_array = data[index+1]     # 추가할 array
        try:
            array = np.concatenate((array, plus_array), axis=0) 
        except:
            print(index+1, '번 인덱스 오류 처리 완료')
            error.append(index+1)     # 에러 인덱스 저장
            
        # 201건부터는 모두 보이스피싱 텍스트 임베딩값
        # 보이스피싱 여부를 붙이고자 phishing_check로 인덱스 지정
        if (index+1) == 199:
            phishing_check = array.shape[0] + 1

    
    # 데이터프레임 변환 및 레이블 지정
    df = pd.DataFrame(array)
    df['phishing'] = 0                        # 상담 텍스트 임베딩값
    df.loc[phishing_check:,'phishing'] = 1    # 보이스파싱 텍스트 임베딩값
    return df

### train, validation 분할

In [3]:
def data_split(df):
    feature = df.iloc[:,:-1]      # 피처 칼럼: 예측에 사용되는 데이터
    target  = df['phishing']      # 타겟 칼럼: 예측(분류) 대상 데이터
    x_train, x_valid, y_train, y_valid = train_test_split(feature, target, test_size=0.2,
                                                          shuffle=True,      
                                                          stratify=target,  
                                                          random_state=1234) 
    return x_train, x_valid, y_train, y_valid, feature, target

### 분류 모델 세팅

In [4]:
# 분류 score 계산
def score(pred, y_test):
    print(' accuracy  : ', accuracy_score(y_test, pred))
    print(' f1-score  : ', f1_score(y_test, pred))
    print(' recall    : ', recall_score(y_test, pred))
    print(' precision : ', precision_score(y_test, pred))
    return accuracy_score(y_test, pred), f1_score(y_test, pred), recall_score(y_test, pred), precision_score(y_test, pred)

In [5]:
# score result에 결과 추가
def result_append(score_result, scores):
    score_result = score_result.append(pd.Series(scores, index=score_result.columns), ignore_index=True)
    return score_result

In [6]:
# 최종 결과 리스트
score_result = pd.DataFrame(columns=['logistic_Acc', 'logistic_F1', 'logistic_Rec', 'logistic_Pre', 
                                     'naivebayes_Acc', 'naivebayes_F1', 'naivebayes_Rec', 'naivebayes_Pre',
                                     'randomforest_Acc', 'logistic_F1', 'naivebayes_Rec', 'randomforest_Pre',
                                     'logistic_5fold_Acc', 'logistic_5fold_F1', 'logistic_5fold_Rec', 'logistic_5fold_Pre',
                                     'naivebayes_5Fold_Acc', 'naivebayes_5Fold_F1', 'naivebayes_5Fold_Rec', 'naivebayes_5Fold_Pre',
                                     'randomforest_5Fold_Acc', 'randomforest_5Fold_F1', 'randomforest_5Fold_Rec', 'randomforest_5Fold_Pre'])

### Logistic Regression

In [7]:
def logistic_reg(x_train, y_train, x_valid, y_valid):
    
    # 모델 분류 수행
    reg = LogisticRegression(random_state=0, max_iter=500)
    reg.fit(x_train, y_train)
    pred = reg.predict(x_valid)
    
    
    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### Naive Bayes Classification

In [8]:
def naivebayes_clf(x_train, y_train, x_valid, y_valid):
    
    # 모델 분류 수행
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    pred = gnb.predict(x_valid)
    
    
    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### RandomForest Classification

In [9]:
def randomforest_clf(x_train, y_train, x_valid, y_valid):
    
    # 모델 분류 수행
    clf = RandomForestClassifier(max_depth=16, random_state=0)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_valid)
    
    
    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### K-Fold Cross Validation 

In [10]:
def kfold_clf(model, feature, target):

    # 5개 폴드세트를 분리하는 kFold 객체와
    # 폴드세트별 score 값을 담을 리스트
    kfold = KFold(n_splits=5, random_state=0, shuffle=True)
    cv_accuracy = []
    cv_f1score = []
    cv_precision = []
    cv_recall = []
    
    
    n_iter = 0   
    for train_index, validate_index in kfold.split(feature):
        # 데이터셋 지정
        X_train, X_valid = feature.iloc[train_index,:], feature.iloc[validate_index,:]
        y_train, y_valid = target[train_index], target[validate_index]
        
        
        # 학습 및 예측
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        n_iter += 1
        
        
        # 데이터 size 확인
        train_size = X_train.shape[0]
        test_size = X_valid.shape[0]
        print(f' ========= {n_iter} =========')
        
        
        # iteration마다 score 측정
        accuracy,f1score,precision, recall = score(pred, y_valid)   
        
        
        # iteration 별 score 값 저장
        cv_accuracy.append(accuracy)
        cv_f1score.append(f1score)
        cv_precision.append(precision)
        cv_recall.append(recall)
        
        
    # 개별 iteration별 정확도를 합한 평균 계산
    print(' ======== 최종 ========')
    print(' 평균검증 정확도   : ', np.mean(cv_accuracy))
    print(' 평균검증 f1-score : ', np.mean(cv_f1score))
    print(' 평균검증 정밀도   : ', np.mean(cv_precision))
    print(' 평균검증 재현율   : ', np.mean(cv_recall))
    print(' \n')
    return np.mean(cv_accuracy), np.mean(cv_f1score), np.mean(cv_precision), np.mean(cv_recall)

### 전체 분류 모델 통합

In [11]:
def total_model(data):
    
    # 임베딩 배열을 데이터프레임으로 전환
    df = make_df(data)
    # train / test 데이터 분할
    x_train, x_valid, y_train, y_valid, feature, target = data_split(df)
    # 리턴할 점수 리스트
    score_list = []
    print(' 데이터 가공 완료\n')
    
    
    # Logistic Regression
    print('      ----------    Logistic Regression Result    ----------      ')
    logi1, logi2, logi3, logi4 = logistic_reg(x_train, y_train, x_valid, y_valid)
    score_list += [logi1, logi2, logi3, logi4]
    
    
    # Naive Bayes Classification
    print('      ---------- Naive Bayes Classification Result ----------      ')
    naive1, naive2, naive3, naive4 = naivebayes_clf(x_train, y_train, x_valid, y_valid)
    score_list += [naive1, naive2, naive3, naive4]
    
    
    # RandomForest Classification
    print('      ---------- RandomForest Classifcation Result ----------      ')
    rf1, rf2, rf3, rf4 = randomforest_clf(x_train, y_train, x_valid, y_valid)
    score_list += [rf1, rf2, rf3, rf4]
    
    
    # 5-Fold Logistic Regression
    print(' \n')
    print('   ----------    [5-Fold] Logistic Regression Result    ----------   ')
    lgb = LogisticRegression(random_state=0, max_iter=500)
    foldlogi1, foldlogi2, foldlogi3, foldlogi4 = kfold_clf(lgb, feature, target)
    score_list += [foldlogi1, foldlogi2, foldlogi3, foldlogi4]
    
    
    # 5-Fold NaiveBayes
    print('   ---------- [5-Fold] Naive Bayes Classification Result ----------   ')
    gnb = GaussianNB()
    foldnb1, foldnb2, foldnb3, foldnb4 = kfold_clf(gnb, feature, target)
    score_list += [foldnb1, foldnb2, foldnb3, foldnb4]
    
    
    # 5-Fold RandomForest Classification 
    print('   ---------- [5-Fold] RandomForest Classifcation Result ----------   ')
    clf = RandomForestClassifier(max_depth=16, random_state=0)
    foldrf1, foldrf2, foldrf3, foldrf4 = kfold_clf(clf, feature, target)
    score_list += [foldrf1, foldrf2, foldrf3, foldrf4]
    return score_list

### 모델 이용 데이터셋 분류: original

In [12]:
# 데이터 로드
origin_2gram = np.load('./save_embeddings/original_2gram_embedding.npy', allow_pickle=True)
origin_3gram = np.load('./save_embeddings/original_3gram_embedding.npy', allow_pickle=True)
origin_4gram = np.load('./save_embeddings/original_4gram_embedding.npy', allow_pickle=True)

In [13]:
# original_2gram
ori_2g = total_model(origin_2gram)
score_result = result_append(score_result, ori_2g)

304 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9263206672845227
 f1-score  :  0.9268629254829807
 recall    :  0.9213534522176497
 precision :  0.9324386857936141
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7506950880444856
 f1-score  :  0.74344301382928
 recall    :  0.7128486511202561
 precision :  0.7767812655705032
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.871177015755329
 f1-score  :  0.8735213830755232
 recall    :  0.877914951989026
 precision :  0.8691715708465368
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9244670991658943
 f1-score  :  0.9249884951679705
 recall    :  0.9186471663619744
 precision :  0.9314179796107507
 accuracy  :  0.9263206672845227
 f1-score  :  0.9275956284153004
 recall    :  0.9280182232346241
 precision :  0.9271734182976786

In [14]:
# original_3gram
ori_3g = total_model(origin_3gram)
score_result = result_append(score_result, ori_3g)

264 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9621087314662273
 f1-score  :  0.9627917725907095
 recall    :  0.9643518518518519
 precision :  0.9612367328103368
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7844198634972935
 f1-score  :  0.7802303262955854
 recall    :  0.7527777777777778
 precision :  0.8097609561752988
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.9065662508825606
 f1-score  :  0.9090075636030254
 recall    :  0.9180555555555555
 precision :  0.9001361779391739
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9557542951282655
 f1-score  :  0.9562179785747554
 recall    :  0.9606925596630791
 precision :  0.951784886416319
 accuracy  :  0.9559896446222641
 f1-score  :  0.9573934837092731
 reca

In [15]:
# original_4gram
ori_4g = total_model(origin_4gram)
score_result = result_append(score_result, ori_4g)

264 번 인덱스 오류 처리 완료
291 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
331 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9748923959827833
 f1-score  :  0.9754271003978469
 recall    :  0.9770276605719643
 precision :  0.9738317757009346
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.8142037302725968
 f1-score  :  0.8144256030570813
 recall    :  0.7993436474449133
 precision :  0.8300876338851022
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.9184600669536107
 f1-score  :  0.9212289212289213
 recall    :  0.9348335677449602
 precision :  0.9080145719489982
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9727403156384505
 f1-score  :  0.9730113636363636
 recall    :  0.9748576850094877
 precision :  0.97117202268431
 accuracy  :  0.9803921568627451
 

In [16]:
# 나온 결과 확인
score_result

Unnamed: 0,logistic_Acc,logistic_F1,logistic_Rec,logistic_Pre,naivebayes_Acc,naivebayes_F1,naivebayes_Rec,naivebayes_Pre,randomforest_Acc,logistic_F1.1,...,logistic_5fold_Rec,logistic_5fold_Pre,naivebayes_5Fold_Acc,naivebayes_5Fold_F1,naivebayes_5Fold_Rec,naivebayes_5Fold_Pre,randomforest_5Fold_Acc,randomforest_5Fold_F1,randomforest_5Fold_Rec,randomforest_5Fold_Pre
0,0.926321,0.926863,0.921353,0.932439,0.750695,0.743443,0.712849,0.776781,0.871177,0.873521,...,0.925496,0.927952,0.746664,0.739991,0.711701,0.770751,0.870204,0.872723,0.878206,0.867333
1,0.962109,0.962792,0.964352,0.961237,0.78442,0.78023,0.752778,0.809761,0.906566,0.909008,...,0.960268,0.958191,0.781424,0.77765,0.752052,0.805129,0.901422,0.904092,0.914333,0.894107
2,0.974892,0.975427,0.977028,0.973832,0.814204,0.814426,0.799344,0.830088,0.91846,0.921229,...,0.975136,0.976023,0.812311,0.811061,0.790231,0.833167,0.928684,0.930973,0.943595,0.918702


### 모델 이용 데이터셋 분류: eng

In [17]:
# 데이터 로드
eng_ratio5_2gram = np.load('./save_embeddings/eng_ratio5_2gram_embedding.npy', allow_pickle=True)
eng_ratio5_3gram = np.load('./save_embeddings/eng_ratio5_3gram_embedding.npy', allow_pickle=True)
eng_ratio5_4gram = np.load('./save_embeddings/eng_ratio5_4gram_embedding.npy', allow_pickle=True)

eng_ratio10_2gram = np.load('./save_embeddings/eng_ratio10_2gram_embedding.npy', allow_pickle=True)
eng_ratio10_3gram = np.load('./save_embeddings/eng_ratio10_3gram_embedding.npy', allow_pickle=True)
eng_ratio10_4gram = np.load('./save_embeddings/eng_ratio10_4gram_embedding.npy', allow_pickle=True)

In [18]:
# eng_ratio5_2gram
eng_r5_2g = total_model(eng_ratio5_2gram)
score_result = result_append(score_result, eng_r5_2g)

304 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9230769230769231
 f1-score  :  0.9239230064161321
 recall    :  0.9218106995884774
 precision :  0.9260450160771704
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7511584800741428
 f1-score  :  0.7468175388967467
 recall    :  0.7242798353909465
 precision :  0.7708029197080292
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8635310472659871
 f1-score  :  0.8658008658008658
 recall    :  0.8687700045724737
 precision :  0.8628519527702089
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9219184430027804
 f1-score  :  0.9225821272685505
 recall    :  0.9177330895795247
 precision :  0.9274826789838337
 accuracy  :  0.9230769230769231
 f1-score  :  0.9246139872842869
 recall    :  0.9275626423690205
 precision :  0.921684019918

In [19]:
# eng_ratio5_3gram
eng_r5_3g = total_model(eng_ratio5_3gram)
score_result = result_append(score_result, eng_r5_3g)

264 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9548128971522711
 f1-score  :  0.9555966697502313
 recall    :  0.9564814814814815
 precision :  0.9547134935304991
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7905389503412568
 f1-score  :  0.7925407925407926
 recall    :  0.7870370370370371
 precision :  0.7981220657276995
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8985643680866087
 f1-score  :  0.9009878244888583
 recall    :  0.9078703703703703
 precision :  0.8942088463292294
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9416333254883502
 f1-score  :  0.9422180801491147
 recall    :  0.9461862423958821
 precision :  0.9382830626450116
 accuracy  :  0.9510473052482937
 f1-score  :  0.9523809523809523
 rec

In [20]:
# eng_ratio5_4gram
eng_r5_4g = total_model(eng_ratio5_4gram)
score_result = result_append(score_result, eng_r5_4g)

264 번 인덱스 오류 처리 완료
291 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
331 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9720229555236729
 f1-score  :  0.9726443768996961
 recall    :  0.9751523675574308
 precision :  0.9701492537313433
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.8137254901960784
 f1-score  :  0.814920408648135
 recall    :  0.804031879981247
 precision :  0.8261078998073218
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.9167862266857962
 f1-score  :  0.9194817214252661
 recall    :  0.9315518049695265
 precision :  0.9077204202832344
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.966284074605452
 f1-score  :  0.966579758236549
 recall    :  0.9672675521821632
 precision :  0.9658929417337755
 accuracy  :  0.9751315160210425
 f1

In [21]:
# eng_ratio10_2gram
eng_r10_2g = total_model(eng_ratio10_2gram)
score_result = result_append(score_result, eng_r10_2g)

304 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9230769230769231
 f1-score  :  0.9238532110091744
 recall    :  0.9208962048468221
 precision :  0.926829268292683
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7493049119555144
 f1-score  :  0.7466042154566745
 recall    :  0.7288523090992227
 precision :  0.7652424387902065
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8591288229842446
 f1-score  :  0.8615034168564921
 recall    :  0.8646547782350251
 precision :  0.8583749432591921
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9212233549582948
 f1-score  :  0.9219108865411115
 recall    :  0.9172760511882998
 precision :  0.9265927977839336
 accuracy  :  0.9198331788693235
 f1-score  :  0.9213636363636364
 recall    :  0.9234624145785877
 precision :  0.9192743764172

In [22]:
# eng_ratio10_3gram
eng_r10_3g = total_model(eng_ratio10_3gram)
score_result = result_append(score_result, eng_r10_3g)

264 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.953165450694281
 f1-score  :  0.9539031735001158
 recall    :  0.9532407407407407
 precision :  0.9545665275846082
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7841845140032949
 f1-score  :  0.7877806063411248
 recall    :  0.787962962962963
 precision :  0.7875983341045812
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8938573782066369
 f1-score  :  0.8966307586523035
 recall    :  0.9055555555555556
 precision :  0.887880163413527
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9456342668863262
 f1-score  :  0.9459901800327332
 recall    :  0.9466541881141788
 precision :  0.9453271028037383
 accuracy  :  0.9510473052482937
 f1-score  :  0.9524680073126143
 recall

In [23]:
# eng_ratio10_4gram
eng_r10_4g = total_model(eng_ratio10_4gram)
score_result = result_append(score_result, eng_r10_4g)

264 번 인덱스 오류 처리 완료
291 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
331 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9681970349115255
 f1-score  :  0.9689034369885433
 recall    :  0.9714017815283638
 precision :  0.9664179104477612
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.810856049736968
 f1-score  :  0.8137508829762186
 recall    :  0.810126582278481
 precision :  0.8174077578051088
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.912960306073649
 f1-score  :  0.9156626506024096
 recall    :  0.9263947491795593
 precision :  0.9051763628034815
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9672405547584888
 f1-score  :  0.9675124496087266
 recall    :  0.967741935483871
 precision :  0.9672830725462305
 accuracy  :  0.9734576757532282
 f1

In [24]:
# 나온 결과 확인
score_result

Unnamed: 0,logistic_Acc,logistic_F1,logistic_Rec,logistic_Pre,naivebayes_Acc,naivebayes_F1,naivebayes_Rec,naivebayes_Pre,randomforest_Acc,logistic_F1.1,...,logistic_5fold_Rec,logistic_5fold_Pre,naivebayes_5Fold_Acc,naivebayes_5Fold_F1,naivebayes_5Fold_Rec,naivebayes_5Fold_Pre,randomforest_5Fold_Acc,randomforest_5Fold_F1,randomforest_5Fold_Rec,randomforest_5Fold_Pre
0,0.926321,0.926863,0.921353,0.932439,0.750695,0.743443,0.712849,0.776781,0.871177,0.873521,...,0.925496,0.927952,0.746664,0.739991,0.711701,0.770751,0.870204,0.872723,0.878206,0.867333
1,0.962109,0.962792,0.964352,0.961237,0.78442,0.78023,0.752778,0.809761,0.906566,0.909008,...,0.960268,0.958191,0.781424,0.77765,0.752052,0.805129,0.901422,0.904092,0.914333,0.894107
2,0.974892,0.975427,0.977028,0.973832,0.814204,0.814426,0.799344,0.830088,0.91846,0.921229,...,0.975136,0.976023,0.812311,0.811061,0.790231,0.833167,0.928684,0.930973,0.943595,0.918702
3,0.923077,0.923923,0.921811,0.926045,0.751158,0.746818,0.72428,0.770803,0.863531,0.865801,...,0.923836,0.923186,0.753383,0.751539,0.736221,0.767551,0.866311,0.868395,0.870649,0.866235
4,0.954813,0.955597,0.956481,0.954713,0.790539,0.792541,0.787037,0.798122,0.898564,0.900988,...,0.953203,0.952394,0.791828,0.792048,0.780076,0.804419,0.894831,0.897044,0.901727,0.892472
5,0.972023,0.972644,0.975152,0.970149,0.813725,0.81492,0.804032,0.826108,0.916786,0.919482,...,0.972385,0.970658,0.811498,0.812991,0.803568,0.822752,0.917683,0.919768,0.925595,0.914058
6,0.923077,0.923853,0.920896,0.926829,0.749305,0.746604,0.728852,0.765242,0.859129,0.861503,...,0.922839,0.920756,0.750278,0.750443,0.741169,0.759997,0.863763,0.866321,0.871358,0.861384
7,0.953165,0.953903,0.953241,0.954567,0.784185,0.787781,0.787963,0.787598,0.893857,0.896631,...,0.953122,0.953771,0.787497,0.789211,0.782777,0.795773,0.892336,0.894646,0.89976,0.889607
8,0.968197,0.968903,0.971402,0.966418,0.810856,0.813751,0.810127,0.817408,0.91296,0.915663,...,0.969127,0.970931,0.807528,0.810124,0.805279,0.815089,0.915865,0.918117,0.925211,0.911151


### 모델 이용 데이터셋 분류: kor

In [25]:
# 데이터 로드
kor_ratio5_2gram = np.load('./save_embeddings/kor_ratio5_2gram_embedding.npy', allow_pickle=True)
kor_ratio5_3gram = np.load('./save_embeddings/kor_ratio5_3gram_embedding.npy', allow_pickle=True)
kor_ratio5_4gram = np.load('./save_embeddings/kor_ratio5_4gram_embedding.npy', allow_pickle=True)

kor_ratio10_2gram = np.load('./save_embeddings/kor_ratio10_2gram_embedding.npy', allow_pickle=True)
kor_ratio10_3gram = np.load('./save_embeddings/kor_ratio10_3gram_embedding.npy', allow_pickle=True)
kor_ratio10_4gram = np.load('./save_embeddings/kor_ratio10_4gram_embedding.npy', allow_pickle=True)

In [26]:
# kor_ratio5_2gram
kor_r5_2g = total_model(kor_ratio5_2gram)
score_result = result_append(score_result, kor_r5_2g)

304 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9246987951807228
 f1-score  :  0.9256122682536049
 recall    :  0.9245541838134431
 precision :  0.9266727772685609
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7613531047265987
 f1-score  :  0.7546450690805145
 recall    :  0.7242798353909465
 precision :  0.7876678269517653
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8679332715477294
 f1-score  :  0.8702185792349727
 recall    :  0.8737997256515775
 precision :  0.8666666666666667
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9223818350324374
 f1-score  :  0.9230769230769231
 recall    :  0.9186471663619744
 precision :  0.9275496077526535
 accuracy  :  0.9253938832252085
 f1-score  :  0.9264840182648403
 recall    :  0.924373576309795
 precision :  0.9286041189931

In [27]:
# kor_ratio5_3gram
kor_r5_3g = total_model(kor_ratio5_3gram)
score_result = result_append(score_result, kor_r5_3g)

264 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9564603436102612
 f1-score  :  0.9572056442285449
 recall    :  0.9578703703703704
 precision :  0.9565418400369856
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7971287361732172
 f1-score  :  0.7971764705882353
 recall    :  0.7842592592592592
 precision :  0.8105263157894737
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.9006825135325959
 f1-score  :  0.9030330882352942
 recall    :  0.9097222222222222
 precision :  0.896441605839416
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9489291598023064
 f1-score  :  0.9493582263710618
 recall    :  0.9518015910154423
 precision :  0.946927374301676
 accuracy  :  0.9536361496822782
 f1-score  :  0.9548475819390327
 recal

In [28]:
# kor_ratio5_4gram
kor_r5_4g = total_model(kor_ratio5_4gram)
score_result = result_append(score_result, kor_r5_4g)

264 번 인덱스 오류 처리 완료
291 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
331 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9717838354854137
 f1-score  :  0.9723134678554668
 recall    :  0.9714017815283638
 precision :  0.9732268670737435
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.8311812529890005
 f1-score  :  0.8319047619047619
 recall    :  0.8190342240975153
 precision :  0.8451862602806
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.9230033476805356
 f1-score  :  0.9251858736059481
 recall    :  0.93342709798406
 precision :  0.9170888991248273
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9708273553323769
 f1-score  :  0.9710351377018044
 recall    :  0.9701138519924098
 precision :  0.971958174904943
 accuracy  :  0.9780009564801531
 f1-s

In [29]:
# kor_ratio10_2gram
kor_r10_2g = total_model(kor_ratio10_2gram)
score_result = result_append(score_result, kor_r10_2g)

304 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9230769230769231
 f1-score  :  0.9237482774460266
 recall    :  0.9195244627343393
 precision :  0.928011075219197
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7634383688600556
 f1-score  :  0.7569626279457272
 recall    :  0.7270233196159122
 precision :  0.7894736842105263
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8642261353104727
 f1-score  :  0.8662711090826107
 recall    :  0.8678555098308185
 precision :  0.8646924829157175
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9214550509731233
 f1-score  :  0.9223367697594502
 recall    :  0.920018281535649
 precision :  0.9246669728984842
 accuracy  :  0.9240037071362373
 f1-score  :  0.9252165982672138
 recall    :  0.924373576309795
 precision :  0.926061159287996

In [30]:
# kor_ratio10_3gram
kor_r10_3g = total_model(kor_ratio10_3gram)
score_result = result_append(score_result, kor_r10_3g)

264 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9552835961402683
 f1-score  :  0.9561403508771931
 recall    :  0.9587962962962963
 precision :  0.9534990791896869
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.7940691927512356
 f1-score  :  0.7938751472320378
 recall    :  0.7800925925925926
 precision :  0.8081534772182254
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.8990350670746058
 f1-score  :  0.9013566337088986
 recall    :  0.9074074074074074
 precision :  0.8953860210141618
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9489291598023064
 f1-score  :  0.9493582263710618
 recall    :  0.9518015910154423
 precision :  0.946927374301676
 accuracy  :  0.9559896446222641
 f1-score  :  0.9572180279112332
 reca

In [31]:
# kor_ratio10_4gram
kor_r10_4g = total_model(kor_ratio10_4gram)
score_result = result_append(score_result, kor_r10_4g)

264 번 인덱스 오류 처리 완료
291 번 인덱스 오류 처리 완료
304 번 인덱스 오류 처리 완료
316 번 인덱스 오류 처리 완료
330 번 인덱스 오류 처리 완료
331 번 인덱스 오류 처리 완료
332 번 인덱스 오류 처리 완료
334 번 인덱스 오류 처리 완료
 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.9717838354854137
 f1-score  :  0.9723134678554668
 recall    :  0.9714017815283638
 precision :  0.9732268670737435
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.8280726924916308
 f1-score  :  0.8287687544653488
 recall    :  0.8157524613220816
 precision :  0.8422071636011617
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.9239598278335724
 f1-score  :  0.926046511627907
 recall    :  0.93342709798406
 precision :  0.9187817258883249
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  0.9681970349115255
 f1-score  :  0.9684310467600284
 recall    :  0.967741935483871
 precision :  0.9691211401425178
 accuracy  :  0.9770444763271162
 f1

### score 파일 저장

In [32]:
# 나온 결과 저장
score_result.to_csv('result_final.csv')