In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold

In [55]:
# 임베딩 배열을 데이터프레임으로 변환
def make_df(data):

    array = data[0]     # 첫번째 array
    error = []          # 에러 인덱스 리스트


    for index in range(len(data)-1):
        plus_array = data[index+1]     # 추가할 array
        try:
            array = np.concatenate((array, plus_array), axis=0)
        except:
            print(index+1, '번 인덱스 오류 처리 완료')
            error.append(index+1)     # 에러 인덱스 저장

        # 201건부터는 모두 보이스피싱 텍스트 임베딩값
        # 보이스피싱 여부를 붙이고자 phishing_check로 인덱스 지정
        
    phishing_check = array.shape[0]/2 + 1
    #print(phishing_check)

    # 데이터프레임 변환 및 레이블 지정
    df = pd.DataFrame(array)
    df.shape
    y = pd.DataFrame(index=range(df.shape[0]),columns=['phishing'])
    df=pd.concat([df,y],axis=1)
    df['phishing'] = 0                        # 상담 텍스트 임베딩값
    df.loc[phishing_check:,'phishing'] = 1    # 보이스파싱 텍스트 임베딩값
    return df

In [29]:
def data_split(df):
    feature = df.iloc[:,:-1]      # 피처 칼럼: 예측에 사용되는 데이터
    target  = df['']      # 타겟 칼럼: 예측(분류) 대상 데이터
    x_train, x_valid, y_train, y_valid = train_test_split(feature, target, test_size=0.2,
                                                          shuffle=True,
                                                          stratify=target,
                                                          random_state=1234)
    return x_train, x_valid, y_train, y_valid, feature, target

In [4]:
# 분류 score 계산
def score(pred, y_test):
    print(' accuracy  : ', accuracy_score(y_test, pred))
    print(' f1-score  : ', f1_score(y_test, pred))
    print(' recall    : ', recall_score(y_test, pred))
    print(' precision : ', precision_score(y_test, pred))
    return accuracy_score(y_test, pred), f1_score(y_test, pred), recall_score(y_test, pred), precision_score(y_test, pred)

In [74]:
# score result에 결과 추가
def result_append(score_result, scores):
    score_result = score_result(pd.Series(scores, index=score_result.columns), ignore_index=True)
    return score_result

In [75]:
# 최종 결과 리스트
score_result = pd.DataFrame(columns=['logistic_Acc', 'logistic_F1', 'logistic_Rec', 'logistic_Pre',
                                     'naivebayes_Acc', 'naivebayes_F1', 'naivebayes_Rec', 'naivebayes_Pre',
                                     'randomforest_Acc', 'logistic_F1', 'naivebayes_Rec', 'randomforest_Pre',
                                     'logistic_5fold_Acc', 'logistic_5fold_F1', 'logistic_5fold_Rec', 'logistic_5fold_Pre',
                                     'naivebayes_5Fold_Acc', 'naivebayes_5Fold_F1', 'naivebayes_5Fold_Rec', 'naivebayes_5Fold_Pre',
                                     'randomforest_5Fold_Acc', 'randomforest_5Fold_F1', 'randomforest_5Fold_Rec', 'randomforest_5Fold_Pre'])

### Logestic Regression

In [7]:
def logistic_reg(x_train, y_train, x_valid, y_valid):

    # 모델 분류 수행
    reg = LogisticRegression(random_state=0, max_iter=500)
    reg.fit(x_train, y_train)
    pred = reg.predict(x_valid)


    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### Naive Bayes Classification


In [57]:


def naivebayes_clf(x_train, y_train, x_valid, y_valid):

    # 모델 분류 수행
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    pred = gnb.predict(x_valid)


    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### RandomForest Classification

In [58]:
def randomforest_clf(x_train, y_train, x_valid, y_valid):

    # 모델 분류 수행
    clf = RandomForestClassifier(max_depth=16, random_state=0)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_valid)


    # 분류 score 계산
    accuracy, f1_score, recall, precision = score(pred, y_valid)
    return accuracy, f1_score, recall, precision

### K-Fold Cross Validation

In [59]:
def kfold_clf(model, feature, target):

    # 5개 폴드세트를 분리하는 kFold 객체와
    # 폴드세트별 score 값을 담을 리스트
    kfold = KFold(n_splits=5, random_state=0, shuffle=True)
    cv_accuracy = []
    cv_f1score = []
    cv_precision = []
    cv_recall = []


    n_iter = 0
    for train_index, validate_index in kfold.split(feature):
        # 데이터셋 지정
        X_train, X_valid = feature.iloc[train_index,:], feature.iloc[validate_index,:]
        y_train, y_valid = target[train_index], target[validate_index]


        # 학습 및 예측
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        n_iter += 1


        # 데이터 size 확인
        train_size = X_train.shape[0]
        test_size = X_valid.shape[0]
        print(f' ========= {n_iter} =========')


        # iteration마다 score 측정
        accuracy,f1score,precision, recall = score(pred, y_valid)


        # iteration 별 score 값 저장
        cv_accuracy.append(accuracy)
        cv_f1score.append(f1score)
        cv_precision.append(precision)
        cv_recall.append(recall)


    # 개별 iteration별 정확도를 합한 평균 계산
    print(' ======== 최종 ========')
    print(' 평균검증 정확도   : ', np.mean(cv_accuracy))
    print(' 평균검증 f1-score : ', np.mean(cv_f1score))
    print(' 평균검증 정밀도   : ', np.mean(cv_precision))
    print(' 평균검증 재현율   : ', np.mean(cv_recall))
    print(' \n')
    return np.mean(cv_accuracy), np.mean(cv_f1score), np.mean(cv_precision), np.mean(cv_recall)

### 전체 분류 모델 통합

In [60]:
def total_model(data):

    # 임베딩 배열을 데이터프레임으로 전환
    df = make_df(data)
    # train / test 데이터 분할
    x_train, x_valid, y_train, y_valid, feature, target = data_split(df)
    # 리턴할 점수 리스트
    score_list = []
    print(' 데이터 가공 완료\n')


    # Logistic Regression
    print('      ----------    Logistic Regression Result    ----------      ')
    logi1, logi2, logi3, logi4 = logistic_reg(x_train, y_train, x_valid, y_valid)
    score_list += [logi1, logi2, logi3, logi4]


    # Naive Bayes Classification
    print('      ---------- Naive Bayes Classification Result ----------      ')
    naive1, naive2, naive3, naive4 = naivebayes_clf(x_train, y_train, x_valid, y_valid)
    score_list += [naive1, naive2, naive3, naive4]


    # RandomForest Classification
    print('      ---------- RandomForest Classifcation Result ----------      ')
    rf1, rf2, rf3, rf4 = randomforest_clf(x_train, y_train, x_valid, y_valid)
    score_list += [rf1, rf2, rf3, rf4]


    # 5-Fold Logistic Regression
    print(' \n')
    print('   ----------    [5-Fold] Logistic Regression Result    ----------   ')
    lgb = LogisticRegression(random_state=0, max_iter=500)
    foldlogi1, foldlogi2, foldlogi3, foldlogi4 = kfold_clf(lgb, feature, target)
    score_list += [foldlogi1, foldlogi2, foldlogi3, foldlogi4]


    # 5-Fold NaiveBayes
    print('   ---------- [5-Fold] Naive Bayes Classification Result ----------   ')
    gnb = GaussianNB()
    foldnb1, foldnb2, foldnb3, foldnb4 = kfold_clf(gnb, feature, target)
    score_list += [foldnb1, foldnb2, foldnb3, foldnb4]


    # 5-Fold RandomForest Classification
    print('   ---------- [5-Fold] RandomForest Classifcation Result ----------   ')
    clf = RandomForestClassifier(max_depth=16, random_state=0)
    foldrf1, foldrf2, foldrf3, foldrf4 = kfold_clf(clf, feature, target)
    score_list += [foldrf1, foldrf2, foldrf3, foldrf4]
    return score_list

### 모델 이용 데이터셋 분류: original

In [61]:
# 데이터 로드
origin_2gram = np.load('/Users/withmocha/Desktop/DATA/Capston Design(2024)/data(sentence)/n-gram/original/original_2gram_embedding.npy', allow_pickle=True)
origin_3gram = np.load('/Users/withmocha/Desktop/DATA/Capston Design(2024)/data(sentence)/n-gram/original/original_3gram_embedding.npy', allow_pickle=True)
origin_4gram = np.load('/Users/withmocha/Desktop/DATA/Capston Design(2024)/data(sentence)/n-gram/original/original_4gram_embedding.npy', allow_pickle=True)

In [54]:
data=origin_2gram


array = data[0]     # 첫번째 array
error = []          # 에러 인덱스 리스트


for index in range(len(data)-1):
    plus_array = data[index+1]     # 추가할 array
    try:
        array = np.concatenate((array, plus_array), axis=0)
    except:
        print(index+1, '번 인덱스 오류 처리 완료')
        error.append(index+1)     # 에러 인덱스 저장

        # 201건부터는 모두 보이스피싱 텍스트 임베딩값
        # 보이스피싱 여부를 붙이고자 phishing_check로 인덱스 지정
        
    phishing_check = array.shape[0] + 1
    

    # 데이터프레임 변환 및 레이블 지정
    df = pd.DataFrame(array)
    df.shape
    y = pd.DataFrame(index=range(df.shape[0]),columns=['phishing'])
    df=pd.concat([df,y],axis=1)
    df['phishing'] = 0                        # 상담 텍스트 임베딩값
    df.loc[phishing_check:,'phishing'] = 1    # 보이스파싱 텍스트 임베딩값
    

15
20


In [76]:
# original_2gram
ori_2g = total_model(origin_2gram)
score_result = result_append(score_result, ori_2g)

 데이터 가공 완료

      ----------    Logistic Regression Result    ----------      
 accuracy  :  0.5
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
      ---------- Naive Bayes Classification Result ----------      
 accuracy  :  0.5
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
      ---------- RandomForest Classifcation Result ----------      
 accuracy  :  0.5
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 

   ----------    [5-Fold] Logistic Regression Result    ----------   
 accuracy  :  1.0
 f1-score  :  1.0
 recall    :  1.0
 precision :  1.0
 accuracy  :  0.5
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 accuracy  :  0.25
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 accuracy  :  0.5
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 accuracy  :  0.3333333333333333
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 평균검증 정확도   :  0.5166666666666667
 평균검증 f1-score :  0.2
 평균검증 정밀도   :  0.2
 평균검증 재현율   :  0.2
 

   ---------- [5-Fold] 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


 accuracy  :  0.5
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 accuracy  :  0.25
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 accuracy  :  0.5
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 accuracy  :  0.3333333333333333
 f1-score  :  0.0
 recall    :  0.0
 precision :  0.0
 평균검증 정확도   :  0.36666666666666664
 평균검증 f1-score :  0.08
 평균검증 정밀도   :  0.2
 평균검증 재현율   :  0.05
 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TypeError: 'DataFrame' object is not callable

In [70]:
score_result

Unnamed: 0,0
logistic_Acc,0.5
logistic_F1,0.0
logistic_Rec,0.0
logistic_Pre,0.0
naivebayes_Acc,0.5
naivebayes_F1,0.0
naivebayes_Rec,0.0
naivebayes_Pre,0.0
randomforest_Acc,0.5
logistic_F1,0.0
