#### 1. WE 준비

In [1]:
import nltk
import time
import string
import numpy as np
import pandas as pd
from typing import Dict

from SMTPSender import SMTPSender
SMTPSender.load_auth()

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
import xgboost as xgb
from sklearn.svm import SVC
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score

# 특정 단어에 대한 임베딩 얻기
def get_embbeding_of_a_word(target_word: str, tokenizer, outputs, tokens):
    try:
        word_index = tokens['input_ids'][0].tolist().index(tokenizer.convert_tokens_to_ids(target_word))
    
        return outputs.last_hidden_state[:, word_index, :]
    except Exception as exp:
        return None
    
#################################3
from gensim.models import KeyedVectors
import gensim.downloader as api

# 구글의 Word2Vec 모델 다운로드
# w2v_model = api.load("word2vec-google-news-300")
# print(f"{'Downloaded':=^30}")

# # 모델을 로컬에 저장
# w2v_model.save('/home/hooni/gensim-data/word2vec-google-news-300/word2vec-google-news-300')
# print(f"{'Saved':=^30}")
#####################################


# Word2Vec 모델 생성
w2v_vector_size = 300

# Pre-trained 모델
# model = w2v_model
w2v_model_path = '/home/hooni/gensim-data/word2vec-google-news-300/word2vec-google-news-300'
w2v_model = KeyedVectors.load(w2v_model_path)
print(f"{'W2V Loaded':=^30}")

bert_vector_size = 768

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
print(f"{'BERT Loaded':=^30}")



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




##### 1-1. 데이터셋 로드

In [2]:
# 데이터셋 로드
total_file_path = "data/dataset.txt"
total_documents = []
total_document = []

with open(total_file_path, 'r', encoding='utf-8') as file:
    # Postmortum 읽기
    for line in file:
        # Document separation
        if line == '\n':
            total_documents.append('\n'.join(total_document))
            total_document = []

        # Add to document
        total_document.append(line)

    # For last game document
    total_documents.append('\n'.join(total_document))

total_documents = np.array(total_documents).reshape(-1, 1)

# 목표값 로드
file_path = 'data/classification.txt'
class_list = []

class_size = 28   ### 29 -> 28
with open(file_path, 'r') as file:
    for line in file:
        class_type = str(line.strip())
        class_list.append(class_type)
        
class_list = np.array(class_list)
ohe = OneHotEncoder(sparse=False)
le = LabelEncoder()
le_label = le.fit_transform(class_list).reshape(-1, 1)
ohe_label = ohe.fit_transform(class_list)

# 데이터 병합
label_size = le_label.shape[1] + ohe_label.shape[1]
data_index = np.arange(1, total_documents.shape[0] + 1).reshape(-1, 1)
data = np.hstack((data_index, total_documents, le_label, ohe_label))



##### 1-2. 데이터 분할

In [3]:
# 데이터셋 분할
test_ratio = 0.2
index_size = 1
X_train, X_test, y_train, y_test = train_test_split(data[:, :-label_size],
                                                    data[:, -label_size:],
                                                    test_size=test_ratio,
                                                    stratify=data[:, -label_size:-label_size + le_label.shape[1]],
                                                    random_state=0)

# 자료형 변환
X_train_index, X_test_index = X_train[:, :index_size].astype(int).flatten(), X_test[:, :index_size].astype(int).flatten()
X_train, X_test = X_train[:, index_size:], X_test[:, index_size:]
X_train = X_train.flatten()
X_test = X_test.flatten()
y_train_ohe = y_train[:, le_label.shape[1]:].astype(np.float64)
y_test_ohe = y_test[:, le_label.shape[1]:].astype(np.float64)
y_train_le = y_train[:, :le_label.shape[1]].astype(np.float64).flatten()
y_test_le = y_test[:, :le_label.shape[1]].astype(np.float64).flatten()

##### 1-3. 워드 임베딩

###### a. Tf-Idf

In [4]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_X_train = tfidf_vectorizer.fit_transform(X_train).toarray()
print(f'{tfidf_X_train.shape=}')

tfidf_X_test = tfidf_vectorizer.transform(X_test).toarray()
print(f'{tfidf_X_test.shape=}')

tfidf_X_train.shape=(534, 4513)
tfidf_X_test.shape=(134, 4513)


###### b. Word2Vec

In [5]:
stop_words = set(stopwords.words('english'))

# 워드 임베딩
w2v_weird_document_index = []
total_document_vectors = np.array([]).reshape(0, w2v_vector_size)
for document_index, document in enumerate(np.concatenate((X_train, X_test))):
    # 한 게임에 대한 임베딩
    document_vectors = np.array([]).reshape(0, w2v_vector_size)
    
    # Tokenize
    tokens = str(document).lower().strip().split()  # 문장을 공백으로 나누어 단어 리스트로 변환

    # Preprocess
    tokens = [word for word in tokens if word.isalpha()]  # 문자열만 포함
    tokens = [word for word in tokens if word not in stop_words]  # 불용어 제거
    tokens = [word for word in tokens if word not in string.punctuation]  # 구두점 제거
    
    # 게임 내 한 단어에 대한 임베딩
    count = 0
    for word in tokens:
        if word in w2v_model:
            document_vectors = np.vstack((document_vectors, w2v_model[word]))
            count += 1

    if count > 0:
        # Feature vector of a document (game)
        document_vectors = document_vectors.mean(axis=0)

        # Append to total document vector
        total_document_vectors = np.vstack((total_document_vectors, document_vectors))
    else:
        print(f'[#{document_index}] Weird paragraph: {document}')
        w2v_weird_document_index.append(document_index)

w2v_weird_document_index = np.array(w2v_weird_document_index)
w2v_X_train = total_document_vectors[:X_train.shape[0] - w2v_weird_document_index[w2v_weird_document_index < X_train.shape[0]].shape[0]]
w2v_X_test = total_document_vectors[X_train.shape[0] - w2v_weird_document_index[w2v_weird_document_index < X_train.shape[0]].shape[0]:]

# Debug
print('Job done')

[#71] Weird paragraph: 

What did we do?

[#74] Weird paragraph: 

The Chindi

[#447] Weird paragraph: 

Some examples:

Job done


###### 3. BERT

In [6]:
bert_vector_size = 768

# 워드 임베딩
total_document_vectors = np.array([]).reshape(0, bert_vector_size)
for total_document in np.concatenate((X_train, X_test)):
    document_vectors = np.array([]).reshape(0, bert_vector_size)
    
    # tokenizing
    tokens = tokenizer(str(total_document),
                       return_tensors='pt',
                       max_length=512)
    
    # Word embedding
    outputs = bert_model(**tokens)
        
    # Feature vector of a document (game)
    document_vectors = outputs.last_hidden_state.mean(axis=1)
    
    # Append to total document vector
    total_document_vectors = np.vstack((total_document_vectors, document_vectors.detach().numpy()))
    
bert_X_train = total_document_vectors[:X_train.shape[0]]
bert_X_test = total_document_vectors[X_train.shape[0]:]

# Debug
print('Job done')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Job done


###### 1-4. 차원 축소

###### a. PCA

In [7]:
pca_var_ratio = 0.95
pca = PCA(n_components=pca_var_ratio)

# 1. TF-IDF
original_dimension = tfidf_X_train.shape[1]
tfidf_pca_X_train = pca.fit_transform(tfidf_X_train)
tfidf_pca_X_test = pca.transform(tfidf_X_test)

print(f'Tf-Idf Reduced Dimension: {original_dimension} -> {tfidf_pca_X_train.shape[1]}')

# 2. W2V
original_dimension = w2v_X_train.shape[1]
w2v_pca_X_train = pca.fit_transform(w2v_X_train)
w2v_pca_X_test = pca.transform(w2v_X_test)

print(f'W2V Reduced Dimension: {original_dimension} -> {w2v_pca_X_train.shape[1]}')

# 3. BERT
original_dimension = bert_X_train.shape[1]
bert_pca_X_train = pca.fit_transform(bert_X_train)
bert_pca_X_test = pca.transform(bert_X_test)

print(f'BERT Reduced Dimension: {original_dimension} -> {bert_pca_X_train.shape[1]}')

Tf-Idf Reduced Dimension: 4513 -> 440
W2V Reduced Dimension: 300 -> 118
BERT Reduced Dimension: 768 -> 162


###### b. LDA

In [8]:
# 1. TF-IDF
lda = LinearDiscriminantAnalysis()
tfidf_lda_X_train = lda.fit_transform(tfidf_X_train, y_train_le)
tfidf_lda_X_test = lda.transform(tfidf_X_test)

print(f'Tf-Idf Reduced Dimension: {original_dimension} -> {tfidf_lda_X_train.shape[1]}')

# 2. W2V
lda = LinearDiscriminantAnalysis()
pca_y_train_le = np.delete(y_train_le, w2v_weird_document_index[w2v_weird_document_index < X_train.shape[0]], axis=0)
w2v_lda_X_train = lda.fit_transform(w2v_X_train, pca_y_train_le)
w2v_lda_X_test = lda.transform(w2v_X_test)

print(f'W2V Reduced Dimension: {original_dimension} -> {w2v_lda_X_train.shape[1]}')

# 3. BERT
lda = LinearDiscriminantAnalysis()
bert_lda_X_train = lda.fit_transform(bert_X_train, y_train_le)
bert_lda_X_test = lda.transform(bert_X_test)

print(f'BERT Reduced Dimension: {original_dimension} -> {bert_lda_X_train.shape[1]}')

Tf-Idf Reduced Dimension: 768 -> 13
W2V Reduced Dimension: 768 -> 13
BERT Reduced Dimension: 768 -> 13


##### 1-5. 데이터 정리

In [9]:
tfidf_X_data = {'train_raw': tfidf_X_train,
                'train_pca': tfidf_pca_X_train,
                'train_lda': tfidf_lda_X_train,
                'test_raw': tfidf_X_test,
                'test_pca': tfidf_pca_X_test,
                'test_lda': tfidf_lda_X_test}

w2v_X_data = {'train_raw': w2v_X_train,
              'train_pca': w2v_pca_X_train,
              'train_lda': w2v_lda_X_train,
              'test_raw': w2v_X_test,
              'test_pca': w2v_pca_X_test,
              'test_lda': w2v_lda_X_test}

bert_X_data = {'train_raw': bert_X_train,
               'train_pca': bert_pca_X_train,
               'train_lda': bert_lda_X_train,
               'test_raw': bert_X_test,
               'test_pca': bert_pca_X_test,
               'test_lda': bert_lda_X_test}

X_data = {'tfidf': tfidf_X_data,
          'w2v': w2v_X_data,
          'bert': bert_X_data}

default_Y_train = {'ohe': y_train_ohe,
                   'le': y_train_le}

default_Y_test = {'ohe': y_test_ohe,
                  'le': y_test_le}

w2v_Y_train = {'ohe': np.delete(y_train_ohe, w2v_weird_document_index[w2v_weird_document_index < X_train.shape[0]], axis=0),
               'le': np.delete(y_train_le, w2v_weird_document_index[w2v_weird_document_index < X_train.shape[0]], axis=0)}

w2v_Y_test = {'ohe': np.delete(y_test_ohe, w2v_weird_document_index[w2v_weird_document_index >= X_train.shape[0]] - X_train.shape[0], axis=0),
              'le': np.delete(y_test_le, w2v_weird_document_index[w2v_weird_document_index >= X_train.shape[0]] - X_train.shape[0], axis=0)}

tfidf_Y_data = {'train_raw': default_Y_train,
                'train_pca': default_Y_train,
                'train_lda': default_Y_train,
                'test_raw': default_Y_test,
                'test_pca': default_Y_test,
                'test_lda': default_Y_test}

w2v_Y_data = {'train_raw': w2v_Y_train,
              'train_pca': w2v_Y_train,
              'train_lda': w2v_Y_train,
              'test_raw': w2v_Y_test,
              'test_pca': w2v_Y_test,
              'test_lda': w2v_Y_test}

bert_Y_data = {'train_raw': default_Y_train,
               'train_pca': default_Y_train,
               'train_lda': default_Y_train,
               'test_raw': default_Y_test,
               'test_pca': default_Y_test,
               'test_lda': default_Y_test}

Y_data = {'tfidf': tfidf_Y_data,
          'w2v': w2v_Y_data,
          'bert': bert_Y_data}

#### 2. 모델 정의

##### 2-0. 결과 저장용 테이블 준비

In [10]:
time_table = pd.DataFrame(columns=['we type', 'dim reduction', 'classifier', 'time'])
result_table = pd.DataFrame(columns=['index', 'data type', 'we type', 'dim reduction', 'classifier', 'truth', 'prediction'])
summary_table = pd.DataFrame(columns=['data type', 'we type', 'dim reduction', 'classifier', 'f1', 'auc'])

##### 2-1. LR

In [11]:
def LR_process(X_data: Dict,
               Y_data: Dict,
               we_type: str,
               dim_reduction_type: str,
               time_table: pd.DataFrame,
               result_table: pd.DataFrame,
               summary_table: pd.DataFrame) -> None:
    # 변수 정의
    classifier_model = 'LR'
    
    # 시간 측정
    timer = time.time()

    # LR 학습
    lr_model = LogisticRegression(multi_class='multinomial')
    lr_model.fit(X_data[we_type][f'train_{dim_reduction_type}'],
                 Y_data[we_type][f'train_{dim_reduction_type}']['le']) #label-encoding 형태로 받음

    # 학습데이터 추론
    train_prediction = lr_model.predict(X_data[we_type][f'train_{dim_reduction_type}'])
    train_prediction_ohe = ohe.transform(le.inverse_transform(train_prediction.astype(int)).reshape(-1, 1))

    f1 = f1_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                  train_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                        train_prediction_ohe)
    
    for index in range(train_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_train_index[index],
                                                   'train',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'train_{dim_reduction_type}']['le'][index],
                                                   train_prediction[index]]

    summary_table.loc[result_table.shape[0]] = ['train',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]

    # 검증데이터 추론
    test_prediction = lr_model.predict(X_data[we_type][f'test_{dim_reduction_type}'])
    test_prediction_ohe = ohe.transform(le.inverse_transform(test_prediction.astype(int)).reshape(-1, 1))
    
    f1 = f1_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                  test_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                        test_prediction_ohe)
    for index in range(test_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_test_index[index],
                                                   'test',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'test_{dim_reduction_type}']['le'][index],
                                                   test_prediction[index]]

    summary_table.loc[summary_table.shape[0]] = ['test',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]
    
    time_table.loc[time_table.shape[0]] = [we_type,
                                             dim_reduction_type,
                                             classifier_model,
                                             time.time() - timer]
    
    print(f'Elapsed time for {we_type}-{dim_reduction_type} {classifier_model} training: {(time.time() - timer)/60:0.2f} min')

##### 2-2. RF

In [12]:
def RF_process(X_data: Dict,
               Y_data: Dict,
               we_type: str,
               dim_reduction_type: str,
               time_table: pd.DataFrame,
               result_table: pd.DataFrame,
               summary_table: pd.DataFrame) -> None:
    # 변수 정의
    classifier_model = 'RF'
    
    # 시간 측정
    timer = time.time()

    # 하이퍼파라미터 탐색공간 설정
    param_space = {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 32),
        'min_samples_split': Real(0.1, 1.0),
        'min_samples_leaf': Real(0.1, 0.5),
        'max_features': Categorical(['sqrt', 'log2'])
    }

    # 하이퍼파라미터 튜닝
    rf_model = RandomForestClassifier()
    opt = BayesSearchCV(rf_model, param_space, n_iter=50, cv=10, n_jobs=-1, n_points=8, verbose=0)

    # timer = time.time()
    # opt.fit(X_train, y_train_le.ravel())
    opt.fit(X_data[we_type][f'train_{dim_reduction_type}'],
            Y_data[we_type][f'train_{dim_reduction_type}']['le'])

    # 모델 학습
    rf_model = RandomForestClassifier(**opt.best_params_)
    rf_model.fit(X_data[we_type][f'train_{dim_reduction_type}'],
                 Y_data[we_type][f'train_{dim_reduction_type}']['le'])
    
    # 학습데이터 추론
    train_prediction = rf_model.predict(X_data[we_type][f'train_{dim_reduction_type}'])
    train_prediction_ohe = ohe.transform(le.inverse_transform(train_prediction.astype(int)).reshape(-1, 1))

    f1 = f1_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                  train_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                        train_prediction_ohe)
    
    for index in range(train_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_train_index[index],
                                                   'train',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'train_{dim_reduction_type}']['le'][index],
                                                   train_prediction[index]]

    summary_table.loc[result_table.shape[0]] = ['train',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]

    # 검증데이터 추론
    test_prediction = rf_model.predict(X_data[we_type][f'test_{dim_reduction_type}'])
    test_prediction_ohe = ohe.transform(le.inverse_transform(test_prediction.astype(int)).reshape(-1, 1))
    
    f1 = f1_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                  test_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                        test_prediction_ohe)
    for index in range(test_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_test_index[index],
                                                   'test',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'test_{dim_reduction_type}']['le'][index],
                                                   test_prediction[index]]

    summary_table.loc[summary_table.shape[0]] = ['test',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]
    
    time_table.loc[time_table.shape[0]] = [we_type,
                                             dim_reduction_type,
                                             classifier_model,
                                             time.time() - timer]
    
    print(f'Elapsed time for {we_type}-{dim_reduction_type} {classifier_model} training: {(time.time() - timer)/60:0.2f} min')

##### 2-3. XGB

In [13]:
def XGB_process(X_data: Dict,
               Y_data: Dict,
               we_type: str,
               dim_reduction_type: str,
               time_table: pd.DataFrame,
               result_table: pd.DataFrame,
               summary_table: pd.DataFrame) -> None:
    # 변수 정의
    classifier_model = 'XGB'
    
    # 시간 측정
    timer = time.time()

    # 하이퍼파라미터 탐색공간 설정
    param_space = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'max_depth': (1, 50),
        'n_estimators': (50, 200),
        'min_child_weight': (1, 10),
        'subsample': (0.1, 1.0, 'uniform'),
        'gamma': (0.1, 5.0),
        'colsample_bytree': (0.1, 1.0, 'uniform'),
        'alpha': (0.0, 1.0, 'uniform'),
        'reg_lambda': (0.0, 1.0, 'uniform')
    }

    # 하이퍼파라미터 튜닝
    xgb_model = xgb.XGBClassifier(objective='multi:softmax')
    opt = BayesSearchCV(xgb_model, param_space, n_iter=50, cv=10, n_jobs=-1, n_points=8, verbose=0)

    # timer = time.time()
    # opt.fit(X_train, y_train_le.ravel())
    opt.fit(X_data[we_type][f'train_{dim_reduction_type}'],
            Y_data[we_type][f'train_{dim_reduction_type}']['le'])

    # 모델 학습
    xgb_model = xgb.XGBClassifier(objective='multi:softmax', **opt.best_params_)
    xgb_model.fit(X_data[we_type][f'train_{dim_reduction_type}'],
                  Y_data[we_type][f'train_{dim_reduction_type}']['le'])

    # 학습데이터 추론
    train_prediction = xgb_model.predict(X_data[we_type][f'train_{dim_reduction_type}'])
    train_prediction_ohe = ohe.transform(le.inverse_transform(train_prediction.astype(int)).reshape(-1, 1))

    f1 = f1_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                  train_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                        train_prediction_ohe)
    
    for index in range(train_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_train_index[index],
                                                   'train',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'train_{dim_reduction_type}']['le'][index],
                                                   train_prediction[index]]

    summary_table.loc[result_table.shape[0]] = ['train',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]

    # 검증데이터 추론
    test_prediction = xgb_model.predict(X_data[we_type][f'test_{dim_reduction_type}'])
    test_prediction_ohe = ohe.transform(le.inverse_transform(test_prediction.astype(int)).reshape(-1, 1))
    
    f1 = f1_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                  test_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                        test_prediction_ohe)
    for index in range(test_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_test_index[index],
                                                   'test',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'test_{dim_reduction_type}']['le'][index],
                                                   test_prediction[index]]

    summary_table.loc[summary_table.shape[0]] = ['test',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]
    
    time_table.loc[time_table.shape[0]] = [we_type,
                                             dim_reduction_type,
                                             classifier_model,
                                             time.time() - timer]
    
    print(f'Elapsed time for {we_type}-{dim_reduction_type} {classifier_model} training: {(time.time() - timer)/60:0.2f} min')

##### 2-4. SVM

In [14]:
def SVM_process(X_data: Dict,
                Y_data: Dict,
                we_type: str,
                dim_reduction_type: str,
                time_table: pd.DataFrame,
                result_table: pd.DataFrame,
                summary_table: pd.DataFrame) -> None:
    # 변수 정의
    classifier_model = 'SVM'
    
    # 시간 측정
    timer = time.time()

    # 하이퍼파라미터 탐색공간 설정
    param_space = {
        'C': (1, 10000),
        'gamma': (0.01, 1.0)
    }

    # 하이퍼파라미터 튜닝
    svc_rbf = SVC(kernel='rbf')
    opt = BayesSearchCV(svc_rbf, param_space, n_iter=50, cv=10, n_jobs=-1, n_points=8, verbose=0)
    opt.fit(X_data[we_type][f'train_{dim_reduction_type}'],
            Y_data[we_type][f'train_{dim_reduction_type}']['le'])

    # 모델 학습
    svc_rbf = SVC(kernel='rbf', **opt.best_params_)
    svc_rbf.fit(X_data[we_type][f'train_{dim_reduction_type}'],
                Y_data[we_type][f'train_{dim_reduction_type}']['le'])

    # 학습데이터 추론
    train_prediction = svc_rbf.predict(X_data[we_type][f'train_{dim_reduction_type}'])
    train_prediction_ohe = ohe.transform(le.inverse_transform(train_prediction.astype(int)).reshape(-1, 1))

    f1 = f1_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                  train_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'train_{dim_reduction_type}']['ohe'],
                        train_prediction_ohe)
    
    for index in range(train_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_train_index[index],
                                                   'train',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'train_{dim_reduction_type}']['le'][index],
                                                   train_prediction[index]]

    summary_table.loc[result_table.shape[0]] = ['train',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]

    # 검증데이터 추론
    test_prediction = svc_rbf.predict(X_data[we_type][f'test_{dim_reduction_type}'])
    test_prediction_ohe = ohe.transform(le.inverse_transform(test_prediction.astype(int)).reshape(-1, 1))
    
    f1 = f1_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                  test_prediction_ohe,
                  average='weighted')
    auc = roc_auc_score(Y_data[we_type][f'test_{dim_reduction_type}']['ohe'],
                        test_prediction_ohe)
    for index in range(test_prediction.shape[0]):
        result_table.loc[result_table.shape[0]] = [X_test_index[index],
                                                   'test',
                                                   we_type,
                                                   dim_reduction_type,
                                                   classifier_model,
                                                   Y_data[we_type][f'test_{dim_reduction_type}']['le'][index],
                                                   test_prediction[index]]

    summary_table.loc[summary_table.shape[0]] = ['test',
                                                we_type,
                                                dim_reduction_type,
                                                classifier_model,
                                                f1,
                                                auc]
    
    time_table.loc[time_table.shape[0]] = [we_type,
                                             dim_reduction_type,
                                             classifier_model,
                                             time.time() - timer]
    
    print(f'Elapsed time for {we_type}-{dim_reduction_type} {classifier_model} training: {(time.time() - timer)/60:0.2f} min')

##### 3. Experiments

In [15]:
we_types = ['tfidf', 'w2v', 'bert']
dim_reduction_types = ['raw', 'pca', 'lda']
classifier_functions = [LR_process, RF_process, XGB_process, SVM_process]

for we_type in we_types:
    for dim_reduction_type in dim_reduction_types:
        for classifier_function in classifier_functions:
            classifier_function(X_data=X_data,
                                Y_data=Y_data,
                                we_type=we_type,
                                dim_reduction_type=dim_reduction_type,
                                time_table=time_table,
                                result_table=result_table,
                                summary_table=summary_table)

Elapsed time for tfidf-raw LR training: 0.01 min




Elapsed time for tfidf-raw RF training: 0.22 min




Elapsed time for tfidf-raw XGB training: 4.65 min




Elapsed time for tfidf-raw SVM training: 4.45 min
Elapsed time for tfidf-pca LR training: 0.01 min




Elapsed time for tfidf-pca RF training: 0.20 min




Elapsed time for tfidf-pca XGB training: 7.16 min




Elapsed time for tfidf-pca SVM training: 0.21 min
Elapsed time for tfidf-lda LR training: 0.01 min




Elapsed time for tfidf-lda RF training: 0.28 min




Elapsed time for tfidf-lda XGB training: 0.40 min




Elapsed time for tfidf-lda SVM training: 0.17 min
Elapsed time for w2v-raw LR training: 0.01 min




Elapsed time for w2v-raw RF training: 0.24 min




Elapsed time for w2v-raw XGB training: 3.77 min




Elapsed time for w2v-raw SVM training: 0.19 min
Elapsed time for w2v-pca LR training: 0.01 min




Elapsed time for w2v-pca RF training: 0.24 min




Elapsed time for w2v-pca XGB training: 1.40 min




Elapsed time for w2v-pca SVM training: 0.16 min
Elapsed time for w2v-lda LR training: 0.01 min




Elapsed time for w2v-lda RF training: 0.32 min




Elapsed time for w2v-lda XGB training: 0.45 min




Elapsed time for w2v-lda SVM training: 0.17 min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time for bert-raw LR training: 0.01 min




Elapsed time for bert-raw RF training: 0.31 min




Elapsed time for bert-raw XGB training: 16.76 min




Elapsed time for bert-raw SVM training: 0.24 min
Elapsed time for bert-pca LR training: 0.01 min




Elapsed time for bert-pca RF training: 0.23 min




Elapsed time for bert-pca XGB training: 1.32 min




Elapsed time for bert-pca SVM training: 0.19 min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time for bert-lda LR training: 0.01 min




Elapsed time for bert-lda RF training: 0.33 min




Elapsed time for bert-lda XGB training: 0.41 min




Elapsed time for bert-lda SVM training: 0.18 min


#### 4. 결과 저장

In [16]:
summary_table.pivot_table(index=['data type', 'we type', 'dim reduction', 'classifier'],
                         values=['f1', 'auc'],
                         aggfunc=['mean']).to_excel('result/result_summary.xlsx')

time_table.pivot_table(index=['we type', 'dim reduction', 'classifier'],
                         values=['time'],
                         aggfunc=['mean']).to_excel('result/elapsed_time.xlsx')

result_table.to_excel('result/result_table.xlsx')

# 완료 Notification
SMTPSender.send_mail(subject='[Kei] pss process has been done', message='Job done.')

In [None]:
group_mapper = {'design': 'Production',
                'feature': 'Production',
                'bugs': 'Production',
                'optimization': 'Production',
                'tool': 'Production',
                'time': 'Production',
                'testing': 'Production',
                'marketing': 'Business',
                'unrelated': 'ETC',
                'planning': 'Management',
                'communication': 'Management',
                'budget': 'Management',
                'team': 'Management',
                'documentation': 'Management'}