### 필요한 Library 불러오기

In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

import rich 
from rich.table import Table

### 데이터 불러오기

In [2]:
# Load the training set, the test set, and the submission template, in that
# order; each path is read with pandas' default CSV settings.
train_df, test_df, submission = map(
    pd.read_csv,
    ("./data/train.csv", "./data/test.csv", "./data/sample_submission.csv"),
)

### 전처리

In [3]:
# Collapse every run of whitespace into a single space.

def preprocess(text: str) -> str:
    """Return ``text`` with each run of whitespace replaced by one space.

    The pattern ``\\s+`` matches one or more whitespace characters, so tabs
    and newlines are normalised to a plain space as well, not only repeated
    spaces. Leading and trailing whitespace is collapsed, not stripped.

    Args:
        text: The raw document string.

    Returns:
        The whitespace-normalised string.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r"\s+", " ", text)

# Normalise whitespace in the "document" column of both data splits.
train_df["document"] = train_df["document"].apply(preprocess)
test_df["document"] = test_df["document"].apply(preprocess)

### 파이프라인 및 모델 정의

In [4]:
def get_pipe(model, model_name: str) -> Pipeline:
    """Return a pipeline that feeds TF-IDF features into ``model``.

    The first step, named ``"tfidf"``, is a ``TfidfVectorizer`` over
    character n-grams of length 1 to 3 with sublinear term-frequency
    scaling. The second step is ``model`` itself, registered under
    ``model_name``.

    Args:
        model: The estimator to use as the final pipeline step.
        model_name: The step name given to ``model`` inside the pipeline.

    Returns:
        The assembled, unfitted ``Pipeline``.
    """
    steps = [
        (
            "tfidf",
            TfidfVectorizer(analyzer="char", ngram_range=(1, 3), sublinear_tf=True),
        ),
        (model_name, model),
    ]
    return Pipeline(steps)

def return_kfold_accuarcy(model, k: int = 5) -> float:
    """Cross-validate ``model`` on ``train_df`` and return its mean accuracy.

    Reads the module-level ``train_df`` ("document" as input, "label" as
    target), splits it with a shuffled ``StratifiedKFold`` (seed 42), and for
    each fold refits ``model`` on the training part and scores it on the
    held-out part. A progress line is printed after every fold.

    Args:
        model: Estimator or pipeline exposing ``fit`` and ``predict``; it is
            refitted in place on every fold.
        k: Number of folds.

    Returns:
        The mean of the per-fold accuracy scores.
    """
    splitter = StratifiedKFold(k, shuffle=True, random_state=42)
    documents, labels = train_df["document"], train_df["label"]

    fold_scores = []
    fold_indices = splitter.split(documents, labels)
    for fold_no, (fit_idx, val_idx) in enumerate(fold_indices, start=1):
        model.fit(documents.iloc[fit_idx], labels.iloc[fit_idx])
        predicted = model.predict(documents.iloc[val_idx])
        fold_scores.append(accuracy_score(labels.iloc[val_idx], predicted))
        print(f'{fold_no}th complete!')

    return np.mean(fold_scores)

## 모델 튜닝 및 정의

In [345]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_recall_curve, f1_score, auc

def pr_auc_score(y_true, y_score):
    """Return the area under the precision-recall curve.

    Args:
        y_true: Ground-truth labels, passed to ``precision_recall_curve``.
        y_score: Predicted scores for the same samples.

    Returns:
        The value of ``auc`` computed with recall on the x-axis and
        precision on the y-axis.
    """
    # The thresholds returned alongside the curve are not needed for the area.
    precisions, recalls, _ = precision_recall_curve(y_true, y_score)
    return auc(recalls, precisions)


# Scorer wrapper around pr_auc_score, referenced by the grid searches that are
# kept as comments further down.
# NOTE(review): `needs_proba` appears to be deprecated in newer scikit-learn
# releases in favour of `response_method` — confirm against the installed
# version before re-enabling those searches.
pr_auc_scorer = make_scorer(
    pr_auc_score,
    greater_is_better=True,
    needs_proba=True,
)

# Vectorise the training documents once, with the same TF-IDF settings that
# get_pipe uses; `trainvec` is the input the commented-out grid searches below
# are written against.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    sublinear_tf=True,
)
trainvec = vectorizer.fit_transform(train_df["document"])

In [1]:
# param_grid_nb={'alpha':[1e-2, 1e-1, 0.5, 1, 2, 5, 1e1, 20, 1e2,]}
# nb_cv = GridSearchCV(BernoulliNB(), param_grid_nb,
#                      cv=5, scoring = pr_auc_scorer, verbose=2)
# nb_cv.fit(trainvec, train_df["label"])
# print(nb_cv.best_params_)

In [2]:
# param_grid_sgd = {'average': [True, False],
#               'l1_ratio': np.linspace(0, 1, num=10),
#               'alpha': np.power(10, np.arange(-4, 1, dtype=float))}

# sgd_cv = GridSearchCV(SGDClassifier(loss='hinge', penalty='elasticnet', fit_intercept=True,
#                           random_state=42, n_jobs=-1,
#                           early_stopping=True, validation_fraction=0.2),
#                       param_grid_sgd, cv=5, scoring = 'accuracy', verbose=2)

# sgd_cv.fit(trainvec, train_df["label"])
# print(sgd_cv.best_params_)

In [3]:
# param_grid_logistic={'C':[1e-2, 1e-1, 1, 2, 5, 1e1, 20, 1e2,]}
# logistic_cv = GridSearchCV(LogisticRegression(class_weight='balanced', penalty = 'l2', max_iter=500), param_grid_logistic,
#                            cv=5, scoring = pr_auc_scorer, verbose=2)
# logistic_cv.fit(trainvec, train_df["label"])
# print(logistic_cv.best_params_)

In [4]:
# param_grid_ada={'n_estimators' : [100,300,500], 'learning_rate' : [0.01,0.05,0.1]}
# ada_cv = GridSearchCV(AdaBoostClassifier(random_state=42),
#                      param_grid_ada,
#                      cv=5, scoring = pr_auc_scorer,
#                      n_jobs=-1, verbose=2)
# ada_cv.fit(trainvec, train_df["label"])
# print(ada_cv.best_params_)

In [5]:
# param_grid_rf={'criterion':["gini","entropy"] ,'max_features':['sqrt','log2'],'max_depth': [8,10,12,15], 'n_estimators' : [1000,1500,2000]}
# rf_cv = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
#                      param_grid_rf,
#                      cv=5, scoring = pr_auc_scorer,
#                      n_jobs=-1, verbose=2)
# rf_cv.fit(trainvec, train_df["label"])
# print(rf_cv.best_params_)

In [6]:
# param_grid_svc={'C':[1e-2, 1e-1, 1, 1e1], 'gamma' : [1e-2, 1e-1, 1, 1e1], 'kernel' : ['rbf', 'linear']}
# svc_cv = GridSearchCV(SVC(random_state=42, probability=True),
#                      param_grid_svc,
#                      cv=5, scoring = pr_auc_scorer,
#                       n_jobs=-1, verbose=2)
# svc_cv.fit(trainvec, train_df["label"])
# print(svc_cv.best_params_)

In [5]:
# Candidate estimators as (step name, estimator) pairs.
# NOTE(review): the hyperparameters all lie on the grids of the commented-out
# searches above and are presumably their best results — confirm.
models = [
    ("naive_bayes", BernoulliNB(alpha=0.5)),
    (
        "SGD",
        SGDClassifier(
            loss="hinge",
            penalty="elasticnet",
            fit_intercept=True,
            alpha=0.0001,
            average=True,
            l1_ratio=0.0,
            random_state=42,
            n_jobs=-1,
            early_stopping=True,
            validation_fraction=0.2,
        ),
    ),
    (
        "ada",
        AdaBoostClassifier(random_state=42, n_estimators=500, learning_rate=0.1),
    ),
    (
        "LR",
        LogisticRegression(
            class_weight="balanced",
            max_iter=500,
            random_state=42,
            C=100,
        ),
    ),
    ("SVC", SVC(random_state=42, C=10.0, gamma=0.1, kernel="rbf")),
]

# Wrap every estimator in its own TF-IDF pipeline, keeping the name alongside.
model_pipes = [(label, get_pipe(estimator, label)) for label, estimator in models]


### 모델 훈련

In [6]:
# Build a rich table with one row per model: its name and mean k-fold accuracy.
table = Table(title="Model Comparison Table")
table.add_column("Model Name", justify="left", style="green")
table.add_column("Accuracy", justify="right")

# Evaluate each pipeline in turn; tqdm shows overall progress and the banner
# separates the per-fold messages printed by return_kfold_accuarcy.
for model_name, model in tqdm(model_pipes, leave=False):
    print(f'##################### {model_name} #################### ')
    acc = return_kfold_accuarcy(model)
    # Accuracy is shown rounded to three decimal places.
    table.add_row(model_name, f"{acc:0.3f}")

rich.print(table)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

##################### naive_bayes #################### 
1th complete!
2th complete!
3th complete!
4th complete!


 20%|████████████████▊                                                                   | 1/5 [00:01<00:07,  1.76s/it]

5th complete!
##################### SGD #################### 
1th complete!
2th complete!
3th complete!
4th complete!


 40%|█████████████████████████████████▌                                                  | 2/5 [00:03<00:05,  1.78s/it]

5th complete!
##################### ada #################### 
1th complete!
2th complete!
3th complete!
4th complete!


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [01:24<00:50, 25.44s/it]

5th complete!
##################### LR #################### 
1th complete!
2th complete!
3th complete!
4th complete!


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [01:28<00:19, 19.16s/it]

5th complete!
##################### SVC #################### 
1th complete!
2th complete!
3th complete!
4th complete!


                                                                                                                       

5th complete!




In [7]:
# Build a new set of TF-IDF pipelines (one per candidate estimator) to serve as
# the base learners of the stacking ensemble. No final estimator is passed, so
# the library default is used.
stack_models = [(label, get_pipe(estimator, label)) for label, estimator in models]
stacking = StackingClassifier(estimators=stack_models, verbose=2, n_jobs=-1)

# Score the ensemble with the same k-fold routine used for the single models.
acc = return_kfold_accuarcy(stacking)
rich.print(acc)

1th complete!
2th complete!
3th complete!
4th complete!
5th complete!


### 결과물 생성

In [8]:
# Refit the stacking ensemble on the full training set, then label the test
# documents.
submission_pred = stacking.fit(train_df["document"], train_df["label"]).predict(
    test_df["document"]
)

In [9]:
# Reload the submission template and set its "label" column to the ensemble's
# predictions; the bare expression displays the frame in the notebook.
submission = pd.read_csv("./data/sample_submission.csv").assign(label=submission_pred)
submission

Unnamed: 0,id,label
0,1,0
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
4995,4996,0
4996,4997,0
4997,4998,1
4998,4999,0


In [10]:
# Write the filled-in submission to disk, omitting the DataFrame index column.
submission.to_csv("./data/sub_movie.csv", index=False)