In [1]:
import pandas as pd
import numpy as np

In [2]:
# Each line of the review file is split on a double tab; only lines that yield
# exactly three fields are kept. The second field is stored as the review text
# and the third is parsed as an integer rating.
# NOTE: `scores` here holds the ratings; the same name is rebound later in the
# notebook by the cross-validation cell.
with open('2016_filtered_review_part.txt', encoding='utf-8') as f:
    docs = [
        (fields[1], int(fields[2]))
        for fields in (line.strip().split('\t\t') for line in f)
        if len(fields) == 3
    ]
    # Unzip the (text, rating) pairs into two separate parallel sequences.
    texts, scores = zip(*docs)

In [3]:
# Assign a sentiment label to each review from its rating and drop the
# ambiguous middle band:
#   1 ~ 4  -> negative, label 0
#   8 ~ 10 -> positive, label 1
#   5 ~ 7  -> discarded
#
# Iterate over `docs` (the (text, rating) pairs built when the file was read)
# instead of zip(texts, scores): the name `scores` is rebound to the
# cross-validation result array further down the notebook, so re-running this
# cell after that point would silently label the wrong data. `docs` is never
# reassigned, which keeps this cell safe to re-run at any time.
filtered_texts = []
filtered_labels = []

for text, score in docs:
    if 4 < score < 8:
        continue

    filtered_texts.append(text)
    filtered_labels.append(1 if score >= 8 else 0)

In [4]:
# Hold out 10% of the labelled reviews as a test set; the fixed random_state
# keeps the split reproducible across runs.
# NOTE(review): the split is not stratified by label — consider whether class
# proportions should be preserved between train and test.
from sklearn.model_selection import train_test_split

(train_texts, test_texts,
 train_labels, test_labels) = train_test_split(
    filtered_texts,
    filtered_labels,
    test_size=0.1,
    random_state=42,
)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Term-count (bag-of-words) features with default CountVectorizer settings.
tf_vectorizer = CountVectorizer() 
# Fit the vocabulary on the training texts only ...
tf_train_features = tf_vectorizer.fit_transform(train_texts) 
# ... and reuse that fitted vocabulary to encode the test texts.
tf_test_features = tf_vectorizer.transform(test_texts)

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# L1-penalised logistic regression (C=1) fitted with the 'saga' solver.
# NOTE(review): no random_state is passed; scikit-learn documents that 'saga'
# uses random_state to shuffle the data, so scores may differ slightly between
# runs — consider fixing a seed.
lr_p1 = LogisticRegression(C=1, penalty='l1', solver='saga', max_iter=10000) 

# K-fold cross-validation

In [8]:
from sklearn.model_selection import KFold
# Plain (non-stratified) 10-fold splitter; rows are shuffled with a fixed seed
# so the fold assignment is reproducible.
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [9]:
from sklearn.model_selection import cross_val_score
# Cross-validate the L1 model on the training features using the KFold splitter.
# NOTE: this rebinds `scores`, which until now held the review ratings read
# from the data file.
scores = cross_val_score(lr_p1, tf_train_features, train_labels, cv=cv)

In [10]:
# Per-fold scores returned by cross_val_score for lr_p1 (one entry per fold).
scores

array([0.94137735, 0.93621868, 0.93963554, 0.94077449, 0.93792711,
       0.93906606, 0.94931663, 0.93394077, 0.93507973, 0.94020501])

In [11]:
# Average of the per-fold scores for the L1 model.
scores.mean()

0.9393541356863466

In [12]:
# Same configuration as lr_p1 but with an L2 penalty, for comparison.
# NOTE(review): no random_state is passed; 'saga' shuffles the data, so scores
# may differ slightly between runs — consider fixing a seed.
lr_p2 = LogisticRegression(C=1, penalty='l2', solver='saga', max_iter=10000) 

In [13]:
# Cross-validate the L2 model with the same KFold splitter used for lr_p1.
scores2 = cross_val_score(lr_p2, tf_train_features, train_labels, cv=cv)

In [14]:
# Average of the per-fold scores for the L2 model.
scores2.mean()

0.9404359133592541

# Stratified K-fold

In [15]:
from sklearn.model_selection import StratifiedKFold
# Stratified 10-fold splitter, shuffled with a fixed seed for reproducible
# fold assignment.
scv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

In [16]:
# Cross-validate the L1 model again, this time with the stratified splitter.
scores3 = cross_val_score(lr_p1, tf_train_features, train_labels, cv=scv)

In [17]:
# Average of the per-fold scores under stratified K-fold.
scores3.mean()

0.9378167771478354

StratifiedKFold 클래스를 사용하지 않고 계층적 K-fold 교차 검증하기: 분류 모형에 정수 값(`cv=10`)을 전달하면 `cross_val_score`가 내부적으로 셔플 없이 StratifiedKFold를 사용합니다.

In [18]:
# Pass the number of folds as a plain integer instead of a splitter object.
# Per the scikit-learn docs, an integer cv with a classifier and binary or
# multiclass labels uses StratifiedKFold internally.
scores4 = cross_val_score(lr_p1, tf_train_features, train_labels, cv=10)

In [19]:
# Average of the per-fold scores when cv is given as an integer.
scores4.mean()

0.9383859939350959

# Grid search

In [20]:
# Base model for the grid search; penalty and C are left unset here because
# they are supplied by the search's parameter grid.
lr_model = LogisticRegression(solver='saga', max_iter=10000) 

In [21]:
from sklearn.model_selection import GridSearchCV
# Search space: 2 penalties x 4 regularisation strengths = 8 candidate settings.
LR_params = {
    'penalty' : ['l1', 'l2'],
    'C': [0.1, 0.5, 1, 2]
}

In [22]:
# 5-fold grid search over LR_params, ranking candidates by macro-averaged F1
# (note: the cross_val_score cells earlier did not pass an explicit scoring).
grid_search = GridSearchCV(lr_model, scoring='f1_macro', param_grid=LR_params, cv=5)

In [23]:
# Run the search on the training data only; the test set is not touched here.
grid_search.fit(tf_train_features, train_labels)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=10000, solver='saga'),
             param_grid={'C': [0.1, 0.5, 1, 2], 'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [29]:
# Mean cross-validated macro-F1 for each candidate; entries line up
# index-by-index with cv_results_['params'].
grid_search.cv_results_['mean_test_score']

array([0.65612093, 0.66837602, 0.72744873, 0.72822862, 0.74056478,
       0.74542467, 0.75410575, 0.7539371 ])

In [28]:
# Parameter settings of each candidate, in the same order as the score arrays
# in cv_results_.
grid_search.cv_results_['params']

[{'C': 0.1, 'penalty': 'l1'},
 {'C': 0.1, 'penalty': 'l2'},
 {'C': 0.5, 'penalty': 'l1'},
 {'C': 0.5, 'penalty': 'l2'},
 {'C': 1, 'penalty': 'l1'},
 {'C': 1, 'penalty': 'l2'},
 {'C': 2, 'penalty': 'l1'},
 {'C': 2, 'penalty': 'l2'}]

In [25]:
# Parameter setting that achieved the highest mean cross-validated score.
grid_search.best_params_

{'C': 2, 'penalty': 'l1'}

In [32]:
best_model = grid_search.best_estimator_
# best_estimator_ returns the best model found by the search (with the default
# refit=True, scikit-learn refits it on the full training data passed to fit).
# Use it to predict labels for the held-out test features.
pred_labels = best_model.predict(tf_test_features)

In [33]:
from sklearn.metrics import accuracy_score
# Accuracy of the selected model on the held-out test set.
# NOTE: the test set is heavily imbalanced (138 negative vs 1814 positive in
# the classification report), so accuracy alone overstates performance on the
# negative class.
accuracy_score(test_labels, pred_labels)

0.9431352459016393

In [34]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test set; print() is used because
# classification_report returns a pre-formatted multi-line string.
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.66      0.41      0.50       138
           1       0.96      0.98      0.97      1814

    accuracy                           0.94      1952
   macro avg       0.81      0.69      0.74      1952
weighted avg       0.94      0.94      0.94      1952

