In [1]:
import pandas as pd
import numpy as np

데이터 불러오기

In [2]:
with open('2016_filtered_review_part.txt', encoding='utf-8') as f:
    docs = [doc.strip().split('\t\t') for doc in f]
    docs = [(doc[1], int(doc[2])) for doc in docs if len(doc) == 3]
    # To read the second and third column info from each row
    texts, scores = zip(*docs)
    # 둘을 분리해서 별도의 list 변수로 저장

평점 정보를 이용해서 종속변수 레이블링 하기

In [3]:
filtered_texts = []
filtered_labels = []

for text, score in zip(texts, scores):
    if 4 < score < 8:
        continue
        
    # 평점 기준으로 문서에 label을 부여
    # 1 ~ 4 -> 부정, 0
    # 8 ~ 10 -> 긍정, 1
    filtered_texts.append(text)
    filtered_labels.append(1 if score >= 8 else 0)

In [4]:
# To split the data into training and test datasets
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(filtered_texts, filtered_labels, test_size=0.1, random_state=42)

In [5]:
values, n_samples = np.unique(train_labels, return_counts=True)
print('0: {0} \n1: {1} \ntotal: {2}'.format(n_samples[0], n_samples[1], len(train_labels)))

0: 1485 
1: 16076 
total: 17561


In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer() 
tf_train_features = tf_vectorizer.fit_transform(train_texts) 
tf_test_features = tf_vectorizer.transform(test_texts)

In [8]:
lr_tf_l2 = LogisticRegression(C=0.1, penalty='l2', solver='saga', max_iter=10000)

In [9]:
lr_tf_l2.fit(tf_train_features, train_labels) # 학습

In [10]:
pred_labels_tf_l2 = lr_tf_l2.predict(tf_test_features) # 예측

In [11]:
from sklearn.metrics import classification_report
print(classification_report(test_labels, pred_labels_tf_l2))

              precision    recall  f1-score   support

           0       0.70      0.19      0.30       138
           1       0.94      0.99      0.97      1814

    accuracy                           0.94      1952
   macro avg       0.82      0.59      0.63      1952
weighted avg       0.92      0.94      0.92      1952



In [12]:
pred_probs = lr_tf_l2.predict_proba(tf_test_features)

In [13]:
from sklearn.metrics import roc_auc_score
auc_score = roc_auc_score(test_labels, pred_probs[:,1])
auc_score

0.876394148570698

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels, pred_labels_tf_l2)

array([[  26,  112],
       [  11, 1803]], dtype=int64)

# Cost Sensitive 방법

In [15]:
LR_params = {
    'class_weight': [{0:3,1:1}, {0:2,1:1}, {0:1,1:1}, {0:1,1:2}, {0:1,1:3}, 'balanced']
}

In [16]:
lr_cs = LogisticRegression(C=0.1, penalty='l2', solver='saga', max_iter=10000)

In [17]:
from sklearn.model_selection import GridSearchCV
gs_cs = GridSearchCV(lr_cs, param_grid=LR_params, cv=5)

In [18]:
gs_cs.fit(tf_train_features, train_labels)

In [19]:
gs_cs.best_params_

{'class_weight': {0: 2, 1: 1}}

In [20]:
best_model = gs_cs.best_estimator_
# Best estimator (모형)을 return합니다. 
pred_labels_cs = best_model.predict(tf_test_features)

In [21]:
print(classification_report(test_labels, pred_labels_cs))

              precision    recall  f1-score   support

           0       0.65      0.30      0.41       138
           1       0.95      0.99      0.97      1814

    accuracy                           0.94      1952
   macro avg       0.80      0.65      0.69      1952
weighted avg       0.93      0.94      0.93      1952



In [22]:
pred_probs_cs = best_model.predict_proba(tf_test_features)

In [23]:
auc_score_cs = roc_auc_score(test_labels, pred_probs_cs[:,1])
auc_score_cs

0.880688445744052

In [24]:
confusion_matrix(test_labels, pred_labels_cs)

array([[  42,   96],
       [  23, 1791]], dtype=int64)