## TFIDF를 활용한 모델 구현

In [1]:
import os

import pandas as pd
import numpy as np

In [3]:
TRAIN_CLEAN_DATA = 'train_clean.csv'
DATA_IN_PATH = './data_in/' 

In [14]:
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA, header=0)

In [17]:
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

### TF-IDF Vectorization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf = True, ngram_range=(1,3), max_features = 5000)     

X = vectorizer.fit_transform(reviews)

In [25]:
X.shape

(25000, 5000)

In [37]:
print(len(train_data['review'][3]))
train_data['review'][3]

1459


'must assumed praised film \\ greatest filmed opera ever \\ read somewhere either care opera care wagner care anything except desire appear cultured either representation wagner swan song movie strikes unmitigated disaster leaden reading score matched tricksy lugubrious realisation text questionable people ideas opera matter play especially one shakespeare \\ about\\ allowed anywhere near theatre film studio syberberg fashionably without smallest justification wagner text decided parsifal \\ about\\ bisexual integration title character latter stages transmutes kind beatnik babe though one continues sing high tenor actors film singers get double dose armin jordan conductor seen face heard voice amfortas also appears monstrously double exposure kind batonzilla conductor ate monsalvat playing good friday music way transcendant loveliness nature represented scattering shopworn flaccid crocuses stuck ill laid turf expedient baffles theatre sometimes piece imperfections thoughts think syberb

### 학습과 검증 데이터셋 분리

In [38]:
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42
TEST_SPLIT = 0.2

y = np.array(sentiments)

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size = TEST_SPLIT, random_state = RANDOM_SEED)

In [42]:
print(X_train.shape)
print(y_train.shape)

(20000, 5000)
(20000,)


In [46]:
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight = 'balanced')
lgs.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

class_weight를 balance로 설정해서 각 라벨에 대해 균형 있게 학습할 수 있게 했다.

## 검증 데이터 성능 평가

In [81]:
print("Accuracy: {}".format(lgs.score(X_eval, y_eval)))

Accuracy: 0.8578


## 평가 데이터 성능 평가

In [58]:
TEST_CLEAN_DATA = 'test_clean.csv'

test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)

In [62]:
testDataVecs = vectorizer.transform(test_data['review'])

fit을 호출하지 않았다. fit은 학습 데이터에 맞게 설정하고 그 설정에 맞게 평가 데이터도 변환하면 된다.

In [64]:
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[0 1 0 ... 0 0 1]


In [68]:
DATA_OUT_PATH = './data_out/'

if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)
    
ids = list(test_data['id'])
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv')

In [80]:
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=3)