In [176]:
import re
import pandas as pd
import numpy as np
import json
import os

DATA_IN_PATH='./data_in/'
TRAIN_CLEAN_DATA='train_clean.csv'

train_data=pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

reviews=list(train_data['review'])
sentiments=list(train_data['sentiment'])

sentences=[]
for review in reviews:
    sentences.append(review.split())

In [177]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [178]:
# 학습 시 필요한 하이퍼 파라미터

# 워드 벡터 특징값 수
num_features=300
# 단어에 대한 최소 빈도 수
min_word_count=40
# 프로세스 개수
num_workers=4
# 컨텍스트 윈도우 크기
context=10
# 다운 샘플링 비율
downsampling=1e-3

In [179]:
from gensim.models import word2vec
print("Training model...")
model=word2vec.Word2Vec(sentences,
                       workers=num_workers,
                       size=num_features,
                       min_count=min_word_count,
                       window=context,
                       sample=downsampling)

2020-12-14 20:32:51,093 : INFO : collecting all words and their counts
2020-12-14 20:32:51,094 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-12-14 20:32:51,267 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types


Training model...


2020-12-14 20:32:51,435 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2020-12-14 20:32:51,522 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2020-12-14 20:32:51,523 : INFO : Loading a fresh vocabulary
2020-12-14 20:32:51,561 : INFO : effective_min_count=40 retains 8160 unique words (11% of original 74065, drops 65905)
2020-12-14 20:32:51,562 : INFO : effective_min_count=40 leaves 2627273 word corpus (87% of original 2988089, drops 360816)
2020-12-14 20:32:51,582 : INFO : deleting the raw counts dictionary of 74065 items
2020-12-14 20:32:51,584 : INFO : sample=0.001 downsamples 30 most-common words
2020-12-14 20:32:51,584 : INFO : downsampling leaves estimated 2494384 word corpus (94.9% of prior 2627273)
2020-12-14 20:32:51,601 : INFO : estimated required memory for 8160 words and 300 dimensions: 23664000 bytes
2020-12-14 20:32:51,602 : INFO : resetting layer weights
2020-12-14 20:32:52,866 : INFO : t

In [180]:
# 모델의 하이퍼파라미터를 설정한 내용을 모델 이름에 담는다면 나중에 참고하기 좋음
# 모델을 저장하면 Word2Vec.load()로 모델을 다시 사용 가능
model_name="300features_40minwords_10context"
model.save(model_name)

2020-12-14 20:33:10,220 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2020-12-14 20:33:10,221 : INFO : not storing attribute vectors_norm
2020-12-14 20:33:10,222 : INFO : not storing attribute cum_table
2020-12-14 20:33:10,366 : INFO : saved 300features_40minwords_10context


In [181]:
print(model)

Word2Vec(vocab=8160, size=300, alpha=0.025)


In [182]:
def get_features(words, model, num_features):
    # 출력 벡터 초기화
    feature_vector=np.zeros((num_features), dtype=np.float32)
    
    num_words=0
    # 어휘사전 준비
    index2word_set=set(model.wv.index2word)
    
    for w in words:
        if w in index2word_set:
            num_words+=1
            # 사전에 해당하는 단어에 대해 단어 벡터를 더함
            feature_vector=np.add(feature_vector, model[w])
    
    # 문장의 단어 수만큼 나누어 단어 벡터의 평균값을 문장 벡터로 함
    feature_vector=np.divide(feature_vector, num_words)
    return feature_vector

In [183]:
def get_dataset(reviews, model, num_features):
    dataset=list()
    
    for s in reviews:
        dataset.append(get_features(s, model, num_features))
    
    reviewFeatureVecs=np.stack(dataset)
    
    return reviewFeatureVecs

In [187]:
train_data_vecs=get_dataset(sentences, model, num_features)

  del sys.path[0]


In [188]:
from sklearn.model_selection import train_test_split

X=train_data_vecs
y=np.array(sentiments)

RANDOM_SEED=42
TEST_SPLIT=0.2

X_train, X_eval, y_train, y_eval=train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [189]:
from sklearn.linear_model import LogisticRegression

lgs=LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(class_weight='balanced')

In [190]:
# 검증 데이터로 성능 측정
print("Accuracy: %f" % lgs.score(X_eval, y_eval))

Accuracy: 0.865000


In [191]:
TEST_CLEAN_DATA='test_clean.csv'

test_data=pd.read_csv(DATA_IN_PATH+TEST_CLEAN_DATA)

test_review=list(test_data['review'])

In [192]:
test_sentences=[]
for review in test_data:
    test_sentences.append(review.split())

In [193]:
test_data_vecs=get_dataset(test_sentences, model, num_features)

  del sys.path[0]


In [194]:
DATA_OUT_PATH='./data_out/'

test_predicted=lgs.predict(test_data_vecs)

if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)
    
ids=list(test_data['id'])
answer_dataset=pd.DataFrame({'id':ids, 'sentiment':test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH+'lgs_answer.csv',index=False)

ValueError: arrays must all be same length

In [79]:
answer_dataset.to_csv(DATA_OUT_PATH+'lgs_answer.csv',index=False)

In [195]:
print(test_data_vecs)

[[-9.72986460e-01  6.31504834e-01 -3.57771724e-01 -2.81789601e-01
  -1.21047187e+00 -1.55673996e-01 -9.60967839e-01  1.17620265e+00
   4.61706102e-01  3.65030825e-01  5.66152573e-01 -3.85412216e-01
   4.48002726e-01  8.75946641e-01 -5.48009157e-01 -8.99554133e-01
  -4.81081232e-02  3.30140620e-01  7.95071125e-01  1.69979185e-02
   1.64716673e+00  6.37997508e-01  1.42131448e-02 -1.53598130e-01
   6.97971284e-01 -1.20773897e-01  3.51476014e-01 -4.56965476e-01
  -9.45024371e-01 -6.73180640e-01  3.69526148e-01 -1.06277108e-01
   1.23835906e-01  8.41993570e-01 -8.98249567e-01 -8.46701860e-01
  -7.98762739e-01  8.82960916e-01  5.56514263e-02  6.63250566e-01
  -4.59827751e-01 -5.96438348e-01 -7.60984242e-01 -7.30535686e-01
   2.12584510e-02 -3.92530799e-01 -4.57641751e-01 -1.55915523e+00
  -3.82994622e-01  9.00938094e-01 -8.54328632e-01  1.82578242e+00
  -1.16373754e+00  1.24283887e-01  2.36625239e-01 -5.32458484e-01
  -1.03909917e-01  8.09370577e-01  4.96192098e-01  3.40666920e-01
  -2.38554

In [142]:
print(len(test_data_vecs[1]))

300


In [None]:
test_data_vecs[0]