In [1]:
import pandas as pd

DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

sentences = []
for review in reviews:
    sentences.append(review.split())

In [2]:
# word2vec 벡터화
# 하이퍼파라미터 정의
num_features = 300    # 워드 벡터 특징값 수 -> 한개의 단어가 해당 특징 값 수로 표현된 벡터로 구성
min_word_count = 40   # 단어에 대한 최소 빈도 수 -> 해당 값 이하로 사용되는 단어는 배제
num_workers = 4       # 프로세스 개수
context = 10          # 컨텍스트 윈도 크기
downsampling = 1e-3   # 다운 샘플링 비율

In [8]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.2.0.tar.gz (23.2 MB)
     |████████████████████████████████| 23.2 MB 142 kB/s             
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting smart_open>=1.8.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
     |████████████████████████████████| 57 kB 4.6 MB/s             
[?25hBuilding wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25ldone
[?25h  Created wheel for gensim: filename=gensim-4.2.0-cp36-cp36m-linux_x86_64.whl size=25519652 sha256=cea6ef687dda7829d1fb420f1d4661a2b87c8a2f6be8cb8778e05a90308d5e3b
  Stored in directory: /root/.cache/pip/wheels/44/1e/2b/b0056a533d057c3ed56c84fbdd79cca690496f4cd7c03c157c
Successfully built gensim
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.2.0 smart-open-6.4.0


In [11]:
!pip install dataclasses

Collecting dataclasses
  Downloading dataclasses-0.8-py3-none-any.whl (19 kB)
Installing collected packages: dataclasses
Successfully installed dataclasses-0.8


In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
from gensim.models import word2vec
print("Training model...")

# 기본적으로 CBOW 방식으로 학습 (초기화 매개변수 sg=0 / sg=1일 경우 skip-gram 방식으로 학습)
model = word2vec.Word2Vec(sentences,
                          workers = num_workers,
                          vector_size = num_features,
                          min_count = min_word_count,
                          window = context,
                          sample = downsampling)

2023-10-06 07:27:36,094 : INFO : collecting all words and their counts
2023-10-06 07:27:36,095 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2023-10-06 07:27:36,399 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2023-10-06 07:27:36,673 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2023-10-06 07:27:36,809 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2023-10-06 07:27:36,810 : INFO : Creating a fresh vocabulary
2023-10-06 07:27:36,918 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 8160 unique words (11.02% of original 74065, drops 65905)', 'datetime': '2023-10-06T07:27:36.917412', 'gensim': '4.2.0', 'python': '3.6.9 (default, Nov  7 2019, 10:44:02) \n[GCC 8.3.0]', 'platform': 'Linux-5.15.0-84-generic-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2023-10-06 07:27:36,918 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 leaves 2627273 word corpus (87.92% of original 2988089, drops 360816)', 'datetime': '2023-10-06T07:27:36.918800', 'gensim': '4

In [7]:
model_name = "300features_40minwords_10context"
model.save(model_name)

2023-10-05 10:20:14,722 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-10-05T10:20:14.722685', 'gensim': '4.2.0', 'python': '3.6.9 (default, Nov  7 2019, 10:44:02) \n[GCC 8.3.0]', 'platform': 'Linux-5.15.0-84-generic-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2023-10-05 10:20:14,724 : INFO : not storing attribute cum_table
2023-10-05 10:20:14,790 : INFO : saved 300features_40minwords_10context


In [4]:
def get_features(words, model, num_features):
    # 출력 벡터 초기화
    feature_vector = np.zeros((num_features), dtype=np.float32)
    
    num_words = 0
    # 어휘 사전 준비
    index2word_set = set(model.wv.index_to_key)
    
    for w in words:
        if w in index2word_set:
            num_words += 1
            # 사전에 해당하는 단어에 대해 단어 벡터를 더함
            feature_vector = np.add(feature_vector, model.wv[w])
    # 문장의 단어 수만큼 나누어 단어 벡터의 평균값을 문장 벡터로 함
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector
# 단어 1개가 (300,)의 값으로 구성된 np배열
# 모든 문장의 벡터 값을 더하고, 단어의 개수만큼 나누어 평균을 냄
# 이렇게 하여 문장 1개를 (300,)의 값으로 구성된 np 배열로 출력 (단순한 방법)

In [8]:
def get_dataset(reviews, model, num_features):
    dataset = list()
    
    for s in reviews:
        dataset.append(get_features(s, model, num_features))
        
    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs
# reviews : 학습 데이터인 전체 리뷰 데이터를 입력하는 인자
# model : word2vec 모델을 입력하는 인자
# num_featrues : word2vec 모델 임베딩 차원 수

In [30]:
test_data_vecs = get_dataset(sentences, model, num_features)

In [34]:
print(len(sentences))
print(len(test_data_vecs))
print(test_data_vecs[0].shape) # 1개의 문장이 (300,)의 1차원 벡터로 구성됨

25000
25000
(300,)


In [35]:
# 데이터셋 분리
from sklearn.model_selection import train_test_split
import numpy as np
X = test_data_vecs
y = np.array(sentiments)

RANDOM_SEED = 42
TEST_SPLIT = 0.2

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [36]:
# 모델 학습 - 로지스틱 회귀
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(class_weight='balanced')

In [37]:
# 검증데이터로 성능 측정
print("Accuracy: %f" % lgs.score(X_eval, y_eval))

Accuracy: 0.866000


In [38]:
# 캐글 데이터 제출
TEST_CLEAN_DATA= 'test_clean.csv'

test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)

test_review = list(test_data['review'])

In [42]:
test_review[0]

'naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty'

In [41]:
len(test_review)

25000

In [43]:
# 하나의 문자열을 단어 리스트로 변환
test_sentences = []
for review in test_review:
    test_sentences.append(review.split())

In [44]:
# 전처리 함수
test_data_vecs = get_dataset(test_sentences, model, num_features)

In [45]:
DATA_OUT_PATH = './data_out/'

test_predicted = lgs.predict(test_data_vecs)

ids = list(test_data['id'])
answer_dataset = pd.DataFrame({'id':ids, 'sentiment':test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv', index=False, quoting=3)