In [5]:
# Load the corpus. A line is kept only when, after stripping, it splits on
# tabs into exactly two fields: the review text and an integer label.
with open('Korean_movie_reviews_2016.txt', encoding='utf-8') as f:
    docs = [
        (fields[0], int(fields[1]))
        for fields in (line.strip().split('\t') for line in f)
        if len(fields) == 2
    ]
    # Unzip the (text, label) pairs into two parallel tuples.
    texts, labels = zip(*docs)

In [6]:
# Tokenize every review on whitespace. str.split() with no arguments already
# ignores leading and trailing whitespace, so a separate strip() is not needed.
docs_words = [text.split() for text in texts]

In [7]:
# Preview the token lists of the first two reviews.
print(docs_words[:2])

[['부산', '행', '때문', '너무', '기대하고', '봤'], ['한국', '좀비', '영화', '어색하지', '않게', '만들어졌', '놀랍']]


In [8]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Wrap each token list in a TaggedDocument whose only tag is the review's
# position in docs_words, so a document vector can later be looked up by
# that integer index.
tagged_docs = [
    TaggedDocument(words=tokens, tags=[idx])
    for idx, tokens in enumerate(docs_words)
    # Identity comparison is the correct way to test for None (PEP 8);
    # `!= None` goes through __ne__ and can be overridden by the operand.
    if tokens is not None
]

In [9]:
# Inspect the first two TaggedDocument entries (words plus integer tag).
tagged_docs[:2]

[TaggedDocument(words=['부산', '행', '때문', '너무', '기대하고', '봤'], tags=[0]),
 TaggedDocument(words=['한국', '좀비', '영화', '어색하지', '않게', '만들어졌', '놀랍'], tags=[1])]

In [22]:
# Build and train the Doc2Vec model; passing the corpus to the constructor
# trains it right away, so no separate train() call follows.
model = Doc2Vec(
    tagged_docs,
    dm=1,             # PV-DM ("distributed memory") training algorithm
    vector_size=100,  # dimensionality of each document vector
    window=3,         # max distance between the current and predicted word
    min_count=3,      # ignore words whose total frequency is below 3
    negative=5,       # negative sampling with 5 noise words
    alpha=0.001,      # initial learning rate
    epochs=100,       # number of passes over the corpus
)

In [23]:
# Number of entries in model.dv; compare with len(texts) to confirm that
# every review received a vector.
len(model.dv)

165384

In [24]:
# Number of review texts kept by the loading step.
len(texts)

165384

In [13]:
# Number of labels; equals len(texts) because both were unzipped from the
# same list of (text, label) pairs.
len(labels)

165384

In [43]:
# Show the first review text (index 0), used as the query document below.
texts[0]

'부산 행 때문 너무 기대하고 봤'

In [45]:
import numpy as np
# Cosine similarity between the document vectors of reviews 89881 and 0,
# computed by hand as dot(u, v) / (|u| * |v|).
vec_a, vec_b = model.dv[89881], model.dv[0]
np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

0.753497

In [46]:
# Print the 10 reviews whose document vectors are most similar to review 0.
# The loop variable is named review_id rather than `id`: at notebook (module)
# scope, `id` would rebind the builtin id() for every later cell.
for review_id, sim in model.dv.most_similar(0, topn=10):
    print('review ID: {}, review: {}, similarity: {}'.format(review_id, texts[review_id], sim))

reivew ID: 89881, review: 호평 때문 기대하고 봤 기대 초과, similarity: 0.7534970641136169
reivew ID: 102986, review: 영 상미 넘 예쁨 애기 때문 빵빵 터졌, similarity: 0.7008088231086731
reivew ID: 161929, review: 화끈 액션 씬 때문 손 땀 나더, similarity: 0.6940357089042664
reivew ID: 10810, review: 시종일관 화려한 스케일 때문 눈 떼지 했, similarity: 0.6846963763237
reivew ID: 106992, review: 오베 남자 때문 울 웃다, similarity: 0.6795190572738647
reivew ID: 14517, review: 도대체 왜 때문, similarity: 0.6789843440055847
reivew ID: 155429, review: 음 부산 행 때문 보시 분 굳이 보실 필요 없 것 같아, similarity: 0.674677848815918
reivew ID: 7252, review: 흥미로운 스토리 때문 눈 뗄 수 없었, similarity: 0.6687542200088501
reivew ID: 45464, review: 지루한 부분 때문 기대 미치, similarity: 0.6607589721679688
reivew ID: 54255, review: 다양한 캐릭터 때문 보는 재미 쏠쏠 했, similarity: 0.6599541306495667


In [33]:
# Collect one document vector per review by looking up each integer tag in
# the trained model; the order matches texts, and therefore labels.
docs_vectors = [model.dv[idx] for idx in range(len(texts))]

In [34]:
# Number of collected vectors; one per entry in texts.
len(docs_vectors)

165384

In [35]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the (vector, label) pairs for testing; the fixed
# random_state makes the split repeatable.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    docs_vectors,
    labels,
    test_size=0.2,
    random_state=0,
)

In [36]:
# Confirm the training features and training labels have the same length.
print(len(train_features), len(train_labels))

132307 132307


In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
# L2-penalised logistic regression using the SAGA solver: fit on the
# training vectors, then predict labels for the held-out vectors.
lr2 = LogisticRegression(penalty='l2', C=1, solver='saga')
pred_labels = lr2.fit(train_features, train_labels).predict(test_features)

In [47]:
from sklearn.metrics import accuracy_score
# Print the accuracy of pred_labels against test_labels, to two decimals.
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Accuracy: 0.79


In [48]:
# L1-penalised logistic regression using the SAGA solver: fit on the
# training vectors, predict the held-out vectors, and report accuracy to
# two decimals.
lr1 = LogisticRegression(penalty='l1', C=1, solver='saga')
pred_labels = lr1.fit(train_features, train_labels).predict(test_features)
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Accuracy: 0.79
