<h1>Data Loading & Preprocessing</h1>

In [1]:
import warnings
# Install a blanket "ignore" filter: from this point on every warning category is
# suppressed. NOTE(review): this also hides deprecation warnings raised by the
# libraries used below, so consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [2]:
import csv

import pandas as pd


def load_tsv(path):
    """Read one tab-separated review file into a DataFrame.

    The first row is used as the header. Quoting is disabled
    (``csv.QUOTE_NONE``), so double-quote characters inside the files are
    kept as literal text instead of being interpreted as field delimiters.

    Parameters
    ----------
    path : str
        Location of the ``.tsv`` file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file, one row per review.
    """
    return pd.read_csv(path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE)


# Labeled training reviews, test reviews, and the extra unlabeled reviews.
train = load_tsv('data/labeledTrainData.tsv')
test = load_tsv('data/testData.tsv')
unlabeled_train = load_tsv('data/unlabeledTrainData.tsv')

# Quick shape check of the three frames (rows, columns).
print(train.shape)
print(test.shape)
print(unlabeled_train.shape)

(25000, 3)
(25000, 2)
(50000, 2)


In [3]:
# Preview the first rows of the labeled training frame.
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [5]:
from preprocessing import preprocessing

# Sanity check: tokenize the first training review and show its first ten tokens.
first_review_words = preprocessing.review_to_wordlist(train['review'][0])
first_review_words[:10]

['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with']

In [6]:
# Build the training corpus: every labeled review is split into sentences and
# all of them are gathered in one flat list. Stop words are not removed.
sentences = []
for review in train['review']:
    sentences.extend(preprocessing.review_to_sentences(review))

In [7]:
# Append the sentences of the unlabeled reviews to the same corpus.
# NOTE: this cell is not idempotent - running it twice appends the same
# sentences again, so re-run the cell that initialises `sentences` first.
for review in unlabeled_train['review']:
    sentences.extend(preprocessing.review_to_sentences(review))

In [8]:
# Total number of sentences collected from the labeled and unlabeled reviews.
len(sentences)

795538

In [9]:
# First ten tokens of the first collected sentence.
sentences[0][:10]

['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with']

<h1>Word2Vec</h1>

In [10]:
# Optional: uncomment to turn on INFO-level logging before training the model.
# import logging
# logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s',
#                    level = logging.INFO)

In [11]:
from gensim.models import word2vec

# Training hyperparameters.
num_features = 300      # dimensionality of each word vector
min_word_count = 40     # words occurring fewer times than this are ignored
num_workers = 4         # number of worker threads used for training
context = 10            # context window size
downsampling = 1e-3     # down-sampling threshold for very frequent words

# NOTE(review): `size` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`); keep it in sync with the installed version.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

In [12]:
model_name = '300features_40minwords_10text'

# Normalise the vectors in place. With replace=True only the normalised
# vectors are kept, which saves memory but means the model cannot be
# trained any further afterwards.
model.init_sims(replace=True)

# Persist the trained model under `model_name`.
model.save(model_name)

<h1>Result</h1>

In [13]:
model.wv.doesnt_match('man woman child kitchen'.split()) # pick the word that fits least well with the others

'kitchen'

In [14]:
model.wv.doesnt_match('france england germany berlin'.split()) # pick the word that fits least well with the others

'berlin'

In [15]:
# Words whose vectors are closest to 'man', with their similarity scores.
model.wv.most_similar('man')

[('woman', 0.6420372128486633),
 ('businessman', 0.5207472443580627),
 ('ladi', 0.5138169527053833),
 ('lad', 0.4942147433757782),
 ('millionair', 0.4918099045753479),
 ('widow', 0.48364362120628357),
 ('policeman', 0.48335811495780945),
 ('loner', 0.4781893789768219),
 ('farmer', 0.47404757142066956),
 ('men', 0.4699292778968811)]

In [16]:
# Words whose vectors are closest to 'queen', with their similarity scores.
model.wv.most_similar('queen')

[('princess', 0.6092647314071655),
 ('goddess', 0.5814031362533569),
 ('latifah', 0.5732750296592712),
 ('eva', 0.5604830384254456),
 ('regina', 0.5493510961532593),
 ('victoria', 0.5479620099067688),
 ('starlet', 0.5477876663208008),
 ('mistress', 0.5441542863845825),
 ('anita', 0.5435322523117065),
 ('angela', 0.5419000387191772)]

In [17]:
# Words whose vectors are closest to 'film', with their similarity scores.
model.wv.most_similar('film')

[('movi', 0.8489617109298706),
 ('flick', 0.6171389818191528),
 ('documentari', 0.5549184083938599),
 ('pictur', 0.544985830783844),
 ('cinema', 0.5181078910827637),
 ('masterpiec', 0.5019605159759521),
 ('sequel', 0.5007549524307251),
 ('it', 0.4955504238605499),
 ('genr', 0.4718480706214905),
 ('thriller', 0.4688575267791748)]

In [18]:
# The vocabulary appears to hold stemmed tokens (e.g. 'movi', 'ladi' in the
# results of earlier queries), so the query uses the stem 'happi' rather than 'happy'.
model.wv.most_similar('happi')

[('unhappi', 0.43892407417297363),
 ('satisfi', 0.41629377007484436),
 ('sad', 0.4074321687221527),
 ('comfort', 0.4009477496147156),
 ('afraid', 0.39662617444992065),
 ('bitter', 0.3897116184234619),
 ('happier', 0.3885878324508667),
 ('proud', 0.3868008852005005),
 ('lucki', 0.3826589584350586),
 ('upset', 0.3727996051311493)]