In [1]:
import pandas as pd
# Read the labelled training reviews and the test reviews. Both files are
# tab-separated with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters in the files are kept as literal text rather than interpreted.
train, test = (
    pd.read_csv(tsv_path, header=0, delimiter='\t', quoting=3)
    for tsv_path in ('data/labeledTrainData.tsv', 'data/testData.tsv')
)
# unlabeled_train = pd.read_csv('data/unlabeledTrainData.tsv', 
#                               header=0, delimiter='\t', quoting=3)

In [2]:
import numpy as np

In [3]:
def makeFeatureVec(words, model, num_features):
    """Return the mean word vector of the in-vocabulary tokens of one document.

    Args:
        words: Iterable of word tokens for a single review/sentence.
        model: Trained gensim model whose word vectors are exposed as
            ``model.wv`` (must support ``word in model.wv`` and
            ``model.wv[word]``).
        num_features: Dimensionality of the word vectors; this is the length
            of the returned vector.

    Returns:
        1-D array of length ``num_features`` holding the element-wise mean of
        the vectors of every token found in the model vocabulary (a repeated
        token is counted each time it occurs). When no token is in the
        vocabulary, an all-zero float32 vector is returned instead of NaNs.
    """
    # Query the keyed vectors directly: membership is a hash lookup, so there
    # is no need to rebuild a set of the whole vocabulary on every call, and
    # it avoids the deprecated model[word] / wv.index2word accessors.
    word_vectors = model.wv
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    # Accumulate the vector of every token the model knows about.
    for word in words:
        if word in word_vectors:
            nwords += 1
            featureVec = np.add(featureVec, word_vectors[word])

    # Guard against 0/0: an empty or fully out-of-vocabulary document would
    # otherwise produce a NaN vector (and a RuntimeWarning).
    if nwords == 0:
        return featureVec

    # Average.
    return np.divide(featureVec, float(nwords))

In [4]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the averaged feature vector of every review as a 2-D array.

    Args:
        reviews: Sequence of reviews, each one a collection of word tokens.
            Must support ``len()``.
        model: Model forwarded unchanged to ``makeFeatureVec``.
        num_features: Length of each averaged vector (number of columns).

    Returns:
        float32 array of shape ``(len(reviews), num_features)`` whose row
        ``i`` is ``makeFeatureVec(reviews[i], model, num_features)``.
    """
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")

    for row, review in enumerate(reviews):
        # Report progress once every 1000 reviews.
        if row % 1000 == 0:
            print("Review %d of %d" % (row, len(reviews)))
        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

In [5]:
from preprocessing import preprocessing
import gensim.models as g

# Path of the previously trained gensim model that is loaded from disk.
model_name = '300features_40minwords_10text'
model = g.Doc2Vec.load(model_name)
# Length of the averaged feature vectors built by getAvgFeatureVecs.
# NOTE(review): presumably this has to equal the model's vector size (the file
# name suggests 300) — confirm against the training notebook.
num_features = 300



In [6]:
# Tokenise every training review and average its word vectors into one row of
# trainDataVecs (shape: number of reviews x num_features).
# NOTE(review): preprocessing.review_to_wordlist is defined outside this
# notebook; assumed to return a list of word tokens — confirm.
%time trainDataVecs = getAvgFeatureVecs([preprocessing.review_to_wordlist(review) for review in train['review']], model, num_features)

Review 0 of 25000


  # This is added back by InteractiveShellApp.init_path()


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Wall time: 2min 5s


In [8]:
# Build the averaged word-vector matrix for the test reviews with the same
# tokeniser, model and feature size that were used for the training reviews.
%time testDataVecs = getAvgFeatureVecs([preprocessing.review_to_wordlist(review) for review in test['review']], model, num_features)

Review 0 of 25000


  # This is added back by InteractiveShellApp.init_path()


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Wall time: 2min 2s


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. n_jobs=-1 lets scikit-learn use all available
# cores; the fixed random_state makes the fitted forest repeatable between runs.
forest = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 2018)

In [11]:
# Fit the forest on the averaged training vectors against the sentiment labels.
%time forest = forest.fit(trainDataVecs, train['sentiment'])

Wall time: 4.99 s


In [12]:
from sklearn.model_selection import cross_val_score
# Mean ROC AUC over 10-fold cross-validation on the training set.
# cross_val_score fits a fresh clone of the estimator for every fold, so the
# fit performed in the previous cell is not reused here.
%time score = np.mean(cross_val_score(forest, trainDataVecs, train['sentiment'], cv=10, scoring = 'roc_auc'))

Wall time: 44.7 s


In [13]:
# Display the mean cross-validated ROC AUC.
score

0.90474156799999983

In [14]:
# Predict a hard class label for every test review.
# NOTE(review): the cross-validation metric above is ROC AUC, while hard labels
# are produced here; if the submission is also scored by AUC,
# forest.predict_proba(testDataVecs)[:, 1] may be the better output — confirm.
result = forest.predict(testDataVecs)

In [15]:
# Pair each test review id with its predicted sentiment and write the result
# as a CSV file. The mean CV score (5 decimals) is embedded in the file name;
# quoting=3 is csv.QUOTE_NONE, so no quote characters are added on write.
output = pd.DataFrame(data = {"id":test["id"], "sentiment":result})
output.to_csv('data/Word2Vec_AverageVectors_{0:.5f}.csv'.format(score), index = False, quoting = 3)

In [16]:
# Number of test reviews predicted for each sentiment label.
output_sentiment = output['sentiment'].value_counts()
# Count of label 0 minus count of label 1; a negative value means more
# reviews were predicted as 1 than as 0.
print(output_sentiment[0] - output_sentiment[1])
output_sentiment

-114


1    12557
0    12443
Name: sentiment, dtype: int64