In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

from bs4 import BeautifulSoup
import re
import time
from nltk.corpus import stopwords
import nltk.data
# import matplotlib.pyplot as plt
# import seaborn as sns

# %matplotlib inline



In [2]:
# Load a previously trained Word2Vec model from disk. The file name suggests
# 300 features / min word count 40 / context 10 — TODO confirm against the
# training notebook. The bare `model` expression displays the object's repr.
model = Word2Vec.load('300features_40minwords_10text')
model

<gensim.models.word2vec.Word2Vec at 0x2a6ebb7b5f8>

In [3]:
# The Word2Vec model holds one feature vector per vocabulary word, stored as a
# NumPy array with one row per word in the model's vocabulary.
# Read it through `wv.vectors`: the older `wv.syn0` alias is deprecated and
# emits a DeprecationWarning on every access.
type(model.wv.vectors)


  


numpy.ndarray

In [4]:
# Rows = number of words in the model vocabulary, columns = feature-vector size.
# `wv.vectors` is the non-deprecated name for what used to be `wv.syn0`.
model.wv.vectors.shape

  """Entry point for launching an IPython kernel.


(11986, 300)

In [5]:
model.wv['flower'].shape # look up the vector of a single word by indexing `wv` with the word

(300,)

In [6]:
# Show the first 10 components of the vector stored for the word 'flower'.
model.wv['flower'][:10]

array([-0.06458191, -0.15343866, -0.01922338, -0.07924391,  0.07760002,
       -0.0630917 ,  0.09446166,  0.02681448, -0.02802669, -0.00462039], dtype=float32)

<h1>K-Means</h1>

In [7]:
start = time.time()

# Feature vectors of the vocabulary, one row per word. `wv.vectors` is the
# non-deprecated replacement for the `wv.syn0` alias.
word_vectors = model.wv.vectors
# Set k to one fifth of the vocabulary size, i.e. about 5 words per cluster on
# average. Floor division keeps the result an int without a float round-trip.
num_clusters = word_vectors.shape[0] // 5

# K-Means initialisation is random; fixing the seed makes the cluster ids (and
# every feature derived from them further down) reproducible across re-runs.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=42)
idx = kmeans_clustering.fit_predict(word_vectors)

end = time.time()
elapsed = end - start
print('time taken for k-means clustering: ', elapsed, 's')

  This is separate from the ipykernel package so we can avoid doing imports until


time taken for k-means clustering:  185.98822832107544 s


In [8]:
# Build a word -> cluster-id dictionary so every vocabulary word maps to the
# K-Means cluster it was assigned to.
idx = list(idx)
# NOTE(review): gensim 4 renames `index2word` to `index_to_key` — confirm the
# installed gensim version before upgrading.
names = model.wv.index2word
word_centroid_map = dict(zip(names, idx))

# Print the members of the first 10 clusters to check that similar words were
# grouped together.
for cluster in range(10):
    print("\nCluster {}".format(cluster))

    # One pass over the mapping per cluster; the dict views are iterated
    # directly instead of being copied into fresh lists for every word.
    words = [word for word, cluster_id in word_centroid_map.items()
             if cluster_id == cluster]
    print(words)


Cluster 0
['manuscript', 'journal', 'letter']

Cluster 1
['scruffi']

Cluster 2
['theo', 'malik', 'trey', 'fenton', 'platt', 'ditto']

Cluster 3
['tent', 'hut']

Cluster 4
['grit', 'testosteron', 'fill', 'brim', 'crackl']

Cluster 5
['serv', 'reinforc', 'apathi']

Cluster 6
['repetiti', 'repetit', 'occasion', 'overus', 'tire', 'jar', 'tiresom']

Cluster 7
['elisabeth', 'linda', 'fletcher', 'paquin', 'melissa', 'deneuv', 'bacal', 'nanci', 'kristin', 'sue', 'paig', 'kati', 'vicki', 'gina', 'selma', 'catherin', 'natasha', 'bateman', 'gwen', 'lisa', 'dian', 'bianca', 'sarandon', 'katherin', 'nicol', 'julia', 'becki', 'samantha', 'ami', 'stile', 'cathi', 'nina', 'hannah', 'fari', 'jill', 'susi', 'beth', 'mandi', 'debbi', 'gyllenha', 'drew', 'tammi', 'friel', 'melani', 'tina', 'lindsey', 'sara', 'morton', 'kudrow', 'hershey', 'megan', 'fiorentino', 'donna', 'zoe', 'daryl', 'lauren', 'christin', 'amanda', 'chelsea', 'bennet', 'dane', 'dickinson', 'karen', 'tilli', 'jenni', 'gail', 'heather']

In [9]:
# Read the labelled training set and the unlabelled test set. Both files are
# tab-separated with a header row; quoting=3 (csv.QUOTE_NONE) keeps quote
# characters inside the review text as literal characters.
train = pd.read_csv('data/labeledTrainData.tsv', sep='\t', header=0, quoting=3)
test = pd.read_csv('data/testData.tsv', sep='\t', header=0, quoting=3)

In [10]:
from preprocessing import preprocessing

# Convert each raw training review into a word list via the project helper
# `review_to_wordlist` (defined outside this notebook), with
# remove_stopwords=True.
clean_train_reviews = [
    preprocessing.review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in train['review']
]

In [11]:
# Apply the same `review_to_wordlist` conversion (remove_stopwords=True) to
# every raw test review.
clean_test_reviews = [
    preprocessing.review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in test['review']
]

In [12]:
# Pre-allocate the bag-of-centroids matrix for the training set:
# one row per review, one column per K-Means cluster, all counts starting at 0.
train_shape = (len(train['review']), num_clusters)
train_centroids = np.zeros(train_shape, dtype='float32')
train_centroids[:5]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [13]:
# A "bag of centroids" is the cluster-level analogue of a bag of words: instead
# of counting individual words, count how many words fall into each cluster.
def create_bag_of_centroids(wordlist, word_centroid_map, num_centroids=None):
    """Count how many words of ``wordlist`` belong to each cluster.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of a single document. Words missing from
        ``word_centroid_map`` are ignored.
    word_centroid_map : dict
        Maps a word to its 0-based integer cluster index.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is derived from the map
        as ``max(cluster index) + 1`` (``0`` for an empty map). Pass the real
        number of clusters to skip that scan on every call and to guarantee a
        fixed width even if the highest-numbered clusters have no words in the
        map.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array of length ``num_centroids``; element ``k`` is the
        number of words in ``wordlist`` assigned to cluster ``k``.

    Raises
    ------
    IndexError
        If ``num_centroids`` is given and is not larger than a cluster index
        that occurs for a word in ``wordlist``.
    """
    if num_centroids is None:
        # default=-1 turns an empty map into a zero-length vector instead of
        # letting max() raise ValueError.
        num_centroids = max(word_centroid_map.values(), default=-1) + 1

    bag_of_centroids = np.zeros(num_centroids, dtype='float32')

    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1

    return bag_of_centroids

In [14]:
# Fill row i of the training matrix with the per-cluster word counts of the
# i-th cleaned training review.
for row, tokens in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(tokens, word_centroid_map)

In [20]:
# Same bag-of-centroids layout for the test set: one row per review, one column
# per cluster.
test_shape = (len(test['review']), num_clusters)
test_centroids = np.zeros(test_shape, dtype='float32')

# Fill row i with the per-cluster word counts of the i-th cleaned test review.
for row, tokens in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(tokens, word_centroid_map)

In [21]:
forest = RandomForestClassifier(n_estimators = 100)
print("Fitting a random-forest to labeled training data...")
%time forest = forest.fit(train_centroids, train['sentiment'])

Fitting a random-forest to labeled training data...
Wall time: 31.8 s


In [22]:
from sklearn.model_selection import cross_val_score
%time score = np.mean(cross_val_score(forest, train_centroids, train['sentiment'], cv=10, scoring='roc_auc'))

Wall time: 4min 36s


In [23]:
# Predict a label for every test review with the fitted forest; %time reports
# how long the prediction takes.
# NOTE(review): cross-validation in this notebook scores ROC AUC, while
# `predict` returns hard class labels — if the submission is AUC-scored,
# `predict_proba` may be the better output; confirm.
%time result = forest.predict(test_centroids)

Wall time: 1.26 s


In [24]:
# Display the mean cross-validated ROC AUC computed in the cross_val_score cell.
score

0.91289043199999986

In [25]:
# Pair each test id with its predicted sentiment and write the submission file.
# The cross-validation score (5 decimals) is embedded in the file name.
submission_path = "data/submit_BagOfCentroids_{0:.5f}.csv".format(score)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv(submission_path, index=False, quoting=3)

In [26]:
# Count how many test reviews received each predicted label, print how many
# more were labelled 0 than labelled 1, then display the counts themselves.
output_sentiment = output['sentiment'].value_counts()
label_gap = output_sentiment[0] - output_sentiment[1]
print(label_gap)
output_sentiment

666


0    12833
1    12167
Name: sentiment, dtype: int64