In [121]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [122]:
# Seed words taken to express positive sentiment. clusters() looks each one up
# in the embedding vocabulary (words that are missing are skipped) and uses
# them to decide which of the two KMeans clusters is the positive one.
# 'awsome' is kept next to the correctly spelled 'awesome' in case the corpus
# contains the misspelling; the former 'flesh' entry was a typo for 'fresh'.
List = [
    'good', 'nice', 'great', 'love', 'like', 'consider', 'awsome', 'awesome',
    'recommend', 'well', 'amaze', 'generous', 'delightful', 'comfortable',
    'absolutely', 'fresh', 'tasty', 'delicious', 'better', 'excellent',
    'wonderful',
]
# Maps a KMeans cluster index (0 or 1) to its polarity (+1 or -1).
# Starts empty and is filled in by clusters(); sentiment_score() reads it.
center_class = {}
def clusters(word_vectors):
    """Fit a two-cluster KMeans on all word vectors and label each cluster's polarity.

    The polarity is decided with the seed words in the module-level ``List``:
    the cluster centre with the smaller summed distance to the seed-word
    vectors is recorded as ``+1`` in the module-level ``center_class`` dict,
    the other one as ``-1``.

    Parameters
    ----------
    word_vectors
        Keyed-vectors object exposing ``.vectors`` (the matrix KMeans is
        fitted on), item lookup by word, and either ``key_to_index``
        (gensim >= 4) or ``vocab`` (gensim 3.x).

    Returns
    -------
    The fitted ``KMeans`` model.

    Raises
    ------
    ValueError
        If none of the seed words in ``List`` is part of the vocabulary, so
        the polarity of the clusters cannot be determined.

    Side effects
    ------------
    Overwrites ``center_class[0]`` and ``center_class[1]``.
    """
    # random_state=1 is the same seed the original bool ``True`` stood for
    # (True == 1 in Python), just spelled as an int.
    model = KMeans(n_clusters=2, max_iter=10000, random_state=1, n_init=100).fit(X=word_vectors.vectors)

    # gensim >= 4 exposes the vocabulary as ``key_to_index``; on those versions
    # ``vocab`` is no longer usable, so only fall back to it when needed.
    vocab = getattr(word_vectors, 'key_to_index', None)
    if vocab is None:
        vocab = word_vectors.vocab

    # dict.fromkeys drops duplicate seed words while keeping their order.
    seed_words = [word for word in dict.fromkeys(List) if word in vocab]
    if not seed_words:
        raise ValueError('none of the seed words in List is in the vocabulary; '
                         'cannot tell which cluster is positive')

    # One batched transform: row i holds the distances from seed word i to
    # cluster centre 0 and cluster centre 1.
    seed_vectors = np.asarray([word_vectors[word] for word in seed_words])
    distance_to_0, distance_to_1 = model.transform(seed_vectors).sum(axis=0)

    # The centre that is closer to the positive seed words overall is positive.
    if distance_to_0 < distance_to_1:
        center_class[0] = 1
        center_class[1] = -1
    else:
        center_class[0] = -1
        center_class[1] = 1

    return model

In [123]:
def sentiment_score(word_vectors, model):
    """Give every vocabulary word a signed sentiment coefficient.

    For each word the coefficient is ``polarity / distance`` where
    ``distance`` is the smallest value ``model.transform`` returns for the
    word's vector and ``polarity`` is ``center_class[cluster]`` for the
    cluster ``model.predict`` assigns to that vector.

    Parameters
    ----------
    word_vectors
        Keyed-vectors object supporting item lookup by word and exposing
        either ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim 3.x).
    model
        Fitted clustering model providing ``predict`` and ``transform``.

    Returns
    -------
    dict
        Maps each vocabulary word to its coefficient as a Python float.
        Empty if the vocabulary is empty.

    Raises
    ------
    KeyError
        If ``center_class`` has no entry for a predicted cluster, e.g. because
        ``clusters()`` has not been run yet.
    """
    # gensim >= 4 exposes the vocabulary as ``key_to_index``; on those versions
    # ``vocab`` is no longer usable, so only fall back to it when needed.
    vocab = getattr(word_vectors, 'key_to_index', None)
    if vocab is None:
        vocab = word_vectors.vocab

    words = list(vocab.keys())
    if not words:
        return {}

    # Look every word up explicitly so that row i of the matrix is guaranteed
    # to belong to words[i], then query the model once for the whole batch
    # instead of once per word.
    matrix = np.asarray([word_vectors[word] for word in words])
    labels = model.predict(matrix)
    polarity = np.array([center_class[label] for label in labels], dtype=float)
    nearest_distance = np.asarray(model.transform(matrix), dtype=float).min(axis=1)

    # A vector lying exactly on a centre has distance 0 and therefore gets an
    # infinite coefficient (numpy emits a RuntimeWarning for the division).
    coefficients = polarity / nearest_distance

    return dict(zip(words, coefficients.tolist()))

In [124]:

def main(model_path='./imdb.d2v', output_path='sentiment_dict.txt'):
    """Build the sentiment dictionary and write it to a text file.

    Loads the saved Word2Vec model, clusters its word vectors with
    ``clusters()``, scores every word with ``sentiment_score()`` and writes
    one ``"<word> <coefficient>"`` line per word.

    Parameters
    ----------
    model_path : str
        Path of the saved Word2Vec model to load.
    output_path : str
        Path of the text file to (over)write.
    """
    word_vectors = Word2Vec.load(model_path).wv
    model = clusters(word_vectors)
    sentiment_dict = sentiment_score(word_vectors, model)

    # ``with`` closes the file even if a write fails; an explicit encoding
    # keeps the output independent of the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for word, coefficient in sentiment_dict.items():
            out_file.write(str(word) + " " + str(coefficient) + '\n')

    print('end')

In [125]:
# Run the whole pipeline: load the word vectors, cluster them, score every
# word and write the result to disk; prints 'end' when finished.
main()

end
