# Applications of word2vec

In [1]:
from gensim.models import word2vec
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import os

## Move up to the main directory so that relative paths such as
## 'word2vec_model/CBOW' resolve from the current working directory.
## The guard keeps this cell idempotent: re-running it does not climb one
## more level each time once the model directory is already visible.
if not os.path.isdir('word2vec_model'):
    os.chdir("../")
os.getcwd()

'/Users/yoga/Documents/GitHub/aiaRNN/class_text_mining'

In [2]:
## Load the saved word2vec model; the path is relative to the current
## working directory.
model_path = 'word2vec_model/CBOW'
model = word2vec.Word2Vec.load(model_path)

## similarity

In [3]:
## List the words most similar to a query word.
## The query keyword can be replaced with any other word.
query = 'KMT'
model.wv.most_similar(positive=[query], topn=10)

[('國民黨', 0.6417558789253235),
 ('DPP', 0.6396889686584473),
 ('kmt', 0.6338934898376465),
 ('dpp', 0.6175625920295715),
 ('民進黨', 0.5718200206756592),
 ('某黨', 0.5689526796340942),
 ('政黨', 0.5646135807037354),
 ('在野黨', 0.564574122428894),
 ('兩黨', 0.5438055992126465),
 ('黨內', 0.5420957803726196)]

In [4]:
## Relationship query: rank words by similarity to a combination in which
## the "positive" words contribute positively and the "negative" words
## contribute negatively.
## The keywords can be replaced with any other words.
positive_terms = ['KMT', '綠吱']
negative_terms = ['DPP']
model.wv.most_similar(positive=positive_terms, negative=negative_terms)

[('異端', 0.38107287883758545),
 ('英雄難過', 0.37355032563209534),
 ('兵家', 0.37187737226486206),
 ('kmter', 0.3690451383590698),
 ('滅族', 0.36561936140060425),
 ('先祖', 0.364986389875412),
 ('漢奸', 0.36482545733451843),
 ('老北', 0.36260485649108887),
 ('搧動', 0.3609076142311096),
 ('明末', 0.3589898645877838)]

## clustering

In [5]:
## Build a dictionary: word -> count stored in the model's vocabulary.
## gensim >= 4.0 removed `wv.vocab`; there the count is read per word with
## `get_vecattr(word, 'count')`. Older gensim keeps the original code path.
wv = model.wv
if hasattr(wv, 'key_to_index'):
    words = {word: wv.get_vecattr(word, 'count') for word in wv.key_to_index}
else:
    words = {word: vocab.count for word, vocab in wv.vocab.items()}

In [6]:
## Rank the (word, count) pairs by count, highest first, and keep the
## first 10000 pairs.
ranked_pairs = sorted(words.items(), key=lambda pair: pair[1], reverse=True)
top_pairs = ranked_pairs[:10000]
## Column 0 of the 2-D array holds the words; the counts are dropped.
words = np.array(top_pairs)[:, 0]
words

array(['人', '八卦', '有沒有', ..., '會辦', '專利', '三百'],
      dtype='<U20')

In [7]:
## Look up the vectors of the selected words in the model.
keyed_vectors = model.wv
vecs = keyed_vectors[words]

In [8]:
## Run k-means on the word vectors and get one cluster id per word.
## random_state is fixed so the cluster assignment is reproducible across
## runs (without it the cluster numbering changes on every execution).
## n_init is passed explicitly because its default differs between
## scikit-learn versions; 10 is the historical default.
kmeans = KMeans(n_clusters=50, n_init=10, random_state=0)
cluster = kmeans.fit_predict(vecs)

In [9]:
## Pair each word with its cluster id and show the first rows.
## Building the frame from a dict of columns (instead of transposing a
## two-row frame) keeps the cluster ids as integers rather than objects.
df = pd.DataFrame({'words': words, 'no. cluster': cluster})
df.head(n=5)

Unnamed: 0,words,no. cluster
0,人,23
1,八卦,46
2,有沒有,46
3,說,23
4,好,23


In [10]:
## Show the words of every cluster side by side: one column per cluster id,
## with each cluster's words listed in the order they appear in df.
## Series.rename(k) sets the Series name, which pd.concat(axis=1) uses as the
## column label. (Series.rename does not take a `columns=` argument.)
data = pd.concat(
    [
        members['words'].reset_index(drop=True).rename(k)
        for k, members in df.groupby('no. cluster')
    ],
    axis=1,
)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,冷氣,電影,後,喝,my,看到,禁止,今天,世界,寫,...,妹妹,神,穿,iPhone,台灣,這種,八卦,買,住,死
1,停電,看過,中,飲料,看板,發現,要求,時間,社會,上面,...,睡,seafood,衣服,之內,中國,這是,有沒有,賣,附近,抓
2,發電,拍,前,咖啡,Gossiping,時,規定,月,生活,照片,...,尻,宗教,黑,被刪,日本,話,from,元,家,砲
3,反核,故事,已經,茶,a,突然,進行,每天,一種,處理,...,睡覺,信,一件,自刪,美國,懂,Sent,便宜,外面,爆
4,台電,好看,年,酒,the,走,單位,一天,認為,找到,...,身體,師父,內褲,兩則,國家,笑,問卦,廣告,我家,一群
5,環保,經典,新,奶茶,of,聽到,人員,小時,重要,資料,...,貓貓,信徒,戴,ASUS,韓國,罵,JPTT,貴,蓋,殺
6,限電,片,發生,一杯,稅後,站,法,幾天,能力,一張,...,天氣,教主,t,Asus,國,相信,請,價格,房間,搶
7,核能,人物,過去,牛奶,I,裡,未,晚上,未來,電話,...,牠,師傅,穿著,HTC,臺灣,英文,請問,一家,房子,喊
8,核電,畫面,當時,冰,by,旁邊,計畫,準備,選擇,提供,...,眼睛,信仰,頭髮,Sony,大陸,意思,卦,服務,間,攻擊
9,缺電,劇情,當初,杯,you,小心,同意,上班,所有,查,...,舒服,妙禪,顏色,Samsung,國際,名字,鄉民,品質,一間,派


In [None]:
## Try tuning the KMeans parameters, e.g. how about splitting the words into 100 clusters?