# 有望客字詞關聯圖 - Word2Vec

In [100]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from gensim import models
import ast
from collections import Counter
from pyvis.network import Network

In [101]:
df = pd.read_csv('./data/sentiment/nissan_clean_data.csv')
df = df[['system_id', 'words']]

In [102]:
df['words'] = df['words'].apply(lambda x: ast.literal_eval(x))
df['words'] = df['words'].apply(lambda x: ' '.join(x))
df

Unnamed: 0,system_id,words
0,1,新聞 小休 熱鬧 好玩 PEUGEOT300815LBlueH
1,2,情報 2020 11 月份 臺灣汽車 市場 銷售 報告 新增 小七車
2,3,新聞 豐田 PremioAllionPriusAlpha 明年 停產 標題 房車 轎式 MP...
3,4,缺點 選車 重點 new juke 試駕 影片 慢慢 釋出 8891 影片 結尾 依舊 提出
4,5,菜單 NissanAllNEWSentra 尊爵型 Nissan2020All New Se...
...,...,...
2074,2459,購車 休旅車 HondaToyotaLexusNissan SUV 1800cc 以內 20...
2075,2461,售車 Nissan2013BIGTIIDA5 NISSAN 20131 BIG TIIDA ...
2076,2462,售車 Nissankicks 智行 旗艦版 2019 2019 06 1498 顏色 排檔 ...
2077,2463,售車 自售 NISSANTEANAJ31200520 2005 06 J31 TEANA 顏...


In [103]:
df.to_csv('./data/word2vec/nissan/nissan_w2v.txt', sep='\t', index=False, header=True, encoding='utf-8')


### 訓練模型

In [104]:
# 讀取已斷詞的資料集
sentences = []
with open('./data/word2vec/nissan/nissan_w2v.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # 假設每行是以空白分隔的詞彙
        words = line.strip().split()
        sentences.append(words)
model = word2vec.Word2Vec(sentences, sg=1, window=6, min_count=30, workers=4, vector_size=250)

# 保存模型，供日後使用
model.save("./data/word2vec/nissan/nissan_word2vec.model")

查看相關詞彙

In [105]:
# 可以使用model.wv.most_similar()來查詢相似詞彙
similar_words = model.wv.most_similar('altis', topn=10)
for word, similarity in similar_words:
    print(word, similarity)

FOCUS 0.8952024579048157
camry 0.8786022067070007
不輸 0.8738648295402527
買馬 0.8736023306846619
yaris 0.8659805059432983
這代 0.8627099394798279
焦點 0.8607915639877319
lancer 0.8596556782722473
唯一 0.8513107299804688
結案 0.8478981852531433


### 依據資料集的詞頻去找相關字詞

詞頻計算，只保留`前100個`最相關的字詞

In [106]:
# 將所有的詞彙合併為一個大字串
all_words = ' '.join(df['words'])

In [107]:
# 使用Counter計算詞頻
word_counts = Counter(all_words.split())

count_df = pd.DataFrame.from_dict(word_counts, orient='index').reset_index()
count_df.columns = ['word', 'freq']
count_df.sort_values(ascending=False, by='freq',inplace=True)

In [108]:
count_df = count_df.head(100)

count_df

Unnamed: 0,word,freq
474,問題,1497
317,空間,1019
312,價格,851
97,業務,812
61,原廠,776
...,...,...
276,內裝,246
921,歐洲,245
3325,朋友,242
1162,外觀,240


建立字詞correlation

In [109]:
df_most_correlation = pd.DataFrame(columns=['item1', 'item2', 'correlation'])

定義相關字詞函式

In [110]:
def get_top_similar_word(word, n):
    similar_words = model.wv.most_similar(word, topn=n)
    return similar_words

In [111]:
for index, row in count_df.iterrows():
    try:
        topn = get_top_similar_word(row['word'], 10)
    except:
        print(row['word'])
        continue
    for item in topn:
        tmp = {'item1': row['word'], 'item2': item[0], 'correlation': str(item[1])}
        df_most_correlation = pd.concat([df_most_correlation, pd.DataFrame([tmp])], ignore_index=True)


In [112]:
df_most_correlation

Unnamed: 0,item1,item2,correlation
0,問題,解決,0.7881562113761902
1,問題,類似,0.7175934910774231
2,問題,還沒,0.7058594822883606
3,問題,車有,0.6784587502479553
4,問題,經驗,0.6759433746337891
...,...,...,...
995,希望,板友,0.7746965289115906
996,希望,給我,0.7688392400741577
997,希望,附上,0.7683296203613281
998,希望,禮拜,0.7641984224319458


In [113]:
df_most_correlation.to_csv('./data/word2vec/nissan/nissan_correlation.csv', encoding='utf-8')

### 視覺化呈現

In [136]:
network = pd.read_csv('./data/word2vec/nissan/nissan_correlation.csv')

In [141]:
network

Unnamed: 0.1,Unnamed: 0,item1,item2,correlation
0,0,問題,解決,0.788156
1,1,問題,類似,0.717593
2,2,問題,還沒,0.705859
3,3,問題,車有,0.678459
4,4,問題,經驗,0.675943
...,...,...,...,...
995,995,希望,板友,0.774697
996,996,希望,給我,0.768839
997,997,希望,附上,0.768330
998,998,希望,禮拜,0.764198


In [138]:
type(network.loc[1,'correlation'])

numpy.float64

In [150]:
network[(network['correlation'] > 0.71) & (network['correlation'] < 0.72)]

Unnamed: 0.1,Unnamed: 0,item1,item2,correlation
1,1,問題,類似,0.717593
217,217,福特,日系,0.719341
218,218,福特,崩潰,0.712695
232,232,國產,2011,0.719264
233,233,國產,進口車,0.717419
234,234,國產,中型,0.710311
278,278,日產,2005,0.718606
279,279,日產,2007,0.711088
369,369,需要,決定,0.717355
424,424,Re,發表,0.713933


In [152]:
network2 = network[(network['correlation'] > 0.55) & (network['correlation'] < 0.74)]
# network2 = network2.reset_index(drop=True)
network2

Unnamed: 0.1,Unnamed: 0,item1,item2,correlation
1,1,問題,類似,0.717593
2,2,問題,還沒,0.705859
3,3,問題,車有,0.678459
4,4,問題,經驗,0.675943
5,5,問題,無關,0.673555
...,...,...,...,...
859,859,推薦,有車,0.739394
936,936,心得,選車,0.708780
937,937,心得,昨天,0.706544
938,938,心得,文章,0.697163


#### Pyvis網路圖

In [116]:
Cor_Graph = {}
correlation_net = Network(height='550px',width="100%")
nid=1

In [117]:
# 加入 node
for i in network['item1']:
    if (i not in Cor_Graph.keys()): 
        correlation_net.add_node(n_id=nid, 
                            label=i, 
                            )
        Cor_Graph[i] = nid
        nid += 1

for i in network['item2']:
        if (i not in Cor_Graph.keys()): 
            correlation_net.add_node(n_id=nid, 
                                label=i, 
                                )
            Cor_Graph[i] = nid
            nid += 1

# 加入 edge
for i, row in network.iterrows():
    # print(row['item1'])
    correlation_net.add_edge(Cor_Graph[row['item1']], Cor_Graph[row['item2']], weight=row['correlation'], title=row['correlation'], value=row['correlation'])


In [118]:
correlation_net.save_graph('./html_files/nissan_w2v.html')