# 有望客字詞關聯圖 - Word2Vec

In [1]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from gensim import models
import ast
from collections import Counter
from pyvis.network import Network

In [2]:
df = pd.read_csv('./data/sentiment/nissan_clean_data.csv')
df = df[['system_id', 'words']]

In [3]:
df['words'] = df['words'].apply(lambda x: ast.literal_eval(x))
df['words'] = df['words'].apply(lambda x: ' '.join(x))
df

Unnamed: 0,system_id,words
0,1,新聞 小休 熱鬧 好玩 PEUGEOT300815LBlueH
1,2,情報 2020 11 月份 臺灣汽車 市場 銷售 報告 新增 小七車
2,3,新聞 豐田 PremioAllionPriusAlpha 明年 停產 標題 房車 轎式 MP...
3,4,缺點 選車 重點 new juke 試駕 影片 慢慢 釋出 8891 影片 結尾 依舊 提出
4,5,菜單 NissanAllNEWSentra 尊爵型 Nissan2020All New Se...
...,...,...
2074,2459,購車 休旅車 HondaToyotaLexusNissan SUV 1800cc 以內 20...
2075,2461,售車 Nissan2013BIGTIIDA5 NISSAN 20131 BIG TIIDA ...
2076,2462,售車 Nissankicks 智行 旗艦版 2019 2019 06 1498 顏色 排檔 ...
2077,2463,售車 自售 NISSANTEANAJ31200520 2005 06 J31 TEANA 顏...


In [4]:
df.to_csv('./data/word2vec/nissan/nissan_w2v.txt', sep='\t', index=False, header=True, encoding='utf-8')


### 訓練模型

In [5]:
# 讀取已斷詞的資料集
sentences = []
with open('./data/word2vec/nissan/nissan_w2v.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # 假設每行是以空白分隔的詞彙
        words = line.strip().split()
        sentences.append(words)
model = word2vec.Word2Vec(sentences, sg=1, window=6, min_count=30, workers=4, vector_size=250)

# 保存模型，供日後使用
model.save("./data/word2vec/nissan/nissan_word2vec.model")

查看相關詞彙

In [6]:
# 可以使用model.wv.most_similar()來查詢相似詞彙
similar_words = model.wv.most_similar('altis', topn=10)
for word, similarity in similar_words:
    print(word, similarity)

焦點 0.8884727358818054
FOCUS 0.8857243061065674
不輸 0.8750547170639038
這代 0.8691617250442505
哪台 0.8635929226875305
佛心 0.8542863726615906
買馬 0.8442227840423584
camry 0.8440051674842834
結案 0.8421916365623474
沒力 0.8360468745231628


### 依據資料集的詞頻去找相關字詞

詞頻計算，只保留`前100個`最相關的字詞

In [7]:
# 將所有的詞彙合併為一個大字串
all_words = ' '.join(df['words'])

In [8]:
# 使用Counter計算詞頻
word_counts = Counter(all_words.split())

count_df = pd.DataFrame.from_dict(word_counts, orient='index').reset_index()
count_df.columns = ['word', 'freq']
count_df.sort_values(ascending=False, by='freq',inplace=True)

In [9]:
count_df = count_df.head(100)

count_df

Unnamed: 0,word,freq
474,問題,1497
317,空間,1019
312,價格,851
97,業務,812
61,原廠,776
...,...,...
276,內裝,246
921,歐洲,245
3325,朋友,242
1162,外觀,240


建立字詞correlation

In [10]:
df_most_correlation = pd.DataFrame(columns=['item1', 'item2', 'correlation'])

定義相關字詞函式

In [11]:
def get_top_similar_word(word, n):
    similar_words = model.wv.most_similar(word, topn=n)
    return similar_words

In [12]:
for index, row in count_df.iterrows():
    try:
        topn = get_top_similar_word(row['word'], 10)
    except:
        print(row['word'])
        continue
    for item in topn:
        tmp = {'item1': row['word'], 'item2': item[0], 'correlation': str(item[1])}
        df_most_correlation = pd.concat([df_most_correlation, pd.DataFrame([tmp])], ignore_index=True)


In [13]:
df_most_correlation

Unnamed: 0,item1,item2,correlation
0,問題,解決,0.7724317312240601
1,問題,幾天,0.6958567500114441
2,問題,類似,0.6942410469055176
3,問題,機率,0.6894322633743286
4,問題,還沒,0.6886370778083801
...,...,...,...
995,希望,來信,0.7825292348861694
996,希望,附上,0.7821965217590332
997,希望,照片,0.7769221663475037
998,希望,禮拜,0.76861572265625


In [14]:
df_most_correlation.to_csv('./data/word2vec/nissan/nissan_correlation.csv', encoding='utf-8')

### 視覺化呈現

In [37]:
network = pd.read_csv('./data/word2vec/nissan/nissan_correlation.csv')

In [38]:
network

Unnamed: 0.1,Unnamed: 0,item1,item2,correlation
0,0,問題,解決,0.772432
1,1,問題,幾天,0.695857
2,2,問題,類似,0.694241
3,3,問題,機率,0.689432
4,4,問題,還沒,0.688637
...,...,...,...,...
995,995,希望,來信,0.782529
996,996,希望,附上,0.782197
997,997,希望,照片,0.776922
998,998,希望,禮拜,0.768616


In [39]:
type(network.loc[1,'correlation'])

numpy.float64

In [51]:
network2 = network[(network['correlation'] > 0.75) & (network['correlation'] < 0.8)]

In [52]:
# 將所有的詞彙合併為一個大字串
all_nodes = pd.unique(network2[['item1', 'item2']].values.ravel())

len(all_nodes)
# all_nodes

292

#### Pyvis網路圖

In [116]:
Cor_Graph = {}
correlation_net = Network(height='550px',width="100%")
nid=1

In [117]:
# 加入 node
for i in network['item1']:
    if (i not in Cor_Graph.keys()): 
        correlation_net.add_node(n_id=nid, 
                            label=i, 
                            )
        Cor_Graph[i] = nid
        nid += 1

for i in network['item2']:
        if (i not in Cor_Graph.keys()): 
            correlation_net.add_node(n_id=nid, 
                                label=i, 
                                )
            Cor_Graph[i] = nid
            nid += 1

# 加入 edge
for i, row in network.iterrows():
    # print(row['item1'])
    correlation_net.add_edge(Cor_Graph[row['item1']], Cor_Graph[row['item2']], weight=row['correlation'], title=row['correlation'], value=row['correlation'])


In [118]:
correlation_net.save_graph('./html_files/nissan_w2v.html')