In [1]:
# -*- coding: UTF-8 -*-
from ckiptagger import WS
import pandas as pd
import pickle
from gensim.models.word2vec import Word2Vec

In [2]:
# Directory passed to the CKIP WS constructor below.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
modelPath = 'C:/Users/wmmkslab/Desktop/Word2Vec/data'
# WS (word segmentation)
ws = WS(modelPath)
# Excel file read by the (currently commented-out) pd.read_excel call in the main cell.
newsPath = 'udnNews/news.xlsx'

In [3]:
# remove '\n', '\r'
def cleanNews(df):
    """Strip line-break characters from every article in ``df['Content']``.

    The frame is modified in place: the 'Content' column is replaced by a
    cleaned copy with all '\\n' and '\\r' characters removed. Empty strings
    and the paywall placeholder '會員專屬內容' contain no line breaks, so they
    pass through unchanged, exactly as when they were skipped explicitly.

    Args:
        df: DataFrame with a 'Content' column of article texts. Any number
            of rows and any index are accepted.

    Returns:
        None. ``df`` is mutated.
    """
    # Process every row rather than a hard-coded first 1000: that bound raised
    # KeyError on shorter frames / non-default indexes and left later rows dirty.
    # Non-string cells (e.g. NaN) are left untouched instead of crashing.
    cleaned = [
        text.replace('\n', '').replace('\r', '') if isinstance(text, str) else text
        for text in df['Content']
    ]
    # Assign the whole column at once; avoids chained-indexing writes
    # (df['Content'][i] = ...), which pandas may apply to a temporary copy.
    df['Content'] = cleaned

In [4]:
def wsToFile(df):
    """Segment every non-empty article and cache the result as a pickle.

    Args:
        df: DataFrame with a 'Content' column of article texts.

    Side effects:
        Calls the module-level ``ws`` segmenter and writes its return value
        to 'input/ws.pkl' (the 'input' directory must already exist).
    """
    # Collect the non-empty texts in one pass. The previous row-by-row
    # DataFrame.append was quadratic and no longer exists in pandas >= 2.0.
    contents = [content for content in df['Content'] if content != '']
    # WS: split each text at the listed delimiters before segmenting words.
    word_s = ws(contents,
                sentence_segmentation=True,
                segment_delimiter_set={'?', '？', '!', '！', '。', ',',
                                       '，', ';', ':', '、', '／'})
    with open('input/ws.pkl', 'wb') as f:
        pickle.dump(word_s, f)

In [5]:
def readToDf():
    """Load the cached segmentation result from 'input/ws.pkl'.

    Returns:
        The object that was pickled into that file, unchanged.
    """
    # NOTE: pickle.load executes arbitrary code; only load files you produced.
    with open('input/ws.pkl', 'rb') as pkl_file:
        return pickle.load(pkl_file)

In [6]:
if __name__ == '__main__':
    # One-off preprocessing, left disabled: read the news spreadsheet, strip
    # line breaks, then segment and cache the result to 'input/ws.pkl'.
    # Re-enable these three lines to rebuild the cache that readToDf() loads.
    #newsDF = pd.read_excel(newsPath, usecols='B:F', keep_default_na=False)
    #cleanNews(newsDF)
    #wsToFile(newsDF)
    # Load the previously cached segmentation output.
    wsDf = readToDf()

In [7]:
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.

# Baseline: 100-dimensional vectors trained with CBOW (sg=0).
model = Word2Vec(wsDf, size=100, min_count=1, sg=0, iter=5)

# size: dimensionality of the word vectors. Too few dimensions cannot express
# the relations between words well; too many make those relations so sparse
# that patterns become hard to find. (Word-vector sizes commonly used in
# Kaggle competitions fall between 200 and 300.)
model_d250 = Word2Vec(wsDf, size=250, min_count=1, sg=0, iter=5)

# sg: sg=0 trains with CBOW, sg=1 trains with Skip-gram.
# The goal of this assignment is to predict the topic category of an article.
# Some words may appear only under specific topics, so the corpus is likely to
# contain many low-frequency words, and Skip-gram usually trains
# low-frequency words better than CBOW does.
model_d250_sg = Word2Vec(wsDf, size=250, min_count=1, sg=1, iter=5)

In [8]:
def most_similar(w2v_model, words, topn=5):
    """Build a side-by-side table of the nearest neighbours of each word.

    Args:
        w2v_model: object exposing ``wv.most_similar(word, topn=...)`` that
            returns ``(neighbour, similarity)`` pairs.
        words: iterable of query words.
        topn: number of neighbours to list per word.

    Returns:
        DataFrame with two columns per found word: one named after the word
        (its neighbours) and one named 'cos' (the similarity scores). Words
        missing from the vocabulary are reported on stdout and skipped. An
        empty DataFrame is returned when no word is found.
    """
    frames = []
    for word in words:
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; a bare
            # `except:` would also hide real bugs and KeyboardInterrupt.
            print(word, "not found in Word2Vec model!")
            continue
        frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))
    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing table per word.
    return pd.concat(frames, axis=1)

In [9]:
# Nearest neighbours under the baseline model (size=100, sg=0 / CBOW).
most_similar(model, ['會員', '專屬', '內容', '議員', '市長', '美國'])

Unnamed: 0,會員,cos,專屬,cos.1,內容,cos.2,議員,cos.3,市長,cos.4,美國,cos.5
0,委託,0.995864,會員,0.994811,邀請,0.998056,獵人,0.999609,協商,0.999161,中國,0.998087
1,內容,0.995554,訂閱,0.99158,昨,0.998026,事業,0.999574,取消,0.999137,湖人,0.996798
2,派出所,0.99554,清理,0.991257,計畫,0.997998,市民,0.9994,員警,0.998854,火燙,0.996095
3,中壢,0.995523,變焦,0.991051,論文,0.997973,國會,0.999303,提出,0.998786,受,0.995877
4,相機,0.995497,大隊,0.99099,線上,0.997946,感受,0.999271,報告,0.99874,球速,0.995804


In [10]:
# Same query words under the 250-dimensional CBOW model (size=250, sg=0).
most_similar(model_d250, ['會員', '專屬', '內容', '議員', '市長', '美國'])

Unnamed: 0,會員,cos,專屬,cos.1,內容,cos.2,議員,cos.3,市長,cos.4,美國,cos.5
0,專屬,0.998022,會員,0.998022,建商,0.999022,動物,0.999678,各界,0.999573,首先,0.998973
1,股份,0.997775,光學,0.996661,小組,0.99897,品質,0.999594,先生,0.999557,之間,0.998422
2,財政部,0.997534,變焦,0.996651,隨著,0.998952,綠,0.999592,報告,0.999507,湖人,0.998317
3,今,0.997394,院區,0.996629,農委會,0.998927,立委,0.999585,聲稱,0.99949,兒子,0.997843
4,饗宴,0.99738,水湳,0.996546,呼應,0.9989,酒,0.999561,疫苗,0.999475,球隊,0.997823


In [11]:
# Same query words under the 250-dimensional Skip-gram model (size=250, sg=1).
most_similar(model_d250_sg, ['會員', '專屬', '內容', '議員', '市長', '美國'])

Unnamed: 0,會員,cos,專屬,cos.1,內容,cos.2,議員,cos.3,市長,cos.4,美國,cos.5
0,專屬,0.982686,會員,0.982686,會員,0.901676,立委,0.981788,立委,0.932139,病毒,0.923523
1,內容,0.901676,電路,0.874152,專屬,0.869755,國民黨,0.976907,議員,0.927897,貿易,0.917043
2,電路,0.862454,內容,0.869755,透過,0.844781,國民黨團,0.954342,國民黨,0.9238,中國,0.913348
3,設備,0.859433,衛星,0.856202,生物,0.843258,法務部,0.936876,出席,0.914544,民主黨,0.910769
4,利用,0.859261,學系,0.851567,維護,0.829903,總召,0.935931,侯友宜,0.911235,冠肺炎,0.899109
