<a href="https://colab.research.google.com/github/zera888/tibaml1027/blob/main/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 為了美觀 我把Future Warning關閉
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Unpack the zip archive into the ./data folder (skipped when ./data already exists).
import zipfile
import os
if not os.path.exists("data"):
    # The context manager closes the archive even if extraction fails part-way,
    # so the file handle is never leaked.
    with zipfile.ZipFile("./pttsmalldata.zip") as f:
        f.extractall("./data")

In [None]:
import glob
import json
import pandas as pd
# Collect every *.json post file.
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting makes
# the row order of `df` -- and therefore any later "first N posts" selection --
# reproducible from one machine/run to the next.
fs = sorted(glob.glob("./data/pttsmalldata/*.json"))
# Only the body, title and post type (e.g. "問卦") are kept;
# in practice only the body is used for training.
contents = []
titles = []
types = []
for fn in fs:
    with open(fn, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file is already closed here; only the parsed dict is needed.
    contents.append(data["post_content"])
    titles.append(data["post_title"])
    types.append(data["post_type"])
# One row per post, columns in a fixed order.
df = pd.DataFrame({
    "type": types,
    "title": titles,
    "content": contents
}, columns=["type", "title", "content"])
df

In [None]:
# Training is extremely slow, so only the first ten posts are kept for it.
df = df.iloc[:10]

In [None]:
# 使用 jieba 進行分詞
import jieba
import os
from urllib.request import urlretrieve

# Use jieba's large dictionary file, downloading it on first use.
big_dict_path = "dict.txt.big"
if not os.path.exists(big_dict_path):
    print("下載大型字典")
    url = "https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big"
    # Download under a temporary name and rename only after the transfer
    # succeeds. Otherwise an interrupted download would leave a truncated
    # file at big_dict_path that the exists() check above accepts on later runs.
    partial_path = big_dict_path + ".part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, big_dict_path)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up the leftovers of a failed download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
jieba.set_dictionary(big_dict_path)

# Optionally add vocabulary commonly used by PTT users, when the file is present.
ptt_dict_path = "ptt_dic.txt"
if os.path.exists(ptt_dict_path):
    print("載入ptt專用詞典")
    jieba.load_userdict(ptt_dict_path)

In [None]:
# 將標點符號去掉
punct = set(u''':!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐､﹒﹔﹕﹖﹗﹚﹜﹞！），．：；？｜｝︴︶︸︺︼︾﹀﹂﹄﹏､～￠々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖（［｛￡

In [None]:
import re
# Walk through the preprocessing on the first post only.
# Step 1: strip URLs. PTT bodies usually break the line after a link, so the
# line break(s) directly following the URL are removed together with it.
content = re.sub(r'https?:\/\/.*[\r\n]*', '', df.iloc[0]["content"])
# Step 2: segment with jieba and drop every token that is a punctuation mark.
content = " ".join(token for token in jieba.cut(content) if token not in punct)
# Step 3: remove the remaining line-break characters.
content = content.replace("\n", "").replace("\r", "")
content

In [None]:
# Apply the same conversion to every row of the table.
def process(content):
    """Turn one raw post body into a single-space-separated string of jieba tokens.

    Steps: remove URLs (together with the line breaks that directly follow
    them), remove every character contained in ``punct``, segment the rest
    with ``jieba.cut`` and join the tokens with single spaces.

    Args:
        content: Raw post text.

    Returns:
        The tokens joined by exactly one space, with no leading/trailing
        whitespace and no line breaks.
    """
    content = re.sub(r'https?:\/\/.*[\r\n]*', '', content)
    content = "".join(ch for ch in content if ch not in punct)
    joined = " ".join(jieba.cut(content))
    # Whitespace in the input (spaces, line breaks, full-width spaces, ...)
    # ends up between the joining spaces. Merely deleting "\n"/"\r" left runs of
    # consecutive spaces, which inflated space-count based length checks and
    # let other whitespace characters survive as pseudo-words.
    # split() with no argument discards all whitespace, so re-joining yields
    # exactly one space between real tokens.
    return " ".join(joined.split())
content_cut = df["content"].apply(process)
content_cut

In [None]:
import numpy as np
np.random.seed(13)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import gensim

In [None]:
# Hyper-parameters.
# Context window: 2 words before and 2 words after the target.
window_size = 2
# Size of the embedding (reduced dimension) for every word.
dim = 100

# Sentences with fewer than two separating spaces are dropped: they are too
# short for the context window to slide over.
corpus = list(filter(lambda sentence: sentence.count(' ') >= 2, content_cut))
# Map every word to an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
# Number of distinct words, plus one.
V = 1 + len(tokenizer.word_index)
# Total number of word occurrences in the training data.
nb_samples = sum(map(len, corpus))

In [None]:
def generate_data(corpus, window_size, V):
    """Yield one CBOW training sample per word occurrence in ``corpus``.

    Args:
        corpus: Iterable of sentences, each an indexable sequence of word ids.
        window_size: Number of neighbours taken on each side of the target.
        V: Number of classes used for the one-hot encoded target.

    Yields:
        ``(x, y)`` where ``x`` holds the ids of the surrounding words padded
        to ``2 * window_size`` entries via ``sequence.pad_sequences`` and
        ``y`` is the one-hot encoding of the target word via
        ``np_utils.to_categorical``. Each pair describes a single sample.
    """
    # At most window_size words before plus window_size words after.
    maxlen = 2 * window_size
    for words in corpus:
        last = len(words) - 1
        for index, word in enumerate(words):
            # Clamp the window to the sentence boundaries up front instead of
            # filtering out-of-range positions afterwards.
            first_pos = max(index - window_size, 0)
            last_pos = min(index + window_size, last)
            neighbours = [words[pos]
                          for pos in range(first_pos, last_pos + 1)
                          if pos != index]
            # Windows shorter than maxlen (near the sentence edges) are padded with 0.
            x = sequence.pad_sequences([neighbours], maxlen=maxlen)
            # The target word becomes a one-hot vector of width V.
            y = np_utils.to_categorical([word], V)
            yield (x, y)

In [None]:
# CBOW network, declared as one list of layers.
cbow = Sequential([
    # Embedding layer: maps each of the 2 * window_size context ids to a dim-sized vector.
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    # Average the context word vectors into a single dim-sized vector.
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    # Softmax over the V classes to predict the target word.
    Dense(V, activation='softmax'),
])

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy; optimise with Adam.
cbow.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Train for ten epochs.
for epoch in range(10):
    # Every (x, y) pair from generate_data is trained as its own batch;
    # the per-batch losses are added up starting from 0.
    loss = sum(
        (cbow.train_on_batch(x, y)
         for x, y in generate_data(corpus, window_size, V)),
        0.,
    )
    print("-" * 15, "Iteration", epoch, "-" * 15)
    print(loss)

In [None]:
# Prepare the output file for the word vectors.
# The encoding is fixed to UTF-8 because the words written to this file are
# Chinese tokens: the platform default encoding may be unable to encode them
# or may differ from the encoding used when the file is read back.
# The handle is intentionally left open; the vector rows are written and the
# file is closed in the following cell.
f = open('vectors.txt', 'w', encoding='utf-8')
# Header line of the word2vec text format: "<number of words> <vector size>".
f.write('{} {}\n'.format(V-1, dim))

In [None]:
# Write one "<word> <v1> <v2> ..." line per vocabulary word, then close the file.
# try/finally guarantees the handle is closed (and buffered lines flushed)
# even if fetching the weights or writing a row raises.
try:
    # First weight array of the model.
    vectors = cbow.get_weights()[0]
    for word, i in tokenizer.word_index.items():
        # The values of a row must be separated by single spaces (e.g. "2 3 1").
        # join() only accepts strings, so every number is converted with map(str, ...) first.
        str_vec = ' '.join(map(str, list(vectors[i, :])))
        f.write('{} {}\n'.format(word, str_vec))
finally:
    f.close()

In [None]:
# Read the text-format (non-binary) vector file back as gensim KeyedVectors.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    './vectors.txt',
    binary=False,
)

In [None]:
# Words whose learned vectors are closest to "版主".
w2v.most_similar(positive="版主")

In [None]:
# Words whose learned vectors are closest to "台北".
w2v.most_similar(positive="台北")