In [1]:
#DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import jieba

In [3]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models import word2vec

2020-09-12 18:34:09,111 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-12 18:34:09,112 : INFO : built Dictionary(12 unique tokens: ['computer', 'response', 'human', 'eps', 'system']...) from 9 documents (total 29 corpus positions)


In [21]:
#将单句拆分为字的list，并进行了非法字符过滤
def poem_cut(sentence):
    #@sentence：string
    #@return list of words stripped from sentence
    r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@，。?★◎、…【】《》？“”‘’！[\\]^_`{|}~]+'
    r2 = '/[\u4e00-\u9fa5]+'
    #去掉虚词
    stopwords = '矣|哉|虽|既|而|何|乃|乎|其|且|若|所|为|焉|以|因|于|与|也|则|者|之|不|自|得|来|去|无|可|是|已|此|的|上|中|兮|一'
    sentence = re.sub(r1, '', sentence)
    sentence = re.sub(r2, '', sentence)
    
    result = [word.strip('，|）|：|{|}|“|”|.|！|?|。|□|·|（|\n|\r|\n\n|\xa0|\u3000').strip(stopwords) for word in sentence]
    
    return list(filter(None, result))

In [22]:
t = '1.云横秦岭家何在，雪拥蓝关马不前□。'

In [23]:
poem_cut(t)

['云', '横', '秦', '岭', '家', '在', '雪', '拥', '蓝', '关', '马', '前']

In [24]:
def readfile(path):
    #@path: stirng of data file path
    #return list of strings extracted from the txt file
    f = open(path, encoding='utf', errors='ignore')
    lines = f.readlines()
    data = []
    for line in lines:
        #去掉作者
        if re.match(r'【', line) is not None:
            continue
        if re.match(r'^\[', line) is not None:
            continue
        data.append(line) 
    f.close()
    return data    

In [25]:
def data_preprocess(data_raw):
    #@data_raw: list of strings
    #return list of single word extracted from data_raw
    #获得单词的List
    data = []
    for sentence in data_raw:
        result = poem_cut(sentence)
        if result == []:
            continue
        for word in result:
            if word == ' ':
                continue
            data.append(word)
    return data

In [26]:
#jieba分词：输入单句，过滤非法字符
def jieba_clean_text(sentence):
    #@sentence: string
    #return string after being cleaned 
    r1 = '[a-zA-Z0-9’!"#$%&\'（）()*+,-./:;<=>?@，。?★◎、…【】《》？“”‘’！·[\\]^_`{|}~]+'
    r2 = '/[\u4e00-\u9fa5]+'
    #去掉虚词
    sentence = re.sub(r1, '', sentence)
    sentence = re.sub(r2, '', sentence)
    return sentence

In [27]:
def jieba_process(data):
    #@data: list of strings
    #return list of words extraced from strings and being filtered by stopwords
    stopword = '有|如|在|矣|哉|虽|既|而|何|乃|乎|其|且|若|所|为|焉|以|因|于|与|也|则|者|之|不|自|得|来|去|无|可|是|已|此|的|上|中|兮|一'
    result = []
    for sentence in data:
        clean_text = jieba_clean_text(sentence)
        string = ' '.join(jieba.cut(clean_text, cut_all = True))
        for word in string.split():
            word = word.strip('，|）|：|{|}|“|”|.|！|?|。|□|·|（|\n|\r|\n\n|\xa0|\u3000').strip(stopword)
            result.append(word)
    return list(filter(None,result))

In [28]:
def word_frequency(data):
    #@data: list of strings
    #return dict showing how many times a word appears in data
    word_dict={}
    for char in data:
        if char in word_dict:
            word_dict[char] = word_dict[char]+1
        else:
            word_dict[char] = 1
    return word_dict

In [37]:
def freq_sort(word_dict):
    #@word_dict: dict 
    #return list of sorted tuples from word_dict
    result = sorted(word_dict.items(), key=lambda d:d[1], reverse = True)
    return result

In [38]:
#虚词 停用词
# stopwords = '矣|哉|虽|既|而|何|乃|乎|其|且|若|所|为|焉|以|因|于|与|也|则|者|之|不|自|得|来|去|无|可|是|已|此|的|上|中|兮|一'

#文件路径
# path_taihu = "./data/poem_taihu.txt"
# path_huzhou = "./data/poem_huzhou.txt"
# path_moushan = "./data/poem_moushan.txt"
path_full = "./data/poem_full.txt"

In [39]:
# data_taihu = readfile(path_taihu)
# data_huzhou = readfile(path_huzhou)
# # data_moushan = readfile(path_moushan)
data_full = readfile(path_full)

In [40]:
# test1 = data_taihu[0]
# print(poem_cut(test1))
# print(data_huzhou)

In [41]:
# data_hu = data_taihu + data_huzhou
# print(data_hu)

In [42]:
# for i in range(len(data_hu)):
#     if type(data_hu[i]) is not str:
#         print(i)

In [43]:
# type(data_hu[660])

In [44]:
# data_Hu = data_preprocess(data_hu)
data_Full = data_preprocess(data_full)

In [55]:
# print(data_Full)

In [46]:
# dict_freq_hu = word_frequency(data_Hu)
# word_freq_hu = freq_sort(dict_freq_hu)
dict_freq_full = word_frequency(data_Full)
word_freq_full = freq_sort(dict_freq_full)

In [47]:
print(word_freq_full)

[('卷', 45244), ('人', 24361), ('山', 19522), ('日', 17429), ('风', 16067), ('云', 14104), ('有', 13634), ('春', 13168), ('天', 12935), ('白', 12836), ('花', 12240), ('月', 12074), ('时', 11914), ('水', 11724), ('相', 10911), ('长', 10794), ('君', 10160), ('归', 10097), ('年', 10058), ('秋', 9769), ('生', 9742), ('见', 9447), ('行', 9367), ('江', 9164), ('心', 9120), ('夜', 9017), ('知', 8983), ('如', 8821), ('下', 8163), ('李', 8132), ('高', 8008), ('空', 8004), ('清', 7957), ('南', 7745), ('在', 7718), ('明', 7612), ('三', 7582), ('子', 7519), ('门', 7358), ('里', 7240), ('客', 7119), ('未', 7063), ('事', 7040), ('金', 7025), ('处', 6973), ('道', 6961), ('作', 6826), ('送', 6767), ('青', 6738), ('玉', 6717), ('东', 6695), ('歌', 6683), ('多', 6682), ('寒', 6633), ('别', 6621), ('雨', 6557), ('王', 6471), ('落', 6412), ('家', 6391), ('十', 6374), ('居', 6354), ('声', 6347), ('城', 6298), ('远', 6223), ('千', 6186), ('朝', 6183), ('新', 6164), ('出', 6105), ('今', 6079), ('应', 6051), ('前', 6051), ('入', 6033), ('西', 6000), ('书', 5937), ('万', 5894), ('阳',

In [49]:
for pair in word_freq_full:
    if pair[0] in ['红','丹''朱','赤','绛','黑','暗','玄','乌','冥','墨','绿','碧','翠','苍','白','素','皎','皓']:
        print(pair[0] + ': ' + str(pair[1]))

白: 12836
红: 4180
碧: 2848
绿: 2807
翠: 2559
苍: 2120
暗: 1745
玄: 1417
素: 1318
皎: 985
冥: 965
乌: 899
赤: 806
黑: 524
墨: 347
皓: 306
绛: 306


In [101]:
# l = sorted(word_freq_taihu.items(), key=lambda d:d[1], reverse = True)
# print(l)

In [102]:
# test1 = '小明硕士毕业于中国科学院计算所，后在日本京都大学深造'
# se = jieba_clean_text(data_taihu[1])
# seg_list = jieba.cut(se, cut_all = True)
# s = ' '.join(seg_list)
# word = []
# for w in s.split():
#     word.append(w)
# print(word)

In [50]:
# jieba_hu = list(jieba_process(data_hu))
jieba_full = list(jieba_process(data_full))

Building prefix dict from the default dictionary ...
2020-09-12 18:39:31,242 : DEBUG : Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
2020-09-12 18:39:32,403 : DEBUG : Dumping model to file cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.287 seconds.
2020-09-12 18:39:32,532 : DEBUG : Loading model cost 1.287 seconds.
Prefix dict has been built successfully.
2020-09-12 18:39:32,534 : DEBUG : Prefix dict has been built successfully.


In [51]:
# dict_jieba_hu = word_frequency(jieba_hu)
# word_freq_jieba_hu = freq_sort(dict_jieba_hu)
dict_jieba_full = word_frequency(jieba_full)
word_freq_jieba_full = freq_sort(dict_jieba_full)

In [52]:
print(word_freq_jieba_full)

[('卷', 43451), ('人', 10814), ('归', 8306), ('时', 8136), ('见', 7838), ('云', 7832), ('君', 7281), ('日', 6872), ('春', 6741), ('山', 6732), ('月', 6397), ('知', 6163), ('作', 5879), ('风', 5595), ('秋', 5468), ('送', 5465), ('花', 5328), ('多', 5278), ('空', 5179), ('寄', 5164), ('我', 5095), ('客', 5079), ('李', 5062), ('尽', 5051), ('还', 5010), ('生', 4979), ('未', 4922), ('欲', 4832), ('处', 4816), ('长', 4681), ('心', 4561), ('寒', 4536), ('应', 4467), ('行', 4439), ('闻', 4413), ('事', 4404), ('远', 4366), ('入', 4362), ('水', 4345), ('高', 4321), ('和', 4317), ('下', 4315), ('相', 4308), ('夜', 4283), ('将', 4281), ('天', 4269), ('玉', 4261), ('清', 4250), ('谁', 4219), ('更', 4075), ('独', 3967), ('同', 3950), ('年', 3920), ('看', 3912), ('别', 3855), ('从', 3809), ('向', 3800), ('飞', 3799), ('新', 3767), ('思', 3755), ('朝', 3731), ('愁', 3730), ('诗', 3719), ('出', 3700), ('复', 3697), ('道', 3694), ('树', 3687), ('望', 3632), ('闲', 3616), ('犹', 3608), ('成', 3496), ('张', 3485), ('深', 3473), ('似', 3468), ('声', 3458), ('歌', 3446), ('王', 341

In [53]:
#单独统计词语出现频率
two_words = []
for tu in word_freq_jieba_full:
    if len(tu[0]) > 1:
        two_words.append(tu)
print(two_words)

[('居易', 2699), ('白居易', 2696), ('万里', 1486), ('千里', 1332), ('二首', 1255), ('杜甫', 1189), ('春风', 1180), ('今日', 1169), ('白云', 1145), ('长安', 1041), ('李白', 973), ('故人', 944), ('明月', 935), ('二十', 890), ('人间', 889), ('风吹', 848), ('刘禹锡', 788), ('惆怅', 783), ('相思', 778), ('使君', 776), ('秋风', 769), ('少年', 751), ('十二', 743), ('悠悠', 740), ('江南', 708), ('白日', 708), ('青山', 688), ('十年', 682), ('相逢', 644), ('将军', 633), ('三十', 633), ('员外', 633), ('皎然', 631), ('乐章', 614), ('洛阳', 600), ('平生', 599), ('天子', 596), ('孟郊', 596), ('黄金', 595), ('寂寞', 594), ('韩愈', 590), ('天地', 589), ('主人', 589), ('应物', 585), ('春日', 577), ('友人', 570), ('司空', 568), ('陆龟蒙', 566), ('陆龟', 566), ('流水', 564), ('李商隐', 562), ('韦应物', 556), ('曲歌辞', 554), ('杨柳', 552), ('别离', 546), ('回首', 545), ('白发', 543), ('刘长卿', 541), ('洞庭', 537), ('行人', 522), ('今朝', 522), ('相公', 520), ('日暮', 513), ('落日', 511), ('日月', 511), ('杜牧', 506), ('桃花', 501), ('皇甫', 493), ('南山', 491), ('应制', 488), ('天下', 488), ('芳草', 485), ('夕阳', 476), ('相见', 473), ('风雨', 472), ('芙蓉', 

In [142]:
# with open('./data/poem_taihu.txt',encoding='utf-8') as f1:
#     with open('./data/poem_huzhou.txt',encoding='utf-8') as f2:
#         document1 = f1.read()
#         document2 = f2.read()
#         document = document1 + document2
#         document = jieba_clean_text(document)
#         document_cut = jieba.cut(document)
#         result = ' '.join(document_cut)
#         with open('./hu_segment.txt', 'w',encoding="utf-8") as f3:
#             f3.write(result)

In [54]:
single_word = ''
for sentence in data_full:
    stopword = '[有如在矣哉虽既而何乃乎其且若所为焉以因于与也则者之不自得来去无可是已此的上中兮一二三四五六七八九十]'
    sentence = re.sub(stopword, '', sentence)
    clean_text = jieba_clean_text(sentence)
    single_word += ' '.join(clean_text)
with open('./full_segment.txt', 'w', encoding="utf-8") as f:
    f.write(single_word)

In [180]:
print(single_word)

浣 溪 沙 
乐 烟 波 钓 闲 草 堂 松 桂 胜 攀 梢 梢 新 月 几 回 弯 碧 太 湖 万 顷 屹 然 相 对 洞 庭 山 况 风 浪 起 须 还 

水 调 歌 头 舣 棹 太 湖 岸 
舣 棹 太 湖 岸 天 水 相 连 垂 虹 亭 年 到 故 依 然 洗 我 征 尘 斗 快 揖 商 飚 千 里 鸥 鹭 亦 翩 翩 身 水 晶 阙 真 作 驭 风 仙 
望 秋 日 月 还 圆 倚 栏 清 啸 孤 发 惊 起 壑 龙 眠 欲 酹 鸱 夷 西 子 未 办 当 年 功 业 空 击 湖 船 用 知 余 事 莼 鲈 正 芳 鲜 

偈 颂 百 零 首 
西 风 簇 浪 花 太 湖 连 底 冻 冷 照 玉 奁 清 片 瑕 缝 
面 目 分 明 眼 睛 定 动 墯 虚 凝 裂 万 差 漆 桶 漆 桶 

湖 州 歌 首   
太 湖 风 卷 浪 头 高 锦 柁 摇 摇 坐 牢 
靠 著 篷 窗 垂 两 目 船 头 船 尾 烂 弓 刀 
山 居 杂 诗 首 
英 石 多 斧 凿 司 石 殊 怯 脆 拳 然 太 湖 出 始 岩 壑 意 
弹 窝 数 峰 绿 欲 仇 池 对 永 念 灵 壁 巧 嵌 空 劳 梦 寐 

点 绛 唇 莫 倚 高 楼 
莫 倚 高 楼 太 湖 西 畔 青 山 近 雁 边 云 暝 目 力 随 天 尽 落 日 平 芜 点 点 余 烽 烬 西 风 紧 乱 沙 成 阵 双 蓬 鬓 

江 城 子 春 江 打 头 风 
春 江 打 头 风 吼 层 空 卷 飞 蓬 多 少 云 涛 雪 浪 暮 江 早 客 情 多 感 慨 烟 漠 漠 雨 濛 濛 梁 溪 只 太 湖 东 长 儿 童 学 庞 翁 谁 信 家 书 月 曾 通 见 说 浙 河 金 鼓 震 日 到 羡 归 鸿 

浪 淘 沙 歌 阕 斗 清 新 
歌 阕 斗 清 新 檀 板 初 匀 画 堂 新 筑 太 湖 滨 好 黄 花 开 应 候 聊 宴 亲 宾 客 即 逢 辰 况 青 春 林 开 宴 锡 尧 尊 今 夜 素 娥 真 解 事 偏 向 人 明 


偈 颂 百 首 
年 年 动 乾 坤 岁 旧 岁 新 新 旧 般 时 节 
苍 卞 山 风 骨 露 太 湖 水 彻 底 波 冷 冷 落 落 哆 哆 和 和 
门 外 雪 消 春 水 滑 庭 前 玉 立 桂 婆 娑 

偈 颂 百 首 
人 打 毡 拍 板 

In [56]:
#加载语料
sentences = word2vec.LineSentence('./full_segment.txt')

In [57]:

#训练语料
path = get_tmpfile("word2vec_full.model") #创建临时文件
model = word2vec.Word2Vec(sentences, hs=1,min_count=2,window=5,size=100)
# model.save("word2vec.model")
# model = Word2Vec.load("word2vec.model")


2020-09-12 18:43:12,528 : INFO : collecting all words and their counts
2020-09-12 18:43:12,541 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-12 18:43:12,609 : INFO : PROGRESS: at sentence #10000, processed 149727 words, keeping 4395 word types
2020-09-12 18:43:12,678 : INFO : PROGRESS: at sentence #20000, processed 309873 words, keeping 5022 word types
2020-09-12 18:43:12,754 : INFO : PROGRESS: at sentence #30000, processed 467044 words, keeping 5354 word types
2020-09-12 18:43:12,831 : INFO : PROGRESS: at sentence #40000, processed 623946 words, keeping 5524 word types
2020-09-12 18:43:12,898 : INFO : PROGRESS: at sentence #50000, processed 784461 words, keeping 5826 word types
2020-09-12 18:43:12,962 : INFO : PROGRESS: at sentence #60000, processed 937029 words, keeping 5942 word types
2020-09-12 18:43:13,027 : INFO : PROGRESS: at sentence #70000, processed 1089981 words, keeping 6240 word types
2020-09-12 18:43:13,088 : INFO : PROGRESS: at senten

In [64]:
for key in model.wv.similar_by_word('欢', topn =20):
    print(key)

('杯', 0.49271366000175476)
('赏', 0.46698978543281555)
('朋', 0.4548138976097107)
('酺', 0.43664655089378357)
('酣', 0.43172651529312134)
('觞', 0.43096309900283813)
('衎', 0.43088775873184204)
('暌', 0.4266669750213623)
('醺', 0.42573082447052)
('情', 0.42114078998565674)
('醒', 0.4159899950027466)
('忻', 0.4045770764350891)
('筵', 0.39687591791152954)
('兹', 0.39510977268218994)
('娱', 0.39131465554237366)
('戚', 0.3837992548942566)
('时', 0.3796413242816925)
('酌', 0.37621524930000305)
('僚', 0.3732311725616455)
('交', 0.3624686896800995)
