In [5]:
#DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [6]:
import jieba

In [87]:
#将单句拆分为字的list，并进行了非法字符过滤
def poem_cut(sentence):
    #@sentence：string
    #@return list of words stripped from sentence
    r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    r2 = '/[\u4e00-\u9fa5]+'
    #去掉虚词
    stopwords = '矣|哉|虽|既|而|何|乃|乎|其|且|若|所|为|焉|以|因|于|与|也|则|者|之|不|自|得|来|去|无|可|是|已|此|的|上|中|兮|一'
    sentence = re.sub(r1, '', sentence)
    sentence = re.sub(r2, '', sentence)
    
    result = [word.strip('，|）|：|{|}|“|”|.|！|?|。|□|·|（|\n|\r|\n\n|\xa0|\u3000').strip(stopwords) for word in sentence]
    
    return list(filter(None, result))

In [8]:
t = '1.云横秦岭家何在，雪拥蓝关马不前□。'

In [9]:
poem_cut(t)

['云', '横', '秦', '岭', '家', '在', '雪', '拥', '蓝', '关', '马', '前']

In [11]:
def readfile(path):
    #@path: stirng of data file path
    #return list of strings extracted from the txt file
    f = open(path, encoding='utf', errors='ignore')
    lines = f.readlines()
    data = []
    for line in lines:
        #去掉作者
        if re.match(r'^\[',line) is not None:
            continue
        data.append(line) 
    f.close()
    return data    

In [12]:
def data_preprocess(data_raw):
    #@data_raw: list of strings
    #return list of single word extracted from data_raw
    #获得单词的List
    data = []
    for sentence in data_raw:
        result = poem_cut(sentence)
        if result == []:
            continue
        for word in result:
            if word == ' ':
                continue
            data.append(word)
    return data

In [88]:
#jieba分词：输入单句，过滤非法字符
def jieba_clean_text(sentence):
    #@sentence: string
    #return string after being cleaned 
    r1 = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！·[\\]^_`{|}~]+'
    r2 = '/[\u4e00-\u9fa5]+'
    #去掉虚词
    sentence = re.sub(r1, '', sentence)
    sentence = re.sub(r2, '', sentence)
    return sentence

In [76]:
def jieba_process(data):
    #@data: list of strings
    #return list of words extraced from strings and being filtered by stopwords
    stopword = '有|如|在|矣|哉|虽|既|而|何|乃|乎|其|且|若|所|为|焉|以|因|于|与|也|则|者|之|不|自|得|来|去|无|可|是|已|此|的|上|中|兮|一'
    result = []
    for sentence in data:
        clean_text = jieba_clean_text(sentence)
        string = ' '.join(jieba.cut(clean_text, cut_all = True))
        for word in string.split():
            word = word.strip('，|）|：|{|}|“|”|.|！|?|。|□|·|（|\n|\r|\n\n|\xa0|\u3000').strip(stopword)
            result.append(word)
    return list(filter(None,result))

In [92]:
def word_frequency(data):
    #@data: list of strings
    #return dict showing how many times a word appears in data
    word_dict={}
    for char in data:
        if char in word_dict:
            word_dict[char] = word_dict[char]+1
        else:
            word_dict[char] = 1
    return word_dict

In [93]:
def freq_sort(word_dict):
    #@word_dict: dict 
    #return list of sorted tuples from word_dict
    result = sorted(word_dict.items(), key=lambda d:d[1], reverse = True)
    return result

In [94]:
#虚词 停用词
# stopwords = '矣|哉|虽|既|而|何|乃|乎|其|且|若|所|为|焉|以|因|于|与|也|则|者|之|不|自|得|来|去|无|可|是|已|此|的|上|中|兮|一'

#文件路径
path_taihu = "./data/poem_taihu.txt"
path_huzhou = "./data/poem_huzhou.txt"
path_moushan = "./data/poem_moushan.txt"

In [95]:
data_taihu = readfile(path_taihu)
# data_huzhou = readfile(path_huzhou)
# data_moushan = readfile(path_moushan)

In [110]:
# test1 = data_taihu[0]
# print(poem_cut(test1))

In [97]:
data_Taihu = data_preprocess(data_taihu)

In [98]:
print(data_Taihu)

['浣', '溪', '沙', '乐', '在', '烟', '波', '钓', '闲', '草', '堂', '松', '桂', '胜', '攀', '梢', '梢', '新', '月', '几', '回', '弯', '碧', '太', '湖', '三', '万', '顷', '屹', '然', '相', '对', '洞', '庭', '山', '况', '风', '浪', '起', '须', '还', '水', '调', '歌', '头', '舣', '棹', '太', '湖', '岸', '舣', '棹', '太', '湖', '岸', '天', '水', '相', '连', '垂', '虹', '亭', '五', '年', '到', '故', '依', '然', '洗', '我', '征', '尘', '三', '斗', '快', '揖', '商', '飚', '千', '里', '鸥', '鹭', '亦', '翩', '翩', '身', '在', '水', '晶', '阙', '真', '作', '驭', '风', '仙', '望', '秋', '五', '日', '月', '还', '圆', '倚', '栏', '清', '啸', '孤', '发', '惊', '起', '壑', '龙', '眠', '欲', '酹', '鸱', '夷', '西', '子', '未', '办', '当', '年', '功', '业', '空', '击', '五', '湖', '船', '用', '知', '余', '事', '莼', '鲈', '正', '芳', '鲜', '偈', '颂', '百', '零', '二', '首', '西', '风', '簇', '浪', '花', '太', '湖', '连', '底', '冻', '冷', '照', '玉', '奁', '清', '片', '瑕', '缝', '面', '目', '分', '明', '眼', '睛', '定', '动', '墯', '虚', '凝', '裂', '万', '差', '漆', '桶', '漆', '桶', '湖', '州', '歌', '九', '十', '八', '首', '十', '太', '湖', '风', '卷', '浪', '头', '高', '锦', '柁', '摇', '摇',

In [111]:
dict_freq_taihu = word_frequency(data_Taihu)
word_freq_taihu = freq_sort(dict_freq_taihu)

In [112]:
print(word_freq_taihu)

[('湖', 179), ('太', 158), ('山', 97), ('风', 70), ('人', 67), ('水', 66), ('天', 64), ('月', 58), ('有', 53), ('石', 45), ('日', 43), ('三', 43), ('云', 41), ('万', 40), ('白', 39), ('江', 39), ('如', 38), ('清', 38), ('西', 37), ('在', 36), ('年', 35), ('生', 35), ('青', 34), ('明', 34), ('相', 34), ('千', 34), ('多', 32), ('头', 32), ('烟', 31), ('玉', 31), ('首', 30), ('吴', 30), ('秋', 29), ('高', 29), ('龙', 29), ('平', 29), ('时', 28), ('波', 28), ('花', 28), ('松', 27), ('古', 27), ('长', 27), ('夜', 27), ('空', 26), ('君', 26), ('东', 26), ('雪', 26), ('心', 25), ('浪', 25), ('里', 25), ('我', 25), ('然', 25), ('今', 25), ('入', 25), ('树', 24), ('色', 24), ('望', 24), ('事', 24), ('春', 24), ('舟', 24), ('尽', 24), ('看', 24), ('溪', 23), ('仙', 23), ('歌', 23), ('飞', 23), ('十', 22), ('垂', 22), ('五', 22), ('二', 22), ('谁', 22), ('客', 21), ('落', 21), ('回', 21), ('行', 21), ('南', 21), ('见', 21), ('游', 21), ('归', 21), ('过', 21), ('流', 21), ('新', 21), ('庭', 20), ('起', 20), ('未', 20), ('欲', 20), ('前', 20), ('碧', 20), ('出', 20), ('神', 19), ('草', 1

In [101]:
# l = sorted(word_freq_taihu.items(), key=lambda d:d[1], reverse = True)
# print(l)

In [102]:
# test1 = '小明硕士毕业于中国科学院计算所，后在日本京都大学深造'
# se = jieba_clean_text(data_taihu[1])
# seg_list = jieba.cut(se, cut_all = True)
# s = ' '.join(seg_list)
# word = []
# for w in s.split():
#     word.append(w)
# print(word)

In [103]:
jieba_taihu = list(jieba_process(data_taihu))

In [104]:
#print(jieba_taihu)

In [107]:
dict_jieba_taihu = word_frequency(jieba_taihu)
word_freq_jieba_taihu = freq_sort(dict_jieba_taihu)

In [108]:
print(word_freq_jieba_taihu)

[('太湖', 150), ('山', 42), ('人', 32), ('天', 30), ('月', 26), ('云', 26), ('多', 26), ('水', 25), ('风', 25), ('我', 24), ('君', 23), ('看', 23), ('时', 23), ('白', 21), ('石', 21), ('尽', 20), ('欲', 20), ('垂', 20), ('谁', 20), ('树', 19), ('烟', 19), ('波', 19), ('望', 19), ('事', 19), ('高', 19), ('似', 18), ('过', 18), ('日', 18), ('秋', 17), ('雪', 17), ('到', 17), ('洞庭', 16), ('玉', 16), ('相', 16), ('亦', 16), ('空', 16), ('棹', 16), ('吴', 16), ('见', 15), ('寒', 15), ('碧', 15), ('入', 15), ('青', 15), ('和', 14), ('舟', 14), ('生', 14), ('客', 14), ('虹', 14), ('浪', 14), ('寄', 14), ('清', 14), ('游', 14), ('归', 14), ('湖', 13), ('起', 13), ('松', 13), ('春', 13), ('处', 13), ('万顷', 13), ('灵', 13), ('还', 13), ('古', 13), ('作', 13), ('夜', 13), ('未', 12), ('宿', 12), ('杂', 12), ('歌', 12), ('下', 12), ('当', 12), ('江', 12), ('王', 12), ('仙', 12), ('孤', 12), ('知', 12), ('飞', 12), ('三', 12), ('今', 12), ('只', 12), ('新', 12), ('吾', 11), ('草', 11), ('回', 11), ('平', 11), ('朝', 11), ('明月', 11), ('前', 11), ('亭', 11), ('后', 11), ('出', 11), ('长'

In [113]:
#单独统计词语出现频率
two_words = []
for tu in word_freq_jieba_taihu:
    if len(tu[0]) > 1:
        two_words.append(tu)
print(two_words)

[('太湖', 150), ('洞庭', 16), ('万顷', 13), ('明月', 11), ('湖州', 9), ('姑苏', 8), ('十二', 8), ('二首', 8), ('扁舟', 8), ('湖岸', 7), ('三万', 7), ('西子', 7), ('吴江', 7), ('湖西', 7), ('松江', 7), ('三首', 7), ('水调歌头', 6), ('湖水', 6), ('百二', 6), ('平生', 6), ('千里', 6), ('西风', 6), ('湖东', 5), ('风吹', 5), ('二十', 5), ('落日', 5), ('千顷', 5), ('依然', 5), ('人生', 4), ('当年', 4), ('古今', 4), ('长桥', 4), ('青天', 4), ('百花', 4), ('灵岩寺', 4), ('玉山', 4), ('天下', 4), ('山色', 4), ('渺然', 4), ('画舫', 4), ('风月', 4), ('万里', 4), ('惊起', 4), ('几回', 4), ('倒影', 4), ('草堂', 4), ('岩寺', 4), ('蓬莱', 4), ('太湖石', 4), ('分明', 4), ('日月', 4), ('四十', 3), ('二十三', 3), ('潇洒', 3), ('古人', 3), ('荆溪', 3), ('烟雨', 3), ('五湖', 3), ('狂风', 3), ('十三', 3), ('湖滨', 3), ('青山', 3), ('蛟龙', 3), ('超然', 3), ('数峰', 3), ('龙王', 3), ('山前', 3), ('想见', 3), ('吹箫', 3), ('过长', 3), ('六千', 3), ('百二十', 3), ('玛瑙', 3), ('鱼龙', 3), ('万六', 3), ('忽然', 3), ('烟波', 3), ('斜阳', 3), ('月夜', 3), ('七十', 3), ('瀛洲', 3), ('太守', 3), ('风浪', 3), ('萧疏', 3), ('松陵', 3), ('缥缈', 3), ('仙人', 3), ('吴兴', 3), ('惠山', 3), ('相对', 3)

str