In [23]:
import numpy as np
import pandas as pd
import os
import sys
import math
import torch
import time
import re
import cv2
import shutil
from pathlib import Path
from tqdm import tqdm
from matplotlib import pyplot as plt
from torchvision import transforms
from icecream import ic
import tensorflow as tf 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
np.set_printoptions(precision=2,suppress=True)

In [None]:
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(device_type='GPU', devices=gpus[0])

In [2]:
# ! wget http://aimaksen.bslience.cn/poetrySong.txt

--2022-03-05 03:02:16--  http://aimaksen.bslience.cn/poetrySong.txt
Resolving aimaksen.bslience.cn (aimaksen.bslience.cn)... 140.249.60.152, 240e:944:1:e00:3::3fc
Connecting to aimaksen.bslience.cn (aimaksen.bslience.cn)|140.249.60.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56092736 (53M) [text/plain]
Saving to: ‘poetrySong.txt’


2022-03-05 03:03:13 (956 KB/s) - ‘poetrySong.txt’ saved [56092736/56092736]



In [10]:
!ls

Sources       poetrySong.txt  test03.ipynb  test08.ipynb  test12.ipynb
bbc-text.csv  test01.ipynb    test04.ipynb  test10.ipynb  test9.ipynb
english       test02.ipynb    test05.ipynb  test11.ipynb


In [2]:
txtfile = Path("poetrySong.txt")

In [17]:
with open(txtfile) as f:
    for i, line in enumerate(f.readlines()):
        if i < 10:
            print(line)

太白山祈雨诗五首  其一::苏辙::田漫漫，耕挹挹。拔陈草，生九谷。人功尽，雨则违。苗不穗，莩不米，哀将饥兮。

太白山祈雨诗五首  其二::苏辙::山岩岩，奠南西。嗟我民，匪神依。伐山木，蓺稷黍。求既多，诉不已，犹我许兮。

太白山祈雨诗五首  其三::苏辙::山为灰，石为炭。水泉沸，百草烂。神予我，旱夺之。孰为是，骄不威，尚可弛兮。

太白山祈雨诗五首  其四::苏辙::雷冯空，雨腾渊。诛孽妖，反丰年。顾千里，瞬三日。神在堂，龙为役，是何惜兮。

太白山祈雨诗五首  其五::苏辙::雨既止，百谷复。筑场壤，治囷簏。为酒醴，伐豚羔。舞长袖，击鸣鼍，匪以报兮。

舜泉诗::苏辙::历山岩岩，虞舜宅焉。虞舜徂矣，其神在天。其德在人，其物在泉。神不可亲，德用不知。有冽斯泉，下民是祗。泉流无疆，有永我思。源发于山，施于北河。播于中逵，汇为澄波。有鳖与鱼，有菱与荷。蕴毒是泄，污浊以流。埃{土盍}消亡，风火灭收。丛木敷荣，劳者所休。谁为旱灾，靡物不伤。天地耗竭，泉亦沦亡。民咸不宁，曰不享耶。时雨既澍，百谷既登。有流泫然，弥坎而升。沟洫满盈，𫚥黾沸腾。匪泉实来，帝实顾余。执其羔豚，蘋藻是菹。帝今在堂，泉复如初。

筠州圣祖殿诗  其一::苏辙::高安在南，分自豫章。重山复江，鱼鸟之乡。俗野不文，吏亦怠荒。礼失不知，习为旧常。

筠州圣祖殿诗  其二::苏辙::于穆圣祖，宅神皇极。降鉴在下，子孙千亿。羽衣玉佩，旗纛旄节。巍巍煌煌，秩祀万国。

筠州圣祖殿诗  其三::苏辙::如日在天，靡国不临。筠虽小邦，其有不歆。东庑西向，谁昔营之。民昏不知，神以不怀。

筠州圣祖殿诗  其四::苏辙::深山之间，野水之滨。礼乐声明，孰见孰闻。祖庙之严，君臣则存。失而不图，民以罔观。



In [3]:
df = pd.read_csv(str(txtfile), sep="::", header=None)
df.columns = ['cipai', 'author', 'content']

In [5]:
df.head()

Unnamed: 0,cipai,author,content
0,太白山祈雨诗五首 其一,苏辙,田漫漫，耕挹挹。拔陈草，生九谷。人功尽，雨则违。苗不穗，莩不米，哀将饥兮。
1,太白山祈雨诗五首 其二,苏辙,山岩岩，奠南西。嗟我民，匪神依。伐山木，蓺稷黍。求既多，诉不已，犹我许兮。
2,太白山祈雨诗五首 其三,苏辙,山为灰，石为炭。水泉沸，百草烂。神予我，旱夺之。孰为是，骄不威，尚可弛兮。
3,太白山祈雨诗五首 其四,苏辙,雷冯空，雨腾渊。诛孽妖，反丰年。顾千里，瞬三日。神在堂，龙为役，是何惜兮。
4,太白山祈雨诗五首 其五,苏辙,雨既止，百谷复。筑场壤，治囷簏。为酒醴，伐豚羔。舞长袖，击鸣鼍，匪以报兮。


In [23]:
len(df)

254237

In [4]:
# nan
df = df.dropna()
len(df)

254214

In [5]:
contents = df['content'].values.tolist()

In [28]:
contents[:30]

['田漫漫，耕挹挹。拔陈草，生九谷。人功尽，雨则违。苗不穗，莩不米，哀将饥兮。',
 '山岩岩，奠南西。嗟我民，匪神依。伐山木，蓺稷黍。求既多，诉不已，犹我许兮。',
 '山为灰，石为炭。水泉沸，百草烂。神予我，旱夺之。孰为是，骄不威，尚可弛兮。',
 '雷冯空，雨腾渊。诛孽妖，反丰年。顾千里，瞬三日。神在堂，龙为役，是何惜兮。',
 '雨既止，百谷复。筑场壤，治囷簏。为酒醴，伐豚羔。舞长袖，击鸣鼍，匪以报兮。',
 '历山岩岩，虞舜宅焉。虞舜徂矣，其神在天。其德在人，其物在泉。神不可亲，德用不知。有冽斯泉，下民是祗。泉流无疆，有永我思。源发于山，施于北河。播于中逵，汇为澄波。有鳖与鱼，有菱与荷。蕴毒是泄，污浊以流。埃{土盍}消亡，风火灭收。丛木敷荣，劳者所休。谁为旱灾，靡物不伤。天地耗竭，泉亦沦亡。民咸不宁，曰不享耶。时雨既澍，百谷既登。有流泫然，弥坎而升。沟洫满盈，𫚥黾沸腾。匪泉实来，帝实顾余。执其羔豚，蘋藻是菹。帝今在堂，泉复如初。',
 '高安在南，分自豫章。重山复江，鱼鸟之乡。俗野不文，吏亦怠荒。礼失不知，习为旧常。',
 '于穆圣祖，宅神皇极。降鉴在下，子孙千亿。羽衣玉佩，旗纛旄节。巍巍煌煌，秩祀万国。',
 '如日在天，靡国不临。筠虽小邦，其有不歆。东庑西向，谁昔营之。民昏不知，神以不怀。',
 '深山之间，野水之滨。礼乐声明，孰见孰闻。祖庙之严，君臣则存。失而不图，民以罔观。',
 '毛侯始来，其则有意。匪民之愚，礼教实坠。章闻于朝，帝曰俞哉。弗改弗营，何以示民。',
 '九峰之杉，逍遥之柟。易直且修，弗斫而堪。新堂有严，四星在南。朝廷之仪，万民所祗。',
 '还朝正三伏，一再趋未央。久从江海游，苦此剑佩长。梦中惊和璞，起坐怜老房。为我忝丞辖，置身愿并凉。此心一自许，何暇忧陟冈。早岁发归念，老来未尝忘。渊明不久仕，黔娄足为康。家有二顷田，岁办十口粮。教敕诸子弟，编排旧文章。辛勤养松竹，迟莫多风霜。常恐先着鞭，独引社酒尝。火急报君恩，会合心则降。',
 '卧对郗人气已真，晚依丘壑更无伦。不须复预清言侣，自是江东第一人。',
 '亟往遄归真旷哉，聋人不信有惊雷。虽云不必见安道，已误扁舟犯雪来。',
 '失脚来游九陌尘，故溪何日定抽身。便同贺老扁舟去，已笑西山郑子真。',
 '雨细风斜欲暝时，凌波一叶去安归。遥知夜宿蛟人室，浪卷波分不着衣。',
 '老去那

# 读取词表

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
tokenizer = Tokenizer(char_level=True, oov_token="<OOV>")

In [8]:
tokenizer.fit_on_texts(contents)

In [21]:
vocab_size = len(tokenizer.word_index)+1
vocab_size

10111

In [10]:
tokenizer.texts_to_sequences(contents)[:1]

[[309,
  548,
  548,
  2,
  713,
  2193,
  2193,
  3,
  1767,
  621,
  143,
  2,
  18,
  226,
  476,
  3,
  5,
  298,
  127,
  2,
  47,
  1026,
  1335,
  3,
  1690,
  4,
  2701,
  2,
  5246,
  4,
  1327,
  2,
  905,
  153,
  702,
  234,
  3]]

In [12]:
input_sentences = []
for line in contents:
    tokenlist = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(tokenlist)):
        if np.random.randint(10):continue
        ngram_sentence = tokenlist[:i+1]
        input_sentences.append(ngram_sentence)
#     break
# tokenlist
len(input_sentences)
# 15120466

1509632

In [None]:
# del input_sentences

In [11]:
np.random.randint(3)

2

AttributeError: 'list' object has no attribute 'shape'

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
input_squences = pad_sequences(input_sentences, padding='pre')
input_squences.shape

(1509632, 2396)

In [25]:
xs = input_squences[:, :-1]
labels = input_squences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [29]:
xs = xs[:10000]
ys = ys[:10000]
xs.shape, ys.shape


((10000, 2395), (10000, 10111))

In [19]:
xs[0]


array([  0,   0,   0, ..., 127,   2,  47], dtype=int32)

In [20]:
label[0]

1026

# 构建模型

In [27]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, 100), 
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(150)),
        tf.keras.layers.Dense(vocab_size, activation="sigmoid"),
    ]
)

2022-03-05 07:07:39.852129: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-05 07:07:54.810948: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 44769 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:89:00.0, compute capability: 8.6
2022-03-05 07:07:54.813547: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 44769 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:b1:00.0, compute capability: 8.6


In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1011100   
                                                                 
 bidirectional (Bidirectiona  (None, 300)              226800    
 l)                                                              
                                                                 
 dense (Dense)               (None, 10111)             3043411   
                                                                 
Total params: 4,281,311
Trainable params: 4,281,311
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.compile(
    loss=tf.losses.categorical_crossentropy,
    optimizer='Adam',
    metrics=['accuracy']
)

In [35]:
history = model.fit(xs, ys, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
