In [1]:
#import common librarie
from zipfile import ZipFile 
import os.path
from os import path
import pandas as pd
import numpy as np
import base64
from itertools import groupby
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns



## Load Data

In [3]:
file_name = "poems_with_tags.zip"
poem_df = pd.read_csv('poems_with_tags.zip', compression='zip', header=0, quotechar='"')
poem_df.head()

Unnamed: 0,content,dynasty,author,tags,star,author_stars,title,tags_list
0,《吴都赋》云：“户藏烟浦，家具画船。”唯吴兴为然。春游之盛，西湖未能过也。己酉岁，予与萧时父...,宋代,姜夔,春游;怀人;,64,279,琵琶仙·《吴都赋》云：「户藏烟浦,"['春游', '怀人']"
1,《廿一史弹词》第三段说秦汉开场词滚滚长江东逝水，浪花淘尽英雄。是非成败转头空。青山依旧在，几...,明代,杨慎,咏史;抒怀;人生;哲理,3244,131,临江仙·滚滚长江东逝水,"['咏史', '抒怀', '人生', '哲理']"
2,《水经》云：“彭蠡之口有石钟山焉。”郦元以为下临深潭，微风鼓浪，水石相搏，声如洪钟。是说也，...,宋代,苏轼,古文观止;纪游;写景;写山,306,4011,石钟山记,"['古文观止', '纪游', '写景', '写山']"
3,【序】辛亥之冬，予载雪诣石湖。止既月，授简索句，且征新声，作此两曲。石湖把玩不已，使工妓隶习...,宋代,姜夔,咏物;梅花,74,279,暗香疏影,"['咏物', '梅花']"
4,〔一枝花〕 攀出墙朵朵花，折临路枝枝柳。花攀红蕊嫩，柳折翠条柔，浪子风流。凭着我折柳攀花手...,元代,关汉卿,;散曲;抒情;生活,140,124,【南吕】一枝花不伏老,"['散曲', '抒情', '生活']"


In [4]:
poem_df['first_tag'] = poem_df \
.apply(lambda row : row.tags_list[1:-1]
       .replace("'","")
       .replace(",", "")
       .split(" ")[0], axis=1)

## Split Train and Test Data

In [5]:
poems_info = poem_df[['content','dynasty','author','title']]
tags = poem_df[['first_tag', 'tags_list']]
X_train, X_test, y_train, y_test = train_test_split(poems_info, tags, test_size=0.33)

In [6]:
X_train.head()

Unnamed: 0,content,dynasty,author,title
402,乱飘僧舍茶烟湿，密洒歌楼酒力微。江上晚来堪画处，渔人披得一蓑归。,唐代,郑谷,雪中偶题
1749,宿醉离愁慢髻鬟，六铢衣薄惹轻寒，慵红闷翠掩青鸾。罗袜况兼金菡萏，雪肌仍是玉琅玕，骨香腰细更沈檀。,唐代,韩偓,浣溪沙·宿醉离愁慢髻鬟
5551,锦江滑腻蛾眉秀，幻出文君与薛涛。言语巧偷鹦鹉舌，文章分得凤凰毛。纷纷辞客多停笔，个个公卿欲梦...,唐代,元稹,寄赠薛涛
4361,稽山罢雾郁嵯峨，镜水无风也自波。莫言春度芳菲尽，别有中流采芰荷。,唐代,贺知章,相和歌辞·采莲曲
2837,晓披烟雾入青峦，山寺疏钟万木寒。千古河流成沃野，几年沙势自风湍。水穿石甲龙鳞动，日绕峰头佛顶...,明代,王守仁,登大伾山诗


In [7]:
y_train.head()

Unnamed: 0,first_tag,tags_list
402,写雪,"['写雪', '写人']"
1749,婉约,"['婉约', '写景', '荷花']"
5551,才女,['才女']
4361,写景,['写景']
2837,登高,"['登高', '写山', '抒怀', '爱国', '壮志']"


## Tokenizing


In [9]:
from transformers import BertTokenizer, RobertaTokenizer, DistilBertTokenizer


In [10]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268943.0, style=ProgressStyle(descripti…




In [17]:
bert_tokenizer.tokenize("湖上与张先同赋")

['湖', '上', '与', '张', '先', '同', '赋']

In [18]:
from jiayan import load_lm
from jiayan import CharHMMTokenizer
lm = load_lm('jiayan.klm')
tokenizer = CharHMMTokenizer(lm)

In [19]:
print(list(tokenizer.tokenize("湖上与张先同赋")))


['湖上', '与张', '先', '同', '赋']


In [20]:

def tokenize_poem(some_list, f):
    return [f(x) for x in some_list]
X_train_tokenized = tokenize_poem(X_train['content'], lambda x: " ".join(list(bert_tokenizer.tokenize(x))))
X_test_tokenized = tokenize_poem(X_test['content'], lambda x: " ".join(list(bert_tokenizer.tokenize(x))))
count_vect = CountVectorizer()

In [21]:
Encoder = LabelEncoder()
y_train_encoded = Encoder.fit_transform(y_train['first_tag'])
y_test_encoded = Encoder.fit_transform(y_test['first_tag'])

In [22]:
X_train_tokenized[0]

'乱 飘 僧 舍 茶 烟 湿 ， 密 洒 歌 楼 酒 力 微 。 江 上 晚 来 堪 画 处 ， 渔 人 披 得 一 蓑 归 。'

In [23]:
X_train_cv = count_vect.fit_transform(X_train_tokenized)
X_test_cv = count_vect.transform(X_test_tokenized)

In [24]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_cv, y_train['first_tag'])
predictions = naive_bayes.predict(X_test_cv)

In [25]:
predictions

array(['写景', '写景', '写景', ..., '写景', '写景', '写景'], dtype='<U11')

In [26]:
print('Accuracy score: ', accuracy_score(y_test['first_tag'], predictions))
print('Precision score: ', precision_score(y_test['first_tag'], predictions, average='weighted'))
print('Recall score: ', recall_score(y_test['first_tag'], predictions, average='weighted'))

Accuracy score:  0.06980198019801981
Precision score:  0.004872316439564749
Recall score:  0.06980198019801981


  _warn_prf(average, modifier, msg_start, len(result))
