In [3]:
#import common librarie
from zipfile import ZipFile 
import os.path
from os import path
import pandas as pd
import numpy as np
import base64
from itertools import groupby
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns



In [4]:
#import libraries to handle classic chinese word
from jiayan import load_lm
from jiayan import CharHMMTokenizer

# Baseline Model

## Load Data

In [5]:
file_name = "poems_with_new_tags.zip"
poem_df = pd.read_csv('poems_with_new_tags.zip', compression='zip', header=0, quotechar='"')
poem_df.head()

Unnamed: 0,content,dynasty,author,tags,star,author_stars,title,tags_list,new_tags,new_first_tag,写物,劝勉,家庭,快乐,悲苦,政治,朋友,游玩
0,《吴都赋》云：“户藏烟浦，家具画船。”唯吴兴为然。春游之盛，西湖未能过也。己酉岁，予与萧时父...,宋代,姜夔,春游;怀人;,64,279,琵琶仙·《吴都赋》云：「户藏烟浦,"['春游', '怀人']",政治;游玩,政治,0,0,0,0,0,1,0,1
1,《廿一史弹词》第三段说秦汉开场词滚滚长江东逝水，浪花淘尽英雄。是非成败转头空。青山依旧在，几...,明代,杨慎,咏史;抒怀;人生;哲理,3244,131,临江仙·滚滚长江东逝水,"['咏史', '抒怀', '人生', '哲理']",写物;劝勉;悲苦;政治,写物,1,1,0,0,1,1,0,0
2,《水经》云：“彭蠡之口有石钟山焉。”郦元以为下临深潭，微风鼓浪，水石相搏，声如洪钟。是说也，...,宋代,苏轼,古文观止;纪游;写景;写山,306,4011,石钟山记,"['古文观止', '纪游', '写景', '写山']",写物;政治;游玩,写物,1,0,0,0,0,1,0,1
3,【序】辛亥之冬，予载雪诣石湖。止既月，授简索句，且征新声，作此两曲。石湖把玩不已，使工妓隶习...,宋代,姜夔,咏物;梅花,74,279,暗香疏影,"['咏物', '梅花']",劝勉,劝勉,0,1,0,0,0,0,0,0
4,〔一枝花〕 攀出墙朵朵花，折临路枝枝柳。花攀红蕊嫩，柳折翠条柔，浪子风流。凭着我折柳攀花手...,元代,关汉卿,;散曲;抒情;生活,140,124,【南吕】一枝花不伏老,"['散曲', '抒情', '生活']",写物;快乐;悲苦,写物,1,0,0,1,1,0,0,0


## Aggregate Tags Type

too many tags cause the prediction too wide to predict the type of the poem 

In [5]:
tags_list = poem_df['tags'].tolist()

In [35]:
tags_set = set()
for tags in tags_list:
    tag_split_list = tags.strip(';').split(';')
    for tag in tag_split_list:
        if len(tag) > 1:
            tags_set.add(tag)

In [6]:
import synonyms


smart_open library not found; falling back to local-filesystem-only
[jieba] default dict file path ../data/vocab.txt
[jieba] default dict file path ../data/vocab.txt
[jieba] load default dict ../data/vocab.txt ...
[jieba] load default dict ../data/vocab.txt ...
>> Synonyms load wordseg dict [/home/lindayang16/anaconda3/lib/python3.8/site-packages/synonyms/data/vocab.txt] ... 
>> Synonyms on loading stopwords [/home/lindayang16/anaconda3/lib/python3.8/site-packages/synonyms/data/stopwords.txt] ...
[Synonyms] on loading vectors [/home/lindayang16/anaconda3/lib/python3.8/site-packages/synonyms/data/words.vector.gz] ...


In [68]:
# print("咏物: {}".format(synonyms.nearby("咏物"))) # 获取近义词
# synonyms.nearby("伤情")
synonyms.compare("往事", "忆昔")
# print("NOT_EXIST: {}".format((synonyms.nearby("NOT_EXIST"))))



0.189

In [None]:
smaller_tags_set = set()
smaller_tags_set.add('伤情')
error_tags = set()
for tag in tags_set:
    if tag not in smaller_tags_set:
        found_similar_tag = False
        for unique_tag in smaller_tags_set:
            if len(synonyms.nearby(tag)[0]) > 0:
                score = synonyms.compare(unique_tag, tag) 
                if score > 0.6:
                    found_similar_tag = True
                    break
            else:
                error_tags.add(tag)
                found_similar_tag = True
                print("error added {}".format(tag))
                break
        if not found_similar_tag:
            smaller_tags_set.add(tag)

In [8]:
poem_df['first_tag'] = poem_df \
.apply(lambda row : row.tags_list[1:-1]
       .replace("'","")
       .replace(",", "")
       .split(" ")[0], axis=1)

## Split Train and Test Data

In [6]:
poems_info = poem_df[['content','dynasty','author','title']]
tags = poem_df.drop(['content','dynasty','author','title','star','author_stars'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(poems_info, tags, test_size=0.33)

In [7]:
print(len(X_train)==len(y_train))

True


In [8]:
X_train.head()

Unnamed: 0,content,dynasty,author,title
4919,萧萧山路穷秋雨，淅淅溪风一岸蒲。为问寒沙新到雁，来时还下杜陵无。,唐代,杜牧,秋浦途中
4546,绿叶阴浓，遍池亭水阁，偏趁凉多。海榴初绽，朵朵簇红罗。老燕携雏弄语，有高柳鸣蝉相和。骤雨过，...,金朝,元好问,骤雨打新荷
5335,送人犹未苦，苦送春、随人去天涯。片红都飞尽，正阴阴润绿，暗里啼鸦。赋情顿雪双鬓，飞梦逐尘沙。...,宋代,吴文英,忆旧游·别黄澹翁
2310,惟汉廿二世，所任诚不良。沐猴而冠带，知小而谋强。犹豫不敢断，因狩执君王。白虹为贯日，己亦先受...,魏晋,曹操,薤露
2459,携手江村。梅雪飘裙。情何限、处处消魂。故人不见，旧曲重闻。向望湖楼，孤山寺，涌金门。寻常行处...,宋代,苏轼,行香子·丹阳寄述古


In [9]:
y_train.head()

Unnamed: 0,tags,tags_list,new_tags,new_first_tag,写物,劝勉,家庭,快乐,悲苦,政治,朋友,游玩
4919,思乡,['思乡'],悲苦,悲苦,0,0,0,0,1,0,0,0
4546,写景;抒怀,"['写景', '抒怀']",劝勉;政治,劝勉,0,1,0,0,0,1,0,0
5335,写景;离别;愁绪,"['写景', '离别', '愁绪']",悲苦;政治;朋友,悲苦,0,0,0,0,1,1,1,0
2310,乐府;感叹;悼念,"['乐府', '感叹', '悼念']",悲苦;政治,悲苦,0,0,0,0,1,1,0,0
2459,;写景;追忆;思念;友人,"['写景', '追忆', '思念', '友人']",劝勉;悲苦;政治,劝勉,0,1,0,0,1,1,0,0


## Tokenizing

In [10]:
lm = load_lm('jiayan.klm')

In [11]:
from jiayan import WordNgramTokenizer

In [12]:
#Naive Bayes
tokenizer = CharHMMTokenizer(lm)
# print(list(tokenizer.tokenize(text)))


def tokenize_poem(some_list, f):
    result = []
    for i, x in enumerate(some_list):
        if i % 100 == 0:
            print(i)
            print(x)
        chinese_chars = re.findall(r'[\u4e00-\u9fff]+', sample):
        chinese_string = "".join()
        result.append(f(x))
    return result
X_train_tokenized = tokenize_poem(X_train['content'], lambda x: " ".join(list(tokenizer.tokenize(x))))
print("get there")
X_test_tokenized = tokenize_poem(X_test['content'], lambda x: " ".join(list(tokenizer.tokenize(x))))
count_vect = CountVectorizer()


0
萧萧山路穷秋雨，淅淅溪风一岸蒲。为问寒沙新到雁，来时还下杜陵无。
100
北斗七星高，哥舒夜带刀。至今窥牧马，不敢过临洮。
200
憔悴年来甚，萧条益自伤。风威侵病骨，雨气咽愁肠。夜鼎唯煎药，朝髭半染霜。前缘竟何似，谁与问空王。
300
不是爱风尘，似被前缘误。花落花开自有时，总赖东君主。去也终须去，住也如何住！若得山花插满头，莫问奴归处。
400
人间无阿童，犹唱水中龙。白草侵烟死，秋藜绕地红。古书平黑石，神剑断青铜。耕势鱼鳞起，坟科马鬣封。菊花垂湿露，棘径卧干蓬。松柏愁香涩，南原几夜风！
500
羞日遮罗袖，愁春懒起妆。易求无价宝，难得有心郎。枕上潜垂泪，花间暗断肠。自能窥宋玉，何必恨王昌？
600
英雄立马起沙陀，奈此朱梁跋扈何。只手难扶唐社稷，连城犹拥晋山河。风云帐下奇儿在，鼓角灯前老泪多。萧瑟三垂冈下路，至今人唱《百年歌》。
700
大夫击东胡，胡尘不敢起。胡人山下哭，胡马海边死。部曲尽公侯，舆台亦朱紫。当时有勋业，末路遭谗毁。转旆燕赵间，剖符括苍里。弟兄莫相见，亲族远枌梓。不改青云心，仍招布衣士。平生怀感激，本欲候知己。去矣难重陈，飘然自兹始。游梁且未遇，适越今何以。乡山西北愁，竹箭东南美。峥嵘缙云外，苍莽几千里。旅雁悲啾啾，朝昏孰云已。登临多瘴疠，动息在风水。虽有贤主人，终为客行子。我携一尊酒，满酌聊劝尔。劝尔惟一言，家声勿沦滓。
800
候晓逾闽嶂，乘春望越台。宿云鹏际落，残月蚌中开。薜荔摇青气，桄榔翳碧苔。桂香多露裛，石响细泉回。抱叶玄猿啸，衔花翡翠来。南中虽可悦，北思日悠哉。鬒发俄成素，丹心已作灰。何当首归路，行剪故园莱。
900
一叶扁舟轻帆卷。暂泊楚江南岸。孤城暮角，引胡笳怨。水茫茫，平沙雁、旋惊散。烟敛寒林簇，画屏展。天际遥山小，黛眉浅。旧赏轻抛，到此成游宦。觉客程劳，年光晚。异乡风物，忍萧索、当愁眼。帝城赊，秦楼阻，旅魂乱。芳草连空阔，残照满。佳人无消息，断云远。
1000
闻道双衔凤带，不妨单著鲛绡。夜香知与阿谁烧。怅望水沈烟袅。云鬓风前绿卷，玉颜醉里红潮。莫教空度可怜宵。月与佳人共僚。
1100
长乐宫连上苑春，玉楼金殿艳歌新。君门一入无由出，唯有宫莺得见人。
1200
远岸收残雨。雨残稍觉江天暮。拾翠汀洲人寂静，立双双鸥鹭。望几点、渔灯隐映蒹葭浦。停画桡、两两舟人语。道去程今夜，遥指前村烟树。游宦成羁旅。短樯吟倚闲凝伫。万水千山

500
一春不识西湖面。翠羞红倦。雨窗和泪摇湘管。意长笺短。知心惟有雕梁燕。自来相伴。东风不管琵琶怨。落花吹遍。
600
五夜光寒，照来积雪平于栈。西风何限，自起披衣看。对此茫茫，不觉成长叹。何时旦，晓星欲散，飞起平沙雁。
700
金炉犹暖麝煤残。惜香更把宝钗翻。重闻处，余熏在，这一番、气味胜从前。背人偷盖小蓬山。更将沈水暗同然。且图得，氤氲久，为情深、嫌怕断头烟。
800
忆昔午桥桥上饮，坐中多是豪英。长沟流月去无声。杏花疏影里，吹笛到天明。二十余年如一梦，此身虽在堪惊。闲登小阁看新晴。古今多少事，渔唱起三更。
900
艳骨已成兰麝土，宫墙依旧压层崖。弩台雨坏逢金镞，香径泥销露玉钗。砚沼只留溪鸟浴，屟廊空信野花埋。姑苏麋鹿真闲事，须为当时一怆怀。
1000
太史公曰：“先人有言：‘自周公卒五百岁而有孔子。孔子卒后至于今五百岁，有能绍明世、正《易传》，继《春秋》、本《诗》、《书》、《礼》、《乐》之际？’”意在斯乎！意在斯乎！小子何敢让焉！　　上大夫壶遂曰：“昔孔子何为而作《春秋》哉”？太史公曰：“余闻董生曰：‘周道衰废，孔子为鲁司寇，诸侯害子，大夫雍之。孔子知言之不用，道之不行也，是非二百四十二年之中，以为天下仪表，贬天子，退诸侯，讨大夫，以达王事而已矣。’子曰：‘我欲载之空言，不如见之于行事之深切著明也。’夫《春秋》，上明三王之道，下辨人事之纪，别嫌疑，明是非，定犹豫，善善恶恶，贤贤贱不肖，存亡国，继绝世，补弊起废，王道之大者也。《易》著天地、阴阳、四时、五行，故长于变；《礼》经纪人伦，故长于行；《书》记先王之事，。故长于政；《诗》记山川、溪谷、禽兽、草木、牝牡、雌雄，故长于风；《乐》乐所以立，故长于和；《春秋》辨是非，故长于治人。是故《礼》以节人，《乐》以发和，《书》以道事，《诗》以达意，《易》以道化，《春秋》以道义。拨乱世反之正，莫近于《春秋》。《春秋》文成数万，其指数千。万物之散聚皆在《春秋》。《春秋》之中，弑君三十六，亡国五十二，诸侯奔走不得保其社稷者不可胜数。察其所以，皆失其本已。故《易》曰‘失之毫厘，差之千里。’故曰‘臣弑君，子弑父，非一旦一夕之故也，其渐久矣’。故有国者不可以不知《春秋》，前有谗而弗见，后有贼而不知。为人臣者不可以不知《春秋》，守经事而不知其宜，遭变事而不知其权。为人君父而不通于《春秋》之义者，必蒙首恶之名。为人臣子而不通于《春秋》之

In [13]:
Encoder = LabelEncoder()
y_train_encoded = Encoder.fit_transform(y_train['new_first_tag'])
y_test_encoded = Encoder.fit_transform(y_test['new_first_tag'])

In [14]:
X_train_tokenized[0]

'萧萧 山路 穷 秋雨 ， 淅淅 溪 风 一 岸 蒲 。 为 问 寒沙 新 到 雁 ， 来时 还 下 杜陵 无 。'

In [15]:
X_train_cv = count_vect.fit_transform(X_train_tokenized)
X_test_cv = count_vect.transform(X_test_tokenized)

In [42]:
# Tfidf_vect = TfidfVectorizer(max_features=5000)
# Tfidf_vect.fit(Corpus['text_final'])
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)

## NB Model

### Build Model

In [43]:
# word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())
# top_words_df = pd.DataFrame(word_freq.sum()).sort_values(0, ascending=False)

In [78]:
naive_bayes = MultinomialNB()


### Train the Model

In [79]:
naive_bayes.fit(X_train_cv, y_train['new_first_tag'])
predictions = naive_bayes.predict(X_test_cv)

In [80]:
predictions

array(['写物', '悲苦', '写物', ..., '写物', '家庭', '写物'], dtype='<U2')

In [81]:
print('Accuracy score: ', accuracy_score(y_test['new_first_tag'], predictions))
print('Precision score: ', precision_score(y_test['new_first_tag'], predictions, average='weighted'))
print('Recall score: ', recall_score(y_test['new_first_tag'], predictions, average='weighted'))

Accuracy score:  0.40029761904761907
Precision score:  0.32712915681378885
Recall score:  0.40029761904761907


  _warn_prf(average, modifier, msg_start, len(result))


## SVM Model

### Build the Model

In [36]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

### Train the Model

In [37]:
SVM.fit(X_train_cv, y_train['new_first_tag'])
predictions_SVM = SVM.predict(X_test_cv)

### Evaluation

In [38]:
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test['new_first_tag'])*100)

SVM Accuracy Score ->  34.25742574257426


## Other Models

In [14]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.linear_model import LogisticRegression


In [16]:
y_train_powerset = y_train.drop(['tags','tags_list','new_tags','new_first_tag'], axis=1)
y_test_powerset = y_test.drop(['tags','tags_list','new_tags','new_first_tag'], axis=1)

In [17]:
len(y_train_powerset)

4091

In [18]:
len(X_train_tokenized)

4091

In [None]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
classifier.fit(X_train_cv, y_train_powerset)
# predict
predictions = classifier.predict(X_test_cv)
# accuracy
print("Accuracy = ",accuracy_score(y_train_powerset,predictions))
print("\n")

### One vs Rest

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [20]:
categories = ['写物','劝勉','家庭','快乐','悲苦','政治','朋友','游玩']

In [22]:
%%time

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(X_train_cv, y_train_powerset[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(X_test_cv)
    print('Test accuracy is {}'.format(accuracy_score(y_test_powerset[category], prediction)))
    print("\n")

**Processing 写物 comments...**




Test accuracy is 0.5818452380952381


**Processing 劝勉 comments...**




Test accuracy is 0.7723214285714286


**Processing 家庭 comments...**




Test accuracy is 0.90625


**Processing 快乐 comments...**




Test accuracy is 0.8690476190476191


**Processing 悲苦 comments...**




Test accuracy is 0.6091269841269841


**Processing 政治 comments...**




Test accuracy is 0.5892857142857143


**Processing 朋友 comments...**




Test accuracy is 0.7847222222222222


**Processing 游玩 comments...**
Test accuracy is 0.8075396825396826


CPU times: user 2.15 s, sys: 6.84 ms, total: 2.16 s
Wall time: 2.3 s


