In [15]:
#import common librarie
from zipfile import ZipFile 
import os.path
from os import path
import pandas as pd
import numpy as np
import base64
from itertools import groupby
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
#import libraries to handle classic chinese word
from jiayan import load_lm
from jiayan import CharHMMTokenizer

# Baseline Model

## Load Data

In [3]:
file_name = "poems_with_tags.zip"
poem_df = pd.read_csv('poems_with_tags.zip', compression='zip', header=0, quotechar='"')
poem_df.head()

Unnamed: 0,content,dynasty,author,tags,star,author_stars,title,tags_list
0,《吴都赋》云：“户藏烟浦，家具画船。”唯吴兴为然。春游之盛，西湖未能过也。己酉岁，予与萧时父...,宋代,姜夔,春游;怀人;,64,279,琵琶仙·《吴都赋》云：「户藏烟浦,"['春游', '怀人']"
1,《廿一史弹词》第三段说秦汉开场词滚滚长江东逝水，浪花淘尽英雄。是非成败转头空。青山依旧在，几...,明代,杨慎,咏史;抒怀;人生;哲理,3244,131,临江仙·滚滚长江东逝水,"['咏史', '抒怀', '人生', '哲理']"
2,《水经》云：“彭蠡之口有石钟山焉。”郦元以为下临深潭，微风鼓浪，水石相搏，声如洪钟。是说也，...,宋代,苏轼,古文观止;纪游;写景;写山,306,4011,石钟山记,"['古文观止', '纪游', '写景', '写山']"
3,【序】辛亥之冬，予载雪诣石湖。止既月，授简索句，且征新声，作此两曲。石湖把玩不已，使工妓隶习...,宋代,姜夔,咏物;梅花,74,279,暗香疏影,"['咏物', '梅花']"
4,〔一枝花〕 攀出墙朵朵花，折临路枝枝柳。花攀红蕊嫩，柳折翠条柔，浪子风流。凭着我折柳攀花手...,元代,关汉卿,;散曲;抒情;生活,140,124,【南吕】一枝花不伏老,"['散曲', '抒情', '生活']"


In [4]:
poem_df['first_tag'] = poem_df \
.apply(lambda row : row.tags_list[1:-1]
       .replace("'","")
       .replace(",", "")
       .split(" ")[0], axis=1)

## Split Train and Test Data

In [41]:
poems_info = poem_df[['content','dynasty','author','title']]
tags = poem_df[['first_tag', 'tags_list']]
X_train, X_test, y_train, y_test = train_test_split(poems_info, tags, test_size=0.33)

In [42]:
X_train.head()

Unnamed: 0,content,dynasty,author,title
5411,都护新灭胡，士马气亦粗。萧条虏尘净，突兀天山孤。,唐代,岑参,灭胡曲
316,中和癸卯春三月，洛阳城外花如雪。东西南北路人绝，绿杨悄悄香尘灭。路旁忽见如花人，独向绿杨阴下...,唐代,韦庄,秦妇吟
306,两人对酌山花开，一杯一杯复一杯。我醉欲眠卿且去，明朝有意抱琴来。,唐代,李白,山中与幽人对酌
1028,午梦扁舟花底，香满西湖烟水。急雨打篷声，梦初惊。却是池荷跳雨，散了真珠还聚。聚作水银窝，泻清...,宋代,杨万里,昭君怨·咏荷上雨
1846,小径红稀，芳郊绿遍。高台树色阴阴见。春风不解禁杨花，蒙蒙乱扑行人面。(蒙通：濛)翠叶藏莺，朱...,宋代,晏殊,踏莎行·小径红稀


In [43]:
y_train.head()

Unnamed: 0,first_tag,tags_list
5411,边塞,"['边塞', '生活', '豪迈']"
316,叙事,"['叙事', '长诗']"
306,生活,"['生活', '饮酒']"
1028,咏物,"['咏物', '写雨']"
1846,婉约,"['婉约', '春天', '写景', '抒情', '怅惘']"


## Tokenizing

In [8]:
lm = load_lm('jiayan.klm')

In [9]:
#Naive Bayes
tokenizer = CharHMMTokenizer(lm)
# print(list(tokenizer.tokenize(text)))


def tokenize_poem(some_list, f):
    return [f(x) for x in some_list]
X_train_tokenized = tokenize_poem(X_train['content'], lambda x: " ".join(list(tokenizer.tokenize(x))))
X_test_tokenized = tokenize_poem(X_test['content'], lambda x: " ".join(list(tokenizer.tokenize(x))))
count_vect = CountVectorizer()


In [25]:
Encoder = LabelEncoder()
y_train_encoded = Encoder.fit_transform(y_train['first_tag'])
y_test_encoded = Encoder.fit_transform(y_test['first_tag'])

In [10]:
X_train_cv = count_vect.fit_transform(X_train_tokenized)
X_test_cv = count_vect.transform(X_test_tokenized)

In [None]:
# Tfidf_vect = TfidfVectorizer(max_features=5000)
# Tfidf_vect.fit(Corpus['text_final'])
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)

## NB Model

### Build Model

In [None]:
# word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())
# top_words_df = pd.DataFrame(word_freq.sum()).sort_values(0, ascending=False)

In [27]:
naive_bayes = MultinomialNB()


### Train the Model

In [29]:
naive_bayes.fit(X_train_cv, y_train['first_tag'])
predictions = naive_bayes.predict(X_test_cv)

In [30]:
predictions

array(['春天', '唐诗三百首', '写景', ..., '唐诗三百首', '古文观止', '古文观止'], dtype='<U11')

In [31]:
print('Accuracy score: ', accuracy_score(y_test['first_tag'], predictions))
print('Precision score: ', precision_score(y_test['first_tag'], predictions, average='weighted'))
print('Recall score: ', recall_score(y_test['first_tag'], predictions, average='weighted'))

Accuracy score:  0.13663366336633664
Precision score:  0.09343189343064562
Recall score:  0.13663366336633664


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM Model

### Build the Model

In [36]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

### Train the Model

In [37]:
SVM.fit(X_train_cv, y_train['first_tag'])
predictions_SVM = SVM.predict(X_test_cv)

### Evaluation

In [39]:
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test['first_tag'])*100)

SVM Accuracy Score ->  11.93069306930693
