In [1]:
#import common librarie
from zipfile import ZipFile 
import os.path
from os import path
import pandas as pd
import numpy as np
import base64
from itertools import groupby
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
#import libraries to handle classic chinese word
from jiayan import load_lm
from jiayan import CharHMMTokenizer

# Baseline Model

## Load Data

In [3]:
file_name = "poems_with_new_tags.zip"
poem_df = pd.read_csv('poems_with_new_tags.zip', compression='zip', header=0, quotechar='"')
poem_df.head()

Unnamed: 0,content,dynasty,author,tags,star,author_stars,title,tags_list,new_tags,new_first_tag,写物,劝勉,家庭,快乐,悲苦,政治,朋友,游玩
0,《吴都赋》云：“户藏烟浦，家具画船。”唯吴兴为然。春游之盛，西湖未能过也。己酉岁，予与萧时父...,宋代,姜夔,春游;怀人;,64,279,琵琶仙·《吴都赋》云：「户藏烟浦,"['春游', '怀人']",政治;游玩,政治,0,0,0,0,0,1,0,1
1,《廿一史弹词》第三段说秦汉开场词滚滚长江东逝水，浪花淘尽英雄。是非成败转头空。青山依旧在，几...,明代,杨慎,咏史;抒怀;人生;哲理,3244,131,临江仙·滚滚长江东逝水,"['咏史', '抒怀', '人生', '哲理']",写物;劝勉;悲苦;政治,写物,1,1,0,0,1,1,0,0
2,《水经》云：“彭蠡之口有石钟山焉。”郦元以为下临深潭，微风鼓浪，水石相搏，声如洪钟。是说也，...,宋代,苏轼,古文观止;纪游;写景;写山,306,4011,石钟山记,"['古文观止', '纪游', '写景', '写山']",写物;政治;游玩,写物,1,0,0,0,0,1,0,1
3,【序】辛亥之冬，予载雪诣石湖。止既月，授简索句，且征新声，作此两曲。石湖把玩不已，使工妓隶习...,宋代,姜夔,咏物;梅花,74,279,暗香疏影,"['咏物', '梅花']",劝勉,劝勉,0,1,0,0,0,0,0,0
4,〔一枝花〕 攀出墙朵朵花，折临路枝枝柳。花攀红蕊嫩，柳折翠条柔，浪子风流。凭着我折柳攀花手...,元代,关汉卿,;散曲;抒情;生活,140,124,【南吕】一枝花不伏老,"['散曲', '抒情', '生活']",写物;快乐;悲苦,写物,1,0,0,1,1,0,0,0


## Split Train and Test Data

In [4]:
poems_info = poem_df[['content','dynasty','author','title']]
tags = poem_df.drop(['content','dynasty','author','title','star','author_stars'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(poems_info, tags, test_size=0.33)

In [5]:
print(len(X_train)==len(y_train))

True


In [6]:
X_train.head()

Unnamed: 0,content,dynasty,author,title
543,今夜鄜州月，闺中只独看。遥怜小儿女，未解忆长安。香雾云鬟湿，清辉玉臂寒。何时倚虚幌，双照泪痕干。,唐代,杜甫,月夜
5181,谷口春残黄鸟稀，辛夷花尽杏花飞。(谷口一作：溪上)始怜幽竹山窗下，不改清阴待我归。,唐代,钱起,晚春归山居题窗前竹暮春归故山草堂诗
2990,朝云乱人目，帝女湘川宿。折菡巫山下，采荇洞庭腹。故以轻薄好，千里命舻舳。何事非相思，江上葳蕤竹。,南北朝,吴均,登二妃庙
3585,浮云护月，未放满朱扉。鼠摇暗壁，萤度破窗，偷入书帏。秋意浓，闲伫立，庭柯影里。好风襟袖先知。...,宋代,周邦彦,四园竹·浮云护月
2312,惠远祠前晋溪水，翠叶银花清见底。水上西山如挂屏，郁郁苍苍三十里。中原北门形势雄，想见城阙云烟...,金朝,元好问,过晋阳故城书事


In [7]:
y_train.head()

Unnamed: 0,tags,tags_list,new_tags,new_first_tag,写物,劝勉,家庭,快乐,悲苦,政治,朋友,游玩
543,唐诗三百首;高中古诗;月亮;思念;女子,"['唐诗三百首', '高中古诗', '月亮', '思念', '女子']",写物;悲苦,写物,1,0,0,0,1,0,0,0
5181,写景;抒情,"['写景', '抒情']",悲苦;政治,悲苦,0,0,0,0,1,1,0,0
2990,咏史怀古;赞颂;写人;爱情;忠贞,"['咏史怀古', '赞颂', '写人', '爱情', '忠贞']",写物;家庭;悲苦;政治,写物,1,0,1,0,1,1,0,0
3585,秋天;夜晚;抒情;思念,"['秋天', '夜晚', '抒情', '思念']",悲苦;政治;游玩,悲苦,0,0,0,0,1,1,0,1
2312,咏史怀古,['咏史怀古'],政治,政治,0,0,0,0,0,1,0,0


## Tokenizing

In [8]:
lm = load_lm('jiayan.klm')

In [9]:
from jiayan import WordNgramTokenizer

In [11]:
#Naive Bayes
tokenizer = CharHMMTokenizer(lm)
# print(list(tokenizer.tokenize(text)))


def tokenize_poem(some_list, f):
    result = []
    for i, x in enumerate(some_list):
        result.append(f(x))
    return result
X_train_tokenized = tokenize_poem(X_train['content'], lambda x: " ".join(list(tokenizer.tokenize(x))))
print("get there")
X_test_tokenized = tokenize_poem(X_test['content'], lambda x: " ".join(list(tokenizer.tokenize(x))))
count_vect = CountVectorizer()


get there


In [12]:
Encoder = LabelEncoder()
y_train_encoded = Encoder.fit_transform(y_train['new_first_tag'])
y_test_encoded = Encoder.fit_transform(y_test['new_first_tag'])

In [13]:
X_train_tokenized[0]

'今夜 鄜州 月 ， 闺中 只 独 看 。 遥 怜 小儿 女 ， 未解 忆长安 。 香雾 云鬟 湿 ， 清辉 玉臂 寒 。 何 时 倚虚幌 ， 双 照 泪痕 干 。'

In [14]:
X_train_cv = count_vect.fit_transform(X_train_tokenized)
X_test_cv = count_vect.transform(X_test_tokenized)

In [15]:
# Tfidf_vect = TfidfVectorizer(max_features=5000)
# Tfidf_vect.fit(Corpus['text_final'])
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)

## Other Models

In [32]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


In [20]:
y_train_powerset = y_train.drop(['tags','tags_list','new_tags','new_first_tag'], axis=1)
y_test_powerset = y_test.drop(['tags','tags_list','new_tags','new_first_tag'], axis=1)

In [35]:
y_train_powerset.head()

Unnamed: 0,写物,劝勉,家庭,快乐,悲苦,政治,朋友,游玩
543,1,0,0,0,1,0,0,0
5181,0,0,0,0,1,1,0,0
2990,1,0,1,0,1,1,0,0
3585,0,0,0,0,1,1,0,1
2312,0,0,0,0,0,1,0,0


In [23]:
# # initialize label powerset multi-label classifier
# classifier = LabelPowerset(LogisticRegression())
# # train
# classifier.fit(X_train_cv, y_train_powerset)
# # predict
# predictions = classifier.predict(X_test_cv)
# # accuracy
# print("Accuracy = ",accuracy_score(y_train_powerset,predictions))
# print("\n")

### One vs Rest

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [25]:
categories = ['写物','劝勉','家庭','快乐','悲苦','政治','朋友','游玩']

In [26]:
%%time

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(X_train_cv, y_train_powerset[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(X_test_cv)
    print('Test accuracy is {}'.format(accuracy_score(y_test_powerset[category], prediction)))
    print("\n")

**Processing 写物 comments...**




Test accuracy is 0.6011904761904762


**Processing 劝勉 comments...**




Test accuracy is 0.7713293650793651


**Processing 家庭 comments...**




Test accuracy is 0.9002976190476191


**Processing 快乐 comments...**




Test accuracy is 0.8586309523809523


**Processing 悲苦 comments...**




Test accuracy is 0.6190476190476191


**Processing 政治 comments...**




Test accuracy is 0.5868055555555556


**Processing 朋友 comments...**




Test accuracy is 0.7901785714285714


**Processing 游玩 comments...**
Test accuracy is 0.810515873015873


CPU times: user 2.29 s, sys: 15.1 ms, total: 2.31 s
Wall time: 2.46 s




In [30]:
%%time

# Using pipeline for applying logistic regression and one vs rest classifier
NB_pipeline = Pipeline([('clf', OneVsRestClassifier(MultinomialNB(), n_jobs=-1))])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    NB_pipeline.fit(X_train_cv, y_train_powerset[category])
    
    # calculating test accuracy
    prediction = NB_pipeline.predict(X_test_cv)
    print('Test accuracy is {}'.format(accuracy_score(y_test_powerset[category], prediction)))
    print("\n")

**Processing 写物 comments...**
Test accuracy is 0.6026785714285714


**Processing 劝勉 comments...**
Test accuracy is 0.7544642857142857


**Processing 家庭 comments...**
Test accuracy is 0.8963293650793651


**Processing 快乐 comments...**
Test accuracy is 0.8521825396825397


**Processing 悲苦 comments...**
Test accuracy is 0.6041666666666666


**Processing 政治 comments...**
Test accuracy is 0.6021825396825397


**Processing 朋友 comments...**
Test accuracy is 0.7703373015873016


**Processing 游玩 comments...**
Test accuracy is 0.7971230158730159


CPU times: user 86.7 ms, sys: 4.1 ms, total: 90.8 ms
Wall time: 96.1 ms


In [33]:
%%time

# Using pipeline for applying logistic regression and one vs rest classifier
SVC_pipeline = Pipeline([('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1))])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    SVC_pipeline.fit(X_train_cv, y_train_powerset[category])
    
    # calculating test accuracy
    prediction = SVC_pipeline.predict(X_test_cv)
    print('Test accuracy is {}'.format(accuracy_score(y_test_powerset[category], prediction)))
    print("\n")

**Processing 写物 comments...**




Test accuracy is 0.560515873015873


**Processing 劝勉 comments...**




Test accuracy is 0.7202380952380952


**Processing 家庭 comments...**
Test accuracy is 0.8893849206349206


**Processing 快乐 comments...**
Test accuracy is 0.8368055555555556


**Processing 悲苦 comments...**
Test accuracy is 0.5868055555555556


**Processing 政治 comments...**




Test accuracy is 0.5709325396825397


**Processing 朋友 comments...**
Test accuracy is 0.7633928571428571


**Processing 游玩 comments...**
Test accuracy is 0.7683531746031746


CPU times: user 2.1 s, sys: 0 ns, total: 2.1 s
Wall time: 2.14 s


In [42]:
%%time

# Using pipeline for applying logistic regression and one vs rest classifier
DT_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(DecisionTreeClassifier(), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    DT_pipeline.fit(X_train_cv, y_train_powerset[category])
    
    # calculating test accuracy
    prediction = DT_pipeline.predict(X_test_cv)
    print('Test accuracy is {}'.format(accuracy_score(y_test_powerset[category], prediction)))
    print("\n")

**Processing 写物 comments...**
Test accuracy is 0.5466269841269841


**Processing 劝勉 comments...**
Test accuracy is 0.6969246031746031


**Processing 家庭 comments...**
Test accuracy is 0.8412698412698413


**Processing 快乐 comments...**
Test accuracy is 0.8065476190476191


**Processing 悲苦 comments...**
Test accuracy is 0.5749007936507936


**Processing 政治 comments...**
Test accuracy is 0.5550595238095238


**Processing 朋友 comments...**
Test accuracy is 0.7341269841269841


**Processing 游玩 comments...**
Test accuracy is 0.7371031746031746


CPU times: user 23.3 s, sys: 32 µs, total: 23.3 s
Wall time: 23.3 s


### Clasification Chain

In [34]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())

# train
classifier.fit(X_train_cv, y_train_powerset)

# predict
predictions = classifier.predict(X_test_cv)

accuracy_score(y_test_powerset,predictions)

0.054563492063492064

In [37]:
# initialize label powerset multi-label classifier
classifier = LabelPowerset(GaussianNB())
# train
classifier.fit(X_train_cv, y_train_powerset)
# predict
predictions = classifier.predict(X_test_cv)


ValueError: Found input variables with inconsistent numbers of samples: [4091, 2016]

In [38]:
# accuracy
print("Accuracy = ",accuracy_score(y_test_powerset,predictions))
print("\n")

Accuracy =  0.06944444444444445




In [40]:
predictions

<1x8 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in List of Lists format>