README: The scripts below used to build the basic pipeline of classification modeling. More to try include: <br>
 - embedding: try pretrained models
 - add: tf-idf processing
 - modeling: try other modeling methods except for naive bayes; hyperparameter tuning
 

In [61]:
import pandas as pd
import os

import warnings
warnings.filterwarnings('ignore')

import jieba
import jieba.posseg as pseg
import jieba.analyse

import glob
import numpy as np
import time

os.chdir('/Users/liyuan/desktop/CSAir')

In [62]:
'''
combine dataset (multiple categories) into one single category;
add a column called 'label'
'''

files= glob.glob('./output_data/*.txt')

df_lst = []
for f in files:
    label = f.split('/')[-1][:2]
    df = pd.read_csv(f,header=None)
    df['label'] = label
    df_lst.append(df)

all_df = pd.concat(df_lst)
print('the whole dataset include %d reviews'%len(all_df))
all_df = all_df.rename(columns = {0:'review_tokens'})
all_df.head(10)

the whole dataset include 1623 reviews


Unnamed: 0,review_tokens,label
0,11 月 15 日 提前 预订 2018 年 11 月 27 日 长沙 飞往 沈阳 cz3...,出发
1,航班 延误 登机口 升舱 活动 以原 航班 起飞时间 为准 办理 理解,出发
2,重庆 乌鲁木齐 南航 航班 天气 原因 延误 和田 乘坐 天津 航班,出发
3,沿途 停靠 理解 延误 小时,出发
4,飞机 无故 延误 小时 脸,出发
5,延误 五个 小时 算上 值机 时间 机场 八个 小时 早上 晚上 解释 解决方案 机长 人影...,出发
6,cz3842 航班 延误 投诉无门 十点 五十 起飞 下午 三点 弄 飞机 两个 小时 告知...,出发
7,南航 航班 延误 发 短信 太 严谨 回复 改 航班 用户名 密码 我要 变更 航班 做 延...,出发
8,行李 延误 重大损失,出发
9,确认 航班 延误 订 票 显示 确认,出发


In [63]:
# encode text label into numbers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
targets = le.fit_transform(all_df.label)
all_df['label_encoded'] = targets
all_df.head()

Unnamed: 0,review_tokens,label,label_encoded
0,11 月 15 日 提前 预订 2018 年 11 月 27 日 长沙 飞往 沈阳 cz3...,出发,1
1,航班 延误 登机口 升舱 活动 以原 航班 起飞时间 为准 办理 理解,出发,1
2,重庆 乌鲁木齐 南航 航班 天气 原因 延误 和田 乘坐 天津 航班,出发,1
3,沿途 停靠 理解 延误 小时,出发,1
4,飞机 无故 延误 小时 脸,出发,1


In [65]:
# all_df.to_csv('res/all_labeled_data.csv',index = False)

In [67]:
all_df = pd.read_csv('res/all_labeled_data.csv')

In [3]:
# get the data size for each label
labels = all_df.label.unique().tolist()
label_size = {}
for label in labels:
    label_size[label] = len(all_df[all_df.label == label])

print(label_size)

{'出发': 352, '到达': 147, '性能': 148, '售后': 166, '设计': 47, '计划': 38, '机上': 299, '预订': 218, '中转': 147, '行程': 61}


In [16]:
### train test split data
from sklearn.model_selection import train_test_split

train, test = train_test_split(all_df, test_size=0.33, random_state=42)
print('training data has %d examples' %len(train))
print('test data has %d examples' %len(test))

training data has 1087 examples
test data has 536 examples


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 

# use countVectorizer for one-hot encoding
count_v0= CountVectorizer();  
counts_all = count_v0.fit_transform(all_df['review_tokens'])
count_v1= CountVectorizer(vocabulary=count_v0.vocabulary_)  
 
counts_train = count_v1.fit_transform(train.review_tokens)
print ("the shape of train is "+repr(counts_train.shape))

count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_test = count_v2.fit_transform(test.review_tokens)
print ("the shape of test is "+repr(counts_test.shape))

# implement tf-idf
tfidftransformer = TfidfTransformer();    
train_data = tfidftransformer.fit(counts_train).transform(counts_train);
test_data = tfidftransformer.fit(counts_test).transform(counts_test);

the shape of train is (1087, 4633)
the shape of test is (536, 4633)


In [14]:
print(train_data.shape)
print(test_data.shape)

(1087, 4633)
(536, 4633)


In [18]:
X_train = train_data
y_train = train.label_encoded
X_test = test_data
y_test = test.label_encoded

In [51]:
def get_precision(y_pred, y_test):
    '''this function returns a precision score for the model'''
    num = 0
    y_pred = y_pred.tolist()
    for i,pred in enumerate(y_pred):
        if int(pred) == int(y_test.values[i]):
            num += 1
    precision = float(num) / len(y_pred)
    print('precision: '+'{:.2f}'.format(precision))
    return precision

In [52]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB  
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# use Naive bayes to build classifier
clf = MultinomialNB(alpha = 0.01)   
clf.fit(X_train, y_train);
y_pred = clf.predict(X_test)

# get classification report
result = classification_report(y_test, y_pred)
print('performance of classifier:')
print(result)

# get average accuracy score across classes
scores = cross_val_score(clf, X_train, y_train, cv=5)
print('scores:',scores)
print('average accuracy score:'+ '{:.2f}'.format(np.average(scores)))

# use precision as evaluation metrics
precision = get_precision(y_pred, y_test)

performance of classifier:
              precision    recall  f1-score   support

           0       0.42      0.56      0.48        39
           1       0.48      0.50      0.49       116
           2       0.50      0.37      0.43        59
           3       0.52      0.57      0.54        53
           4       0.34      0.27      0.30        56
           5       0.64      0.58      0.61        93
           6       0.40      0.38      0.39        21
           7       0.11      0.12      0.12         8
           8       0.26      0.46      0.33        13
           9       0.49      0.50      0.50        78

   micro avg       0.48      0.48      0.48       536
   macro avg       0.42      0.43      0.42       536
weighted avg       0.48      0.48      0.48       536

scores: [0.47511312 0.47945205 0.48165138 0.48372093 0.48130841]
average accuracy score:0.48
precision: 0.48


In [45]:
# use logistic regression as classifier and use grid search to find best parameters
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

parameters = {'penalty':('l1', 'l2'), 'C':[0.1, 1, 10]}
model = LogisticRegression()

# use "f1_weightes" as evaluation metrics (see below more explanation)
clf = GridSearchCV(model, parameters, cv=5, scoring = 'f1_weighted')
clf.fit(X_train, y_train)
print(clf.best_params_)

{'C': 1, 'penalty': 'l1'}
performance of classifier:
              precision    recall  f1-score   support

           0       0.55      0.54      0.55        39
           1       0.56      0.56      0.56       116
           2       0.75      0.46      0.57        59
           3       0.68      0.51      0.58        53
           4       0.56      0.18      0.27        56
           5       0.44      0.87      0.58        93
           6       0.76      0.62      0.68        21
           7       0.33      0.12      0.18         8
           8       0.62      0.38      0.48        13
           9       0.66      0.62      0.64        78

   micro avg       0.56      0.56      0.56       536
   macro avg       0.59      0.49      0.51       536
weighted avg       0.59      0.56      0.54       536





In [54]:
# use the best parameter for logistic regression
model = LogisticRegression(C=1, penalty='l1')
y_pred = clf.predict(X_test)
result = classification_report(y_test, y_pred)
print('performance of classifier:')
print(result)

# use precision as evaluation metrics
precision = get_precision(y_pred, y_test)

performance of classifier:
              precision    recall  f1-score   support

           0       0.42      0.56      0.48        39
           1       0.48      0.50      0.49       116
           2       0.50      0.37      0.43        59
           3       0.52      0.57      0.54        53
           4       0.34      0.27      0.30        56
           5       0.64      0.58      0.61        93
           6       0.40      0.38      0.39        21
           7       0.11      0.12      0.12         8
           8       0.26      0.46      0.33        13
           9       0.49      0.50      0.50        78

   micro avg       0.48      0.48      0.48       536
   macro avg       0.42      0.43      0.42       536
weighted avg       0.48      0.48      0.48       536

precision: 0.48


In [56]:
from sklearn.svm import SVC

# implement linearSVC
model = SVC(kernel='linear')
y_pred = clf.predict(X_test)
result = classification_report(y_test, y_pred)
print('performance of classifier:')
print(result)

# use precision as evaluation metrics
precision = get_precision(y_pred, y_test)

performance of classifier:
              precision    recall  f1-score   support

           0       0.42      0.56      0.48        39
           1       0.48      0.50      0.49       116
           2       0.50      0.37      0.43        59
           3       0.52      0.57      0.54        53
           4       0.34      0.27      0.30        56
           5       0.64      0.58      0.61        93
           6       0.40      0.38      0.39        21
           7       0.11      0.12      0.12         8
           8       0.26      0.46      0.33        13
           9       0.49      0.50      0.50        78

   micro avg       0.48      0.48      0.48       536
   macro avg       0.42      0.43      0.42       536
weighted avg       0.48      0.48      0.48       536

precision: 0.48
