README: The scripts below used to build the basic pipeline of classification modeling. <br>
 -- updated: scripts add countvectorizer() and tf-idf transformer to build word vectors; also remove non-Chinese characters, stopwords, digits, and punctuations.

In [13]:
import pandas as pd
import os
import re
import glob
import numpy as np
import time
import jieba
import jieba.posseg as pseg
import jieba.analyse
import warnings
warnings.filterwarnings('ignore')
os.chdir('/Users/liyuan/desktop/CSAir')

In [16]:
class Tokenization():
#     def __init__(self, input_data, output_name, stopwords):
    def __init__(self, input_path, output_name, stopwords):
        #self.input = input_data[:]
        self.input_path = input_path
        self.input = []
        self.output_name = str(output_name)
        self.sentences = []
        self.tfidf_score = []
        self.stopwords = stopwords
    
    def load_input_data(self):
        with open(self.input_path, 'r', encoding='utf-8') as input_file:
            self.input +=  input_file.readlines()
            self.input = self.input[:]
        return self.input
        
    def get_tokenized_sents(self):        
        for sent in self.input:
            tokenized_sent = ' '.join(word for word in jieba.cut(sent.strip()) if word not in self.stopwords)
            # remove digits
            tokenized_sent = re.sub(r'\d+','',tokenized_sent)
            # remove punctuation
            tokenized_sent = re.sub(r'[^\w\s]','', tokenized_sent)
            # remove non-chinese characters
            # match all Chinese words
            re_words = re.compile(u"[\u4e00-\u9fa5]+")
            res = re.findall(re_words, tokenized_sent)
            if res:
                valid_tokenized_sent = ' '.join([r for r in res])
            self.sentences.append(valid_tokenized_sent)
        
        with open(self.output_name + '.txt','w',newline='') as output_file:
            for line in self.sentences:
                output_file.write(line + '\n')  
 
        return self.sentences
    
    def get_topN_tf_idf(self, content, topK=20):
        tags = jieba.analyse.extract_tags(content, topK)
        return " ".join(tags)

In [17]:
stopwords = [line.strip() for line in open('Source_Data/stopwords.txt', 'r', encoding='utf-8').readlines()]   
categories = ['中转','出发','到达','售后','性能','机上','行程管理','计划','设计','预订']
for cat in categories:
    # tokenize data for each class
    tok = Tokenization('./Source_Data/CSV_files/'+ cat +'.csv','./Output_Data/output_v2/'+cat,stopwords)
    input_data = tok.load_input_data()
    sampled_reviews_tokenized = tok.get_tokenized_sents()

In [21]:
# combine "output" data files and add encoded label as new column
files= glob.glob('./Output_Data/output_v2/*.txt')
df_lst = []
for f in files:
    label = f.split('/')[-1][:2]
    df = pd.read_csv(f,header=None)
    df['label'] = label
    df_lst.append(df)
labeled_df = pd.concat(df_lst)
print('the whole dataset include %d reviews'%len(labeled_df))
labeled_df = labeled_df.rename(columns = {0:'review_tokens'})

# encode text label into numbers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
targets = le.fit_transform(labeled_df.label)
labeled_df['label_encoded'] = targets
labeled_df.head(10)

the whole dataset include 1700 reviews


Unnamed: 0,review_tokens,label,label_encoded
0,优惠券,预订,9
1,优惠券,预订,9
2,买票 提醒 优惠券,预订,9
3,专享 优惠 入口 机票 预订 选择 添加 请问 情况,预订,9
4,参加 月份 会员 日 活动 抽 中国 往返机票 免票 客服 答复 说 月 日 发放 优惠券 ...,预订,9
5,预定 今日 机票 仓 符合 优惠券 条件,预订,9
6,上次 买 机票 买不到 优惠 感觉 很坑 浪费 时间,预订,9
7,注册 会员 会员 专享 购票 优惠活动 朋友 操作 试过 安卓 手机 苹果 手机 不行 客服...,预订,9
8,连云港 飞 广州 会员 日 一月份 日期 优惠 力度 一点 乘客 谢谢,预订,9
9,公务舱 优惠 研究 明白 南航 查某 航班 价格 携程 查 南航 搞笑,预订,9


In [22]:
# get the data size for each label
# may have duplication between classes
labels = labeled_df.label.unique().tolist()
label_size = {}
for label in labels:
    label_size[label] = len(labeled_df[labeled_df.label == label])
print(label_size)

{'预订': 222, '出发': 367, '设计': 50, '性能': 153, '到达': 151, '行程': 61, '机上': 308, '计划': 39, '中转': 149, '售后': 200}


In [23]:
### train test split data
from sklearn.model_selection import train_test_split
train, test = train_test_split(labeled_df, test_size=0.33, random_state=42)
print('training data has %d examples' %len(train))
print('test data has %d examples' %len(test))

training data has 1139 examples
test data has 561 examples


In [34]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 

def get_word_vectors(data,train,test):
    '''use countvectorizer and tf-idf transformer to get valid one-hot encoding for reviews'''
    # use countVectorizer for one-hot encoding
    count_v0= CountVectorizer();  
    counts_all = count_v0.fit_transform(data['review_tokens'])
    count_v1= CountVectorizer(vocabulary=count_v0.vocabulary_)  
    counts_train = count_v1.fit_transform(train.review_tokens)
    print ("the shape of train word vectors is "+repr(counts_train.shape))

    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_test = count_v2.fit_transform(test.review_tokens)
    print ("the shape of test word vectors is "+repr(counts_test.shape))

    # implement tf-idf
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit(counts_train).transform(counts_train)
    test_data = tfidftransformer.fit(counts_test).transform(counts_test)
    return train_data, test_data

train_data, test_data = get_word_vectors(labeled_df,train,test)

the shape of train word vectors is (1139, 4246)
the shape of test word vectors is (561, 4246)


In [28]:
def get_precision(y_pred, y_test):
    '''this function returns a precision score for the model'''
    num = 0
    y_pred = y_pred.tolist()
    for i,pred in enumerate(y_pred):
        if int(pred) == int(y_test.values[i]):
            num += 1
    precision = float(num) / len(y_pred)
    print('precision: '+'{:.2f}'.format(precision))
    return precision

In [27]:
X_train = train_data
y_train = train.label_encoded
X_test = test_data
y_test = test.label_encoded

In [29]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB  
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# use Naive bayes to build classifier
clf = MultinomialNB(alpha = 0.01)   
clf.fit(X_train, y_train);
y_pred = clf.predict(X_test)

# get classification report
result = classification_report(y_test, y_pred)
print('performance of classifier:')
print(result)

# get average accuracy score across classes
scores = cross_val_score(clf, X_train, y_train, cv=5)
print('scores:',scores)
print('average accuracy score:'+ '{:.2f}'.format(np.average(scores)))

# use precision as evaluation metrics
precision = get_precision(y_pred, y_test)

performance of classifier:
              precision    recall  f1-score   support

           0       0.48      0.48      0.48        50
           1       0.51      0.52      0.52       134
           2       0.52      0.57      0.54        44
           3       0.65      0.60      0.62        67
           4       0.34      0.41      0.37        46
           5       0.64      0.64      0.64        94
           6       0.33      0.33      0.33        21
           7       0.29      0.27      0.28        15
           8       0.28      0.23      0.25        22
           9       0.56      0.51      0.54        68

   micro avg       0.52      0.52      0.52       561
   macro avg       0.46      0.46      0.46       561
weighted avg       0.52      0.52      0.52       561

scores: [0.53246753 0.50649351 0.48908297 0.49115044 0.47747748]
average accuracy score:0.50
precision: 0.52


In [30]:
# use logistic regression as classifier and use grid search to find best parameters
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

parameters = {'penalty':('l1', 'l2'), 'C':[0.1, 1, 10]}
model = LogisticRegression()

# use "f1_weightes" as evaluation metrics (see below more explanation)
clf = GridSearchCV(model, parameters, cv=5, scoring = 'f1_weighted')
clf.fit(X_train, y_train)
print(clf.best_params_)

{'C': 1, 'penalty': 'l1'}




In [31]:
# use the best parameter for logistic regression
model = LogisticRegression(C=1, penalty='l1')
y_pred = clf.predict(X_test)
result = classification_report(y_test, y_pred)
print('performance of classifier:')
print(result)

# use precision as evaluation metrics
precision = get_precision(y_pred, y_test)

performance of classifier:
              precision    recall  f1-score   support

           0       0.65      0.52      0.58        50
           1       0.65      0.60      0.63       134
           2       0.63      0.50      0.56        44
           3       0.65      0.55      0.60        67
           4       0.39      0.24      0.30        46
           5       0.42      0.85      0.56        94
           6       0.64      0.43      0.51        21
           7       1.00      0.20      0.33        15
           8       0.00      0.00      0.00        22
           9       0.58      0.59      0.58        68

   micro avg       0.55      0.55      0.55       561
   macro avg       0.56      0.45      0.47       561
weighted avg       0.56      0.55      0.53       561

precision: 0.55


In [32]:
from sklearn.svm import SVC
# implement linearSVC
model = SVC(kernel='linear')
y_pred = clf.predict(X_test)
result = classification_report(y_test, y_pred)
print('performance of classifier:')
print(result)

# use precision as evaluation metrics
precision = get_precision(y_pred, y_test)

performance of classifier:
              precision    recall  f1-score   support

           0       0.65      0.52      0.58        50
           1       0.65      0.60      0.63       134
           2       0.63      0.50      0.56        44
           3       0.65      0.55      0.60        67
           4       0.39      0.24      0.30        46
           5       0.42      0.85      0.56        94
           6       0.64      0.43      0.51        21
           7       1.00      0.20      0.33        15
           8       0.00      0.00      0.00        22
           9       0.58      0.59      0.58        68

   micro avg       0.55      0.55      0.55       561
   macro avg       0.56      0.45      0.47       561
weighted avg       0.56      0.55      0.53       561

precision: 0.55
