README: The scripts below used to build the basic pipeline of classification modeling. <br>
 -- updated: scripts add countvectorizer() and tf-idf transformer to build word vectors; also remove non-Chinese characters, stopwords, digits, and punctuations.

In [67]:
import pandas as pd
import os
import re
import glob
import numpy as np
import time
import jieba
import jieba.posseg as pseg
import jieba.analyse
import warnings
warnings.filterwarnings('ignore')
os.chdir('/Users/liyuan/desktop/CSAir/codes')

from tokenization import Tokenization

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 

class PrepareData():
    def __init__(self):
        self.data = pd.DataFrame()
        self.train = pd.DataFrame()
        self.test = pd.DataFrame()

    def load_data(self, file_path):
        self.data = pd.read_csv(file_path)
        return self.data
    
    def split_data(self):
        self.train, self.test = train_test_split(self.data, test_size = 0.33, random_state=42)
        print('training data has %d examples' %len(self.train))
        print('test data has %d examples' %len(self.test))
        return self.train, self.test
    
    def preprocess_data(self):
        '''use countvectorizer and tf-idf transformer to get valid one-hot encoding for reviews'''
        # use countVectorizer for one-hot encoding
        count_v0= CountVectorizer();  
        counts_all = count_v0.fit_transform(self.data['review_tokens'])
        count_v1= CountVectorizer(vocabulary=count_v0.vocabulary_)  
        counts_train = count_v1.fit_transform(self.train.review_tokens)
        print ("the shape of train word vectors is "+repr(counts_train.shape))

        count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
        counts_test = count_v2.fit_transform(self.test.review_tokens)
        print ("the shape of test word vectors is "+repr(counts_test.shape))

        # implement tf-idf
        tfidftransformer = TfidfTransformer()
        train_data = tfidftransformer.fit(counts_train).transform(counts_train)
        test_data = tfidftransformer.fit(counts_test).transform(counts_test)
        
        X_train = train_data
        y_train = self.train.label_encoded
        X_test = test_data
        y_test = self.test.label_encoded
        return X_train, y_train, X_test, y_test
    
    def get_precision(self,y_pred, y_test):
        '''this function returns a precision score for the model'''
        num = 0
        y_pred = y_pred.tolist()
        for i,pred in enumerate(y_pred):
            if int(pred) == int(y_test.values[i]):
                num += 1
        precision = float(num) / len(y_pred)
        print('precision: '+'{:.2f}'.format(precision))
        return precision
   

In [70]:
data_p = PrepareData()
data_p.load_data('./res/all_labeled_data.csv')
train, test = data_p.split_data()
X_train, y_train, X_test, y_test = data_p.preprocess_data()

training data has 1139 examples
test data has 561 examples
the shape of train word vectors is (1139, 4246)
the shape of test word vectors is (561, 4246)


In [64]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB  
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

class Modeling():
    def __init__(self,X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        
    def get_precision(self,y_pred):
        '''this function returns a precision score for the model'''
        num = 0
        y_pred = y_pred.tolist()
        for i,pred in enumerate(y_pred):
            if int(pred) == int(self.y_test.values[i]):
                num += 1
        precision = float(num) / len(y_pred)
        #print('precision: '+'{:.2f}'.format(precision))
        return precision
    
    def get_clf_result(self,model):
        clf = model   
        clf.fit(self.X_train, self.y_train);
        y_pred = clf.predict(self.X_test)
        result = classification_report(self.y_test, y_pred)
        print('performance of classifier:')
        print(result)
        
        # get average accuracy score across classes
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        print('accuracy scores:',scores)
        print('average accuracy score:'+ '{:.2f}'.format(np.average(scores)))

        # use precision as evaluation metrics
        precision = self.get_precision(y_pred)
        return precision
    
    def grid_search(self,model, parameters):
        # use "f1_weightes" as evaluation metrics
        clf = GridSearchCV(model, parameters, cv=5, scoring = 'f1_weighted')
        clf.fit(self.X_train, self.y_train)
        print('best parameters of clf are: ')
        return clf.best_params_

In [71]:
m = Modeling(X_train, y_train, X_test, y_test)
print('precision of clf: {:.3f}'.format(m.get_clf_result(MultinomialNB(alpha = 0.01))))
print('=========================================================')
print('precision of clf: {:.3f}'.format(m.get_clf_result(LogisticRegression(C=1, penalty='l1'))))
print('=========================================================')
print('precision of clf: {:.3f}'.format(m.get_clf_result(SVC(kernel='linear'))))
print('=========================================================')
print('try GridSearch ...')
model = LogisticRegression()
parameters = {'penalty':('l1', 'l2'), 'C':[0.1, 1, 10]}
m.grid_search(model, parameters)

performance of classifier:
              precision    recall  f1-score   support

           0       0.48      0.48      0.48        50
           1       0.51      0.52      0.52       134
           2       0.52      0.57      0.54        44
           3       0.65      0.60      0.62        67
           4       0.34      0.41      0.37        46
           5       0.64      0.64      0.64        94
           6       0.33      0.33      0.33        21
           7       0.29      0.27      0.28        15
           8       0.28      0.23      0.25        22
           9       0.56      0.51      0.54        68

   micro avg       0.52      0.52      0.52       561
   macro avg       0.46      0.46      0.46       561
weighted avg       0.52      0.52      0.52       561

accuracy scores: [0.53246753 0.50649351 0.48908297 0.49115044 0.47747748]
average accuracy score:0.50
precision of clf: 0.515
performance of classifier:
              precision    recall  f1-score   support

      

{'C': 1, 'penalty': 'l1'}