In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import  matplotlib.pyplot as plt

# 2 Sentiment Analysis
## 2.2 Movie Review Data

Let us first start by looking at the data provided with the exercise. We have positive and negative movie reviews labeled by human readers, all positive and negative reviews are in the ‘pos’ and ‘neg’ folders respectively. If you look in- side a sample file, you will see that these review messages have been ‘tokenized’, where all words are separated from punctuations.
There are approximately 1000 files in each category with files names starting with cv000, cv001, cv002 and so on. You will split the dataset into training set and testing set.

1. Write some code to load the data from text files.

In [3]:
def readRawData():
    import os, sys, glob
    ## reading file from positive
    posPath = "./review_polarity/txt_sentoken/pos/*.txt"
    posFileNames = glob.glob(os.path.abspath(posPath))
    posRawDatas = []
    for filename in posFileNames:
        with open(filename, "r") as f:
#              f.read()
            posRawDatas.append(f.read())

    ## reading file from negative.
    negPath = "./review_polarity/txt_sentoken/neg/*.txt"
    negFileName = glob.glob(os.path.abspath(negPath))
    negRawDatas = []
    for filename in negFileName:
        with open(filename, "r") as f:
            negRawDatas.append(f.read())
    return posRawDatas, negRawDatas

In [4]:
posRawDatas, negRawDatas = readRawData()
allRawDatas = posRawDatas + negRawDatas

# 2.3 TF-IDF
TF-IDF จะนับจำนวนคำศัพย์ที่มันมีอยู่ใน dictionary ของมันเเล้วนับจำนวนคำศัพย์เหล่านั้น จากนั้นนำจำนวนคำศัพย์มาเรียงเป็น vector

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(lowercase = True)
vectorizer = vectorizer.fit(allRawDatas)

In [6]:
def createDataframe(Tfid, pos, neg):
    ## vectorize pos text.
    vectorPos = Tfid.transform(pos).toarray()
    X = vectorPos
    y = [1]*vectorPos.shape[0]

    ## vectorize neg text.
    vectorNeg = Tfid.transform(neg).toarray()
    X = np.vstack((X,vectorNeg))
    y.extend([0]*vectorNeg.shape[0])
    
    y = np.array(y)
    datas = np.hstack((X,y.reshape(-1,1)))
    
    ## create dataframe
    header = vectorizer.get_feature_names() + ['CLASS']
    df = pd.DataFrame(datas, columns = header)

    return df

In [7]:
trainPos = posRawDatas[300:]
testPos = posRawDatas[:300]
trainNeg = negRawDatas[300:]
testNeg = negRawDatas[:300]

In [8]:
dfTrain = createDataframe(vectorizer, trainPos, trainNeg)
dfTrain.head()

Unnamed: 0,00,000,0009f,007,00s,03,04,05,05425,10,...,zulu,zundel,zurg,zus,zweibel,zwick,zwigoff,zycie,zzzzzzz,CLASS
0,0.0,0.014654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
dfTest = createDataframe(vectorizer, testPos, testNeg)
dfTest.head()

Unnamed: 0,00,000,0009f,007,00s,03,04,05,05425,10,...,zulu,zundel,zurg,zus,zweibel,zwick,zwigoff,zycie,zzzzzzz,CLASS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.037716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043521,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.035846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
print dfTrain.shape
print dfTest.shape

(1400, 39660)
(600, 39660)


# 2.4 Classification

Use 4 different models to classify each movie into positive or negative category.

1. K-Nearestneighbormodel,using module `sklearn.neighbors.KNeighborsClassifier`
2. RandomForest, using module `sklearn.ensemble.RandomForestClassifier`
3. SVM, using module `sklearn.svm.SVC`
4. Neural network, using `sklearn.neural_network.MLPClassifier`

You may pick other models you would like to try. Just present results for at least 4 models.
Please provide your code for model fitting and cross validation. Calculate your classification accuracy, precision, and recall.

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_predict

classifier = {"KNN":KNeighborsClassifier(), "SVM":SVC(), "RF":RandomForestClassifier(), "Neural":MLPClassifier()}

## fit
# for n,clf in classifier.items():
#     print n
#     clf.fit(dfTrain.drop(columns="CLASS"), dfTrain["CLASS"])

## predict
for n,clf in classifier.items():
    y_true, y_pred = dfTrain["CLASS"], cross_val_predict(clf, dfTrain.drop(columns="CLASS"), dfTrain["CLASS"], cv=5)
    print n
    print "acc:", accuracy_score(y_true=y_true, y_pred=y_pred)
    print "pre:", precision_score(y_true=y_true, y_pred=y_pred)
    print "rec:", recall_score(y_true=y_true, y_pred=y_pred)  


KNN
acc: 0.561428571429
pre: 0.534018987342
rec: 0.964285714286
RF
acc: 0.652857142857
pre: 0.69384057971
rec: 0.547142857143
SVM
acc: 0.731428571429
pre: 0.671247357294
rec: 0.907142857143
Neural
acc: 0.839285714286
pre: 0.832167832168
rec: 0.85


# 2.5 ทำการทดลอง 3 การทดลองด้วยกัน ได้แก่
    1. ทดลองปรับค่าของ hyper-parameters ของโมเดลทั้ง 4 โมเดล
    2. ทดลองใช้เทคนิค select K best ดดยใช้ chi2 เป็นเกณฑ์ โดยเลือกค่า k ต่างๆมาใช้
    3. ทดลองใช้ค่า k จากการทดลองข้อ 2 เปรียบเทียบผลที่ได้ระหว่างการใช้ chi2 ANOVA F-value และmutual information

## 2.5.1 KNN

#    2.5.1.1 ทดลองปรับค่า hyper-parameters ของ KNN

In [12]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [13]:
## Use select k best for preprocessing.
## เพื่อที่จะลดเวลา train เท่านั้น
K = 100
X = dfTrain.drop(columns="CLASS")
y = dfTrain["CLASS"]
KBest = SelectKBest(score_func=chi2, k=K)
KBest = KBest.fit(X, y)

headerMask = KBest.get_support(True)
selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
print selectedHeader

Index([       u'54',       u'8mm',  u'anaconda',    u'arnold',     u'awful',
             u'bad',    u'batman',      u'bats',      u'bean',     u'bilko',
       ...
             u'war',     u'waste',    u'wasted',       u'wcw',    u'welles',
             u'why',     u'worst', u'wrestling',      u'zeta',     u'CLASS'],
      dtype='object', length=101)


In [14]:
dfTrainKBest = dfTrain[selectedHeader]
dfTrainKBest.head()

Unnamed: 0,54,8mm,anaconda,arnold,awful,bad,batman,bats,bean,bilko,...,war,waste,wasted,wcw,welles,why,worst,wrestling,zeta,CLASS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.020527,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.034795,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027807,0.0,0.0,0.0,1.0


In [15]:
## Search for best parameters
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

parameter = [{'n_neighbors' : [6,7,8,9],
            'weights':['uniform','distance'], 
            'p': [1,2]}]
scoring = {"acc":make_scorer(accuracy_score), 
           "pre":make_scorer(precision_score,average="macro"), 
           "rec":make_scorer(recall_score,average="macro")}

classifier = KNeighborsClassifier( metric = 'minkowski')
grid_search = GridSearchCV( estimator = classifier,
                            param_grid = parameter,
                            scoring = scoring,
                            cv=5,
                            verbose = 1,
                            refit = "acc",
                            return_train_score = True)

In [16]:
grid_search = grid_search.fit( dfTrainKBest.drop(columns="CLASS"),
                                dfTrainKBest["CLASS"])

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.0min finished


In [17]:
print(grid_search.best_params_)

mean_accs = grid_search.cv_results_['mean_test_acc']
mean_pres = grid_search.cv_results_['mean_test_pre']
mean_recs = grid_search.cv_results_['mean_test_rec']

mean_fitTime = grid_search.cv_results_['mean_fit_time']
mean_scrTime = grid_search.cv_results_['mean_score_time']

print("acc\tpre\trec\tftime\tstime")
for params, accu, pre, rec, ftime, stime in zip(grid_search.cv_results_['params'], mean_accs, mean_pres, mean_recs, mean_fitTime, mean_scrTime):
    print("%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\tfor %r"
          % (accu, pre,rec, ftime, stime, params))

{'n_neighbors': 8, 'weights': 'uniform', 'p': 2}
acc	pre	rec	ftime	stime
0.67929	0.68585	0.67929	0.00501	0.16359	for {'n_neighbors': 6, 'weights': 'uniform', 'p': 1}
0.68571	0.70971	0.68571	0.00379	0.16390	for {'n_neighbors': 6, 'weights': 'distance', 'p': 1}
0.71929	0.72340	0.71929	0.00384	0.14768	for {'n_neighbors': 6, 'weights': 'uniform', 'p': 2}
0.71643	0.73371	0.71643	0.00393	0.14551	for {'n_neighbors': 6, 'weights': 'distance', 'p': 2}
0.66071	0.69203	0.66071	0.00367	0.15838	for {'n_neighbors': 7, 'weights': 'uniform', 'p': 1}
0.66286	0.69304	0.66286	0.00368	0.15943	for {'n_neighbors': 7, 'weights': 'distance', 'p': 1}
0.71714	0.74085	0.71714	0.00367	0.14593	for {'n_neighbors': 7, 'weights': 'uniform', 'p': 2}
0.72143	0.74393	0.72143	0.00371	0.14611	for {'n_neighbors': 7, 'weights': 'distance', 'p': 2}
0.68214	0.69511	0.68214	0.00373	0.15963	for {'n_neighbors': 8, 'weights': 'uniform', 'p': 1}
0.67571	0.70737	0.67571	0.00368	0.15984	for {'n_neighbors': 8, 'weights': 'distance', 

# 2.5.1.2 ทดลองใช้ select k best ด้วย k ค่าต่างๆ

In [20]:
## Search best K value.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

K_list = [100, 1000, 10000, 25000]
# K_list = [50, 75, 100, 125, 150]
# K_list = [65, 70, 75, 80, 85]
# K_list = [79,80,81, 83, 85, 87, 89, 25000]
# K_list = [15000, 20000, 25000, 30000]
# K_list = [30000, 35000, 36000]
# K_list = [9500, 10000, 15000]

result = {}
for k in K_list:
    print k
    X = dfTrain.drop(columns="CLASS")
    y = dfTrain["CLASS"]
    KBest = SelectKBest(score_func=chi2, k=k)
    KBest = KBest.fit(X, y)

    headerMask = KBest.get_support(True)
    selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
    
    dfTrainKBest = dfTrain[selectedHeader]
    dfTestKBest = dfTest[selectedHeader]
    
    clf = KNeighborsClassifier( metric = 'minkowski',
                                        n_neighbors = 8,
                                        weights = 'uniform',
                                        p = 2)
    clf.fit(dfTrainKBest.drop(columns="CLASS"), dfTrainKBest["CLASS"])
    y_true, y_pred = dfTestKBest["CLASS"], clf.predict(dfTestKBest.drop(columns="CLASS"))
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    result[k] = (acc, pre, rec)
print "finish"

100
1000
10000
25000
finish


In [21]:
result

{100: (0.65833333333333333, 0.63848396501457727, 0.72999999999999998),
 1000: (0.65166666666666662, 0.62264150943396224, 0.77000000000000002),
 10000: (0.60833333333333328, 0.57869249394673128, 0.79666666666666663),
 25000: (0.61833333333333329, 0.74149659863945583, 0.36333333333333334)}

# 2.5.1.3 ทดลองใช้ chi2 ANOVA F-value และmutual information กับselect k best

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

k = 1000

X = dfTrain.drop(columns="CLASS")
y = dfTrain["CLASS"]

KChi2 = SelectKBest(score_func=chi2, k=k)

KFClass = SelectKBest(score_func=f_classif, k=k)

KMut = SelectKBest(score_func=mutual_info_classif, k=k)

Kbest = {"chi2":KChi2, "f_classif":KFClass, "mutual_info_classif":KMut}

In [19]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

for n,v in Kbest.items():
    v.fit(X,y)
    
    headerMask = v.get_support(True)
    selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
    
    clf = KNeighborsClassifier( metric = 'minkowski',
                                        n_neighbors = 8,
                                        weights = 'uniform',
                                        p = 2)

    dfTrainK = dfTrain[selectedHeader]
    y_true, y_pred = dfTrainKChi2["CLASS"], cross_val_predict(clf, dfTrainK.drop(columns="CLASS"), dfTrainK["CLASS"], cv=5)

    print n
    print "acc:", accuracy_score(y_true=y_true, y_pred=y_pred)
    print "pre:", precision_score(y_true=y_true, y_pred=y_pred)
    print "rec:", recall_score(y_true=y_true, y_pred=y_pred)  

 chi2
acc: 0.712142857143
pre: 0.672473867596
rec: 0.827142857143
mutual_info_classif
acc: 0.611428571429
pre: 0.575435203095
rec: 0.85
f_classif
acc: 0.727857142857
pre: 0.69618696187
rec: 0.808571428571


# 2.5.1 Random Forrest Classifier

In [22]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [23]:
## Search for best parameters
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

parameter = [{'n_estimators' : [50,100,200],
              'min_samples_split':[0.0001,0.001,0.01,int(2)],
              'max_depth' :[10,20,30,None]}]
scoring = {"acc":make_scorer(accuracy_score), 
           "pre":make_scorer(precision_score,average="macro"), 
           "rec":make_scorer(recall_score,average="macro")}

classifier = RandomForestClassifier(criterion = 'gini', random_state = 0, verbose = 0)
grid_search = GridSearchCV( estimator = classifier,
                            param_grid = parameter,
                            scoring = scoring,
                            cv=5,
                            verbose = 1,
                            refit = "acc",
                            return_train_score = True)

In [24]:
grid_search = grid_search.fit( dfTrain.drop(columns="CLASS"),
                                dfTrain["CLASS"])

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 24.9min finished


In [25]:
print(grid_search.best_params_)

mean_accs = grid_search.cv_results_['mean_test_acc']
mean_pres = grid_search.cv_results_['mean_test_pre']
mean_recs = grid_search.cv_results_['mean_test_rec']

mean_fitTime = grid_search.cv_results_['mean_fit_time']
mean_scrTime = grid_search.cv_results_['mean_score_time']

print("acc\tpre\trec\tftime\tstime")
for params, accu, pre, rec, ftime, stime in zip(grid_search.cv_results_['params'], mean_accs, mean_pres, mean_recs, mean_fitTime, mean_scrTime):
    print("%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\tfor %r"
          % (accu, pre,rec, ftime, stime, params))

{'min_samples_split': 0.01, 'n_estimators': 200, 'max_depth': 20}
acc	pre	rec	ftime	stime
0.75143	0.75369	0.75143	1.85039	0.11528	for {'min_samples_split': 0.0001, 'n_estimators': 50, 'max_depth': 10}
0.78357	0.78551	0.78357	3.43273	0.11493	for {'min_samples_split': 0.0001, 'n_estimators': 100, 'max_depth': 10}
0.79929	0.80170	0.79929	6.48234	0.15368	for {'min_samples_split': 0.0001, 'n_estimators': 200, 'max_depth': 10}
0.75143	0.75369	0.75143	1.90759	0.10899	for {'min_samples_split': 0.001, 'n_estimators': 50, 'max_depth': 10}
0.78357	0.78551	0.78357	3.45328	0.13970	for {'min_samples_split': 0.001, 'n_estimators': 100, 'max_depth': 10}
0.79929	0.80170	0.79929	6.34280	0.18573	for {'min_samples_split': 0.001, 'n_estimators': 200, 'max_depth': 10}
0.75143	0.75278	0.75143	1.82160	0.12366	for {'min_samples_split': 0.01, 'n_estimators': 50, 'max_depth': 10}
0.78357	0.78441	0.78357	3.32449	0.12171	for {'min_samples_split': 0.01, 'n_estimators': 100, 'max_depth': 10}
0.79857	0.79987	0.79857	

In [26]:
## Search best K value.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

K_list = [100, 1000, 10000, 25000]
# K_list = [50, 75, 100, 125, 150]
# K_list = [65, 70, 75, 80, 85]
# K_list = [79,80,81, 83, 85, 87, 89, 25000]
# K_list = [15000, 20000, 25000, 30000]
# K_list = [30000, 35000, 36000]
K_list = [9800, 9900, 10000, 11000, 12000]

result = {}
for k in K_list:
    print k
    X = dfTrain.drop(columns="CLASS")
    y = dfTrain["CLASS"]
    KBest = SelectKBest(score_func=chi2, k=k)
    KBest = KBest.fit(X, y)

    headerMask = KBest.get_support(True)
    selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
    
    dfTrainKBest = dfTrain[selectedHeader]
    dfTestKBest = dfTest[selectedHeader]
    
    clf =  RandomForestClassifier(criterion = 'gini', 
                                  min_samples_split = 0.01,
                                  n_estimators = 200,
                                  max_depth = 20,
                                  random_state = 0, verbose = 0)
    clf.fit(dfTrainKBest.drop(columns="CLASS"), dfTrainKBest["CLASS"])
    
    y_true, y_pred = dfTestKBest["CLASS"], clf.predict(dfTestKBest.drop(columns="CLASS"))
    
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    result[k] = (acc, rec, pre)
print "finish"

9800
9900
10000
11000
12000
finish


In [27]:
result

{9800: (0.82666666666666666, 0.79333333333333333, 0.84999999999999998),
 9900: (0.83666666666666667, 0.81000000000000005, 0.85563380281690138),
 10000: (0.84166666666666667, 0.81999999999999995, 0.8571428571428571),
 11000: (0.84166666666666667, 0.80333333333333334, 0.87003610108303253),
 12000: (0.82666666666666666, 0.79333333333333333, 0.84999999999999998)}

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

k = 1000

X = dfTrain.drop(columns="CLASS")
y = dfTrain["CLASS"]

KChi2 = SelectKBest(score_func=chi2, k=k)

KFClass = SelectKBest(score_func=f_classif, k=k)

KMut = SelectKBest(score_func=mutual_info_classif, k=k)

Kbest = {"chi2":KChi2, "f_classif":KFClass, "mutual_info_classif":KMut}

In [23]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

for n,v in Kbest.items():
    v.fit(X,y)
    
    headerMask = v.get_support(True)
    selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
    
    clf =  RandomForestClassifier(criterion = 'gini', 
                                  min_samples_split = 0.01,
                                  n_estimators = 200,
                                  max_depth = 20,
                                  random_state = 0, verbose = 0)

    dfTrainK = dfTrain[selectedHeader]
    y_true, y_pred = dfTrainKChi2["CLASS"], cross_val_predict(clf, dfTrainK.drop(columns="CLASS"), dfTrainK["CLASS"], cv=5)

    print n
    print "acc:", accuracy_score(y_true=y_true, y_pred=y_pred)
    print "pre:", precision_score(y_true=y_true, y_pred=y_pred)
    print "rec:", recall_score(y_true=y_true, y_pred=y_pred) 

chi2
acc: 0.837857142857
pre: 0.843251088534
rec: 0.83
mutual_info_classif
acc: 0.740714285714
pre: 0.744557329463
rec: 0.732857142857
f_classif
acc: 0.826428571429
pre: 0.842578710645
rec: 0.802857142857


# 2.5.1 SVC

In [48]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## Use select k best for preprocessing.
## เพื่อที่จะลดเวลา train เท่านั้น
K = 100
X = dfTrain.drop(columns="CLASS")
y = dfTrain["CLASS"]
KBest = SelectKBest(score_func=chi2, k=K)
KBest = KBest.fit(X, y)

headerMask = KBest.get_support(True)
selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
print selectedHeader

Index([       u'54',       u'8mm',  u'anaconda',    u'arnold',     u'awful',
             u'bad',    u'batman',      u'bats',      u'bean',     u'bilko',
       ...
             u'war',     u'waste',    u'wasted',       u'wcw',    u'welles',
             u'why',     u'worst', u'wrestling',      u'zeta',     u'CLASS'],
      dtype='object', length=101)


In [49]:
dfTrainKBest = dfTrain[selectedHeader]
dfTrainKBest.head()

Unnamed: 0,54,8mm,anaconda,arnold,awful,bad,batman,bats,bean,bilko,...,war,waste,wasted,wcw,welles,why,worst,wrestling,zeta,CLASS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.020527,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.034795,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027807,0.0,0.0,0.0,1.0


In [46]:
## Search for best parameters
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

parameter = [{	'C' : [1,10,100,1000],
                'kernel':['rbf'], 
                'gamma': [0.1,0.01,0.001,0.0001 ]}]
scoring = {"acc":make_scorer(accuracy_score), 
           "pre":make_scorer(precision_score,average="macro"), 
           "rec":make_scorer(recall_score,average="macro")}
    
classifier = SVC(decision_function_shape='ovo', verbose=1,random_state = 0)
grid_search = GridSearchCV( estimator = classifier,
                            param_grid = parameter,
                            scoring = scoring,
                            cv=5,
                            verbose = 1,
                            refit = "acc",
                            return_train_score = True)

In [50]:
grid_search = grid_search.fit( dfTrainKBest.drop(columns="CLASS"),
                                dfTrainKBest["CLASS"])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.3min finished


[LibSVM]

In [51]:
print(grid_search.best_params_)

mean_accs = grid_search.cv_results_['mean_test_acc']
mean_pres = grid_search.cv_results_['mean_test_pre']
mean_recs = grid_search.cv_results_['mean_test_rec']

mean_fitTime = grid_search.cv_results_['mean_fit_time']
mean_scrTime = grid_search.cv_results_['mean_score_time']

print("acc\tpre\trec\tftime\tstime")
for params, accu, pre, rec, ftime, stime in zip(grid_search.cv_results_['params'], mean_accs, mean_pres, mean_recs, mean_fitTime, mean_scrTime):
    print("%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\tfor %r"
          % (accu, pre,rec, ftime, stime, params))

{'kernel': 'rbf', 'C': 1000, 'gamma': 0.01}
acc	pre	rec	ftime	stime
0.71643	0.74681	0.71643	0.21796	0.15317	for {'kernel': 'rbf', 'C': 1, 'gamma': 0.1}
0.73071	0.75512	0.73071	0.22111	0.15591	for {'kernel': 'rbf', 'C': 1, 'gamma': 0.01}
0.73357	0.75680	0.73357	0.22183	0.15633	for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.73286	0.75612	0.73286	0.22394	0.15735	for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.73929	0.76818	0.73929	0.20693	0.14510	for {'kernel': 'rbf', 'C': 10, 'gamma': 0.1}
0.73071	0.75512	0.73071	0.22655	0.15971	for {'kernel': 'rbf', 'C': 10, 'gamma': 0.01}
0.73357	0.75680	0.73357	0.23235	0.16339	for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.73286	0.75612	0.73286	0.22999	0.16139	for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.79357	0.80395	0.79357	0.16005	0.10660	for {'kernel': 'rbf', 'C': 100, 'gamma': 0.1}
0.74500	0.77004	0.74500	0.21645	0.14904	for {'kernel': 'rbf', 'C': 100, 'gamma': 0.01}
0.73357	0.75680	0.73357	0.22909	0.16079	for {'kernel': 'rbf', 'C': 100,

In [52]:
## Search best K value.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

K_list = [100, 1000, 10000, 25000]
# K_list = [50, 75, 100, 125, 150]
# K_list = [65, 70, 75, 80, 85]
# K_list = [79,80,81, 83, 85, 87, 89, 25000]
# K_list = [15000, 20000, 25000, 30000]
# K_list = [30000, 35000, 36000]
# K_list = [9500, 10000, 15000]

result = {}
for k in K_list:
    print k
    X = dfTrain.drop(columns="CLASS")
    y = dfTrain["CLASS"]
    KBest = SelectKBest(score_func=chi2, k=k)
    KBest = KBest.fit(X, y)

    headerMask = KBest.get_support(True)
    selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
    
    dfTrainKBest = dfTrain[selectedHeader]
    dfTestKBest = dfTest[selectedHeader]
    
    clf = SVC(decision_function_shape='ovo', kernel="rbf", C=1000, gamma=0.01, verbose=0, random_state = 0)
    clf.fit(dfTrainKBest.drop(columns="CLASS"), dfTrainKBest["CLASS"])
    
    y_true, y_pred = dfTestKBest["CLASS"], clf.predict(dfTestKBest.drop(columns="CLASS"))
    
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    result[k] = (acc, rec, pre)
print "finish"

100
[LibSVM]1000
[LibSVM]10000
[LibSVM]25000
[LibSVM]finish


In [53]:
result

{100: (0.78500000000000003, 0.84999999999999998, 0.75221238938053092),
 1000: (0.82999999999999996, 0.82999999999999996, 0.82999999999999996),
 10000: (0.82333333333333336, 0.83666666666666667, 0.81493506493506496),
 25000: (0.83833333333333337, 0.82333333333333336, 0.84879725085910651)}

# 2.5.1 neural_network MLPClassifier

In [18]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## Use select k best for preprocessing.
## เพื่อที่จะลดเวลา train เท่านั้น
K = 100
X = dfTrain.drop(columns="CLASS")
y = dfTrain["CLASS"]
KBest = SelectKBest(score_func=chi2, k=K)
KBest = KBest.fit(X, y)

headerMask = KBest.get_support(True)
selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
print selectedHeader

Index([       u'54',       u'8mm',  u'anaconda',    u'arnold',     u'awful',
             u'bad',    u'batman',      u'bats',      u'bean',     u'bilko',
       ...
             u'war',     u'waste',    u'wasted',       u'wcw',    u'welles',
             u'why',     u'worst', u'wrestling',      u'zeta',     u'CLASS'],
      dtype='object', length=101)


In [19]:
dfTrainKBest = dfTrain[selectedHeader]
dfTrainKBest.head()

Unnamed: 0,54,8mm,anaconda,arnold,awful,bad,batman,bats,bean,bilko,...,war,waste,wasted,wcw,welles,why,worst,wrestling,zeta,CLASS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.020527,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.034795,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027807,0.0,0.0,0.0,1.0


In [20]:
## Search for best parameters
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

parameter = [{	'activation' : ["identity", "logistic", "tanh", "relu"],
                'hidden_layer_sizes':[(100,), (50,50), (25,25,25,25)]}]
scoring = {"acc":make_scorer(accuracy_score), 
           "pre":make_scorer(precision_score,average="macro"), 
           "rec":make_scorer(recall_score,average="macro")}
    
classifier = MLPClassifier()
grid_search = GridSearchCV( estimator = classifier,
                            param_grid = parameter,
                            scoring = scoring,
                            cv=5,
                            verbose = 1,
                            refit = "acc",
                            return_train_score = True)

In [21]:
grid_search = grid_search.fit( dfTrainKBest.drop(columns="CLASS"),
                                dfTrainKBest["CLASS"])

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   52.6s finished


In [22]:
print(grid_search.best_params_)

mean_accs = grid_search.cv_results_['mean_test_acc']
mean_pres = grid_search.cv_results_['mean_test_pre']
mean_recs = grid_search.cv_results_['mean_test_rec']

mean_fitTime = grid_search.cv_results_['mean_fit_time']
mean_scrTime = grid_search.cv_results_['mean_score_time']

print("acc\tpre\trec\tftime\tstime")
for params, accu, pre, rec, ftime, stime in zip(grid_search.cv_results_['params'], mean_accs, mean_pres, mean_recs, mean_fitTime, mean_scrTime):
    print("%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\tfor %r"
          % (accu, pre,rec, ftime, stime, params))

{'activation': 'identity', 'hidden_layer_sizes': (50, 50)}
acc	pre	rec	ftime	stime
0.80143	0.80590	0.80143	1.21064	0.00329	for {'activation': 'identity', 'hidden_layer_sizes': (100,)}
0.80571	0.80736	0.80571	0.46644	0.00326	for {'activation': 'identity', 'hidden_layer_sizes': (50, 50)}
0.79857	0.80231	0.79857	0.29030	0.00248	for {'activation': 'identity', 'hidden_layer_sizes': (25, 25, 25, 25)}
0.80000	0.80425	0.80000	1.96805	0.00609	for {'activation': 'logistic', 'hidden_layer_sizes': (100,)}
0.56500	0.36503	0.56500	0.52919	0.00725	for {'activation': 'logistic', 'hidden_layer_sizes': (50, 50)}
0.50000	0.25000	0.50000	0.06241	0.00405	for {'activation': 'logistic', 'hidden_layer_sizes': (25, 25, 25, 25)}
0.79929	0.80294	0.79929	1.55371	0.00517	for {'activation': 'tanh', 'hidden_layer_sizes': (100,)}
0.80071	0.80521	0.80071	0.68696	0.00458	for {'activation': 'tanh', 'hidden_layer_sizes': (50, 50)}
0.80429	0.80901	0.80429	0.46747	0.00571	for {'activation': 'tanh', 'hidden_layer_sizes': (2

In [23]:
## Search best K value.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

K_list = [100, 1000, 10000, 25000]
# K_list = [50, 75, 100, 125, 150]
# K_list = [65, 70, 75, 80, 85]
# K_list = [79,80,81, 83, 85, 87, 89, 25000]
# K_list = [15000, 20000, 25000, 30000]
# K_list = [30000, 35000, 36000]
# K_list = [9500, 10000, 15000]

result = {}
for k in K_list:
    print k
    X = dfTrain.drop(columns="CLASS")
    y = dfTrain["CLASS"]
    KBest = SelectKBest(score_func=chi2, k=k)
    KBest = KBest.fit(X, y)

    headerMask = KBest.get_support(True)
    selectedHeader = dfTrain.columns[headerMask].append(pd.Index(["CLASS"]))
    
    dfTrainKBest = dfTrain[selectedHeader]
    dfTestKBest = dfTest[selectedHeader]
    
    clf = MLPClassifier(activation='identity', hidden_layer_sizes=(50, 50))
    clf.fit(dfTrainKBest.drop(columns="CLASS"), dfTrainKBest["CLASS"])
    
    y_true, y_pred = dfTestKBest["CLASS"], clf.predict(dfTestKBest.drop(columns="CLASS"))
    
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    result[k] = (acc, rec, pre)
print "finish"

100
1000
10000
25000
finish


In [24]:
result

{100: (0.78333333333333333, 0.81333333333333335, 0.76729559748427678),
 1000: (0.82166666666666666, 0.81000000000000005, 0.82935153583617749),
 10000: (0.83833333333333337, 0.82999999999999996, 0.84406779661016951),
 25000: (0.82166666666666666, 0.81666666666666665, 0.82491582491582494)}