In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN 
%matplotlib inline

In [2]:
# graphviz and pydotplus are used for decision tree visualization, need to install graphviz to both windows system and python
# need to add path to windows environments, refer to this link 
# https://stackoverflow.com/questions/18438997/why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8

from IPython.display import Image 
import graphviz 
import pydotplus
import os
# Make the Graphviz executables discoverable by pydotplus/graphviz on Windows.
# The check keeps the cell idempotent: re-running it must not append the same
# directory to PATH again, and a missing PATH variable must not raise KeyError.
GRAPHVIZ_BIN = 'C:/Program Files (x86)/Graphviz2.38/bin/'
_current_path = os.environ.get("PATH", "")
if GRAPHVIZ_BIN not in _current_path:
    os.environ["PATH"] = _current_path + os.pathsep + GRAPHVIZ_BIN

# 1. Prepare and clean data

In [27]:
# Read the full dataset from the working directory and preview the first rows.
data = pd.read_csv("complete_data.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,season,year,month,rate,0,1,2,3,4,...,20,21,22,23,24,25,26,27,28,29
0,0,sm,18,Jul,D,0.001515,0.04697,0.001515,0.001515,0.001515,...,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.226225,0.001515,0.458624
1,1,sm,18,Jul,B,0.001852,0.001852,0.001852,0.001852,0.001852,...,0.001852,0.001852,0.001852,0.001852,0.001852,0.412335,0.224074,0.001852,0.001852,0.001852
2,2,sm,18,Jul,A,0.0,0.0,0.0,0.0,0.026646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.867718
3,3,sm,18,Jul,C,0.00303,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.093939,0.247035
4,4,sm,18,Jul,C,0.00119,0.00119,0.00119,0.036905,0.00119,...,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.716063


In [28]:
# Integer-encode the season labels; LabelEncoder assigns codes in sorted label order.
label_encoder = LabelEncoder()
season = data['season']
integer_encoded = label_encoder.fit(season).transform(season)
integer_encoded

array([1, 1, 1, ..., 3, 1, 2])

In [29]:
## One-hot (binary) encode the integer season codes.
cat = integer_encoded.reshape(-1, 1)  ## OneHotEncoder must be given a 2D array of shape (n_samples, 1)
enc = OneHotEncoder()
season_cat = enc.fit_transform(cat).toarray()
# Column j of the one-hot matrix corresponds to integer code j, i.e. to
# label_encoder.classes_[j]. Taking the names from the encoder (instead of a
# hard-coded list) keeps the columns correct if the set of seasons changes.
# Reusing data.index guarantees row alignment when concatenating with `data`.
season_group = pd.DataFrame(
    season_cat,
    columns=list(label_encoder.classes_),
    index=data.index,
)
season_group.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,a,sm,sp,w
0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0


In [30]:
# Attach the one-hot season columns to the data, then drop the raw season label,
# the saved index column and the date columns.
seasondata = (
    pd.concat([data, season_group], axis='columns')
    .drop(columns=['season', 'Unnamed: 0', 'year', 'month'])
)
seasondata.head()

Unnamed: 0,rate,0,1,2,3,4,5,6,7,8,...,24,25,26,27,28,29,a,sm,sp,w
0,D,0.001515,0.04697,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,...,0.001515,0.001515,0.001515,0.226225,0.001515,0.458624,0.0,1.0,0.0,0.0
1,B,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,...,0.001852,0.412335,0.224074,0.001852,0.001852,0.001852,0.0,1.0,0.0,0.0
2,A,0.0,0.0,0.0,0.0,0.026646,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.867718,0.0,1.0,0.0,0.0
3,C,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.093939,0.247035,0.0,1.0,0.0,0.0
4,C,0.00119,0.00119,0.00119,0.036905,0.00119,0.00119,0.00119,0.00119,0.00119,...,0.00119,0.00119,0.00119,0.00119,0.00119,0.716063,0.0,1.0,0.0,0.0


In [31]:
# Keep only the rating and the topic-weight columns (no season / date information).
newdata = data.drop(columns=['season', 'Unnamed: 0', 'year', 'month'])
newdata.head()

Unnamed: 0,rate,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,D,0.001515,0.04697,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,...,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.226225,0.001515,0.458624
1,B,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,...,0.001852,0.001852,0.001852,0.001852,0.001852,0.412335,0.224074,0.001852,0.001852,0.001852
2,A,0.0,0.0,0.0,0.0,0.026646,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.867718
3,C,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.093939,0.247035
4,C,0.00119,0.00119,0.00119,0.036905,0.00119,0.00119,0.00119,0.00119,0.00119,...,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.716063


## Why recode the five rating levels (1–5) into pos, neu, neg

In [32]:
## Recode the five rating grades into two sentiment classes.
## With five classes the accuracy is low and some grades have very few samples;
## what we care about is positive vs. negative, not "very positive" vs. "very negative".
newdata['rate2'] = data['rate'].map(
    {'A': 'pos', 'B': 'pos', 'C': 'neg', 'D': 'neg', 'E': 'neg'}
)
data2 = newdata.drop(columns='rate')
data2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,rate2
0,0.001515,0.04697,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,...,0.001515,0.001515,0.001515,0.001515,0.001515,0.001515,0.226225,0.001515,0.458624,neg
1,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,0.001852,...,0.001852,0.001852,0.001852,0.001852,0.412335,0.224074,0.001852,0.001852,0.001852,pos
2,0.0,0.0,0.0,0.0,0.026646,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.867718,pos
3,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.093939,0.247035,neg
4,0.00119,0.00119,0.00119,0.036905,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,...,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.00119,0.716063,neg


In [56]:
## Average weight of every topic within each sentiment class (pos / neg),
## used to estimate which direction each topic leans; saved to CSV.
topicDirection = data2.groupby(by=['rate2']).mean()
topicDirection.to_csv(path_or_buf='topicDirection.csv')

# 2. try random forest to explore data pre-processing

### A. Use the original data without SMOTE: accuracy is OK, but recall for the minority class (neg) is too low

In [33]:
# Experiment A: random forest on the raw topic weights, no resampling.
features = data2.drop(columns='rate2')
label = data2['rate2']
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)
# Seed the forest too: with only the split seeded, the metrics below change on
# every run and cannot be compared with the seeded experiments.
clf = RandomForestClassifier(random_state=42)
clf_fit = clf.fit(X_train, y_train)
y_predict = clf_fit.predict(X_test)
# With average=None the per-class scores come back in sorted label order (neg, pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))



0.8346764346764347
[0.21939587 0.94633583]
[0.42592593 0.86979581]


### B. Use the original data with SMOTE to increase the sample size of the minority class (neg): results are better in terms of recall

In [34]:
# Experiment B: same features as A, but the training split is oversampled with SMOTE.
features = data2.drop(columns='rate2')
label = data2['rate2']
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)
# SMOTE will synthesise more negative (minority) cases until both classes are the
# same size. It is fitted on the training split only, so the test set is untouched.
# `ratio=` / `fit_sample` were removed from imbalanced-learn; `sampling_strategy=` /
# `fit_resample` are the supported equivalents.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)
# Seed the forest so the reported metrics are reproducible.
clf = RandomForestClassifier(random_state=42)
clf_fit = clf.fit(X_res, y_res)
y_predict = clf_fit.predict(X_test)
# With average=None the per-class scores come back in sorted label order (neg, pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))



0.7794871794871795
[0.31796502 0.86324293]
[0.29673591 0.87459807]


### C.1 Set values < 0.01 to 0 and use SMOTE: overall accuracy (OA) decreases, but recall for neg increases further

In [35]:
### SMOTE adds synthetic samples (linear combinations of existing ones) so that the
### neg and pos sample counts become roughly equal,
## because most of our data is positive.
features = data2.drop(columns='rate2')
features[features < 0.01] = 0  # treat very small topic weights as absent
label = data2['rate2']
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)
# `ratio=` / `fit_sample` were removed from imbalanced-learn; `sampling_strategy=` /
# `fit_resample` are the supported equivalents. 1.0 balances the two classes.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

clf = RandomForestClassifier(random_state=42)
clf_fit = clf.fit(X_res, y_res)
y_predict = clf_fit.predict(X_test)
# With average=None the per-class scores come back in sorted label order (neg, pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))



0.7504273504273504
[0.38314785 0.81708021]
[0.27542857 0.87950311]


In [36]:
# Feature importances of the currently fitted `clf`, one value per feature column.
importance = clf.feature_importances_
print(importance)

[0.01705221 0.01375589 0.02102566 0.01627143 0.0550151  0.01536639
 0.01926828 0.01452906 0.01399043 0.01282456 0.02798476 0.03386419
 0.12735638 0.02679686 0.04112131 0.01374588 0.02310897 0.02043821
 0.02993913 0.02377217 0.0167955  0.01304846 0.01474345 0.05261018
 0.01728642 0.11222741 0.02449408 0.07906662 0.01571044 0.08679057]


In [38]:
## Names of the five most important features, listed from least to most important.
## Use this to judge which topics matter.
largest5 = np.argsort(importance)[-5:]
[features.columns[i] for i in largest5]

['4', '27', '29', '25', '12']

### C.2 Set values < 0.1 to 0 and use SMOTE: overall accuracy (OA) decreases, but recall for neg increases

In [39]:
# Experiment C.2: zero out topic weights below 0.1, then SMOTE + random forest.
features = data2.drop(columns='rate2')
features[features < 0.1] = 0
label = data2['rate2']
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)
# `ratio=` / `fit_sample` were removed from imbalanced-learn; `sampling_strategy=` /
# `fit_resample` are the supported equivalents. 1.0 balances the two classes.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

clf = RandomForestClassifier(random_state=42)
clf_fit = clf.fit(X_res, y_res)
y_predict = clf_fit.predict(X_test)
# With average=None the per-class scores come back in sorted label order (neg, pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))



0.6979242979242979
[0.47535771 0.73831506]
[0.24792703 0.88577362]


### D. Set values < 0.01 to 0 and all others to 1 (equivalent to binary independent variables), then use SMOTE: recall for neg is too low

In [40]:
# Experiment D: binarise the topic weights (present / absent), then SMOTE + random forest.
features = data2.drop(columns='rate2')
features[features < 0.01] = 0
# Use >= so that a weight of exactly 0.01 is also mapped to 1; with a strict >
# such a value would stay at 0.01 and the features would not be truly binary.
features[features >= 0.01] = 1
label = data2['rate2']
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)
# `ratio=` / `fit_sample` were removed from imbalanced-learn; `sampling_strategy=` /
# `fit_resample` are the supported equivalents. 1.0 balances the two classes.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Seed the forest so the reported metrics are reproducible.
clf = RandomForestClassifier(random_state=42)
clf_fit = clf.fit(X_res, y_res)
y_predict = clf_fit.predict(X_test)
# With average=None the per-class scores come back in sorted label order (neg, pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))



0.7787545787545788
[0.15580286 0.89180612]
[0.20718816 0.85339591]


### I think both C.1 and C.2 are acceptable, depending on the focus. I use C.1 and the resulting most important features are ['4', '27', '29', '25', '12']

# 3. Implement decision tree following C.1 for interpretation purpose

In [41]:
# Rebuild the feature matrix with topic weights below 0.01 zeroed out.
features = data2.drop(columns='rate2')
features = features.mask(features < 0.01, 0)
label = data2['rate2']
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.0,0.04697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226225,0.0,0.458624
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.412335,0.224074,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.026646,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.867718
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093939,0.247035
4,0.0,0.0,0.0,0.036905,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.716063


In [42]:
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)
# Oversample the training split only. `ratio=` / `fit_sample` were removed from
# imbalanced-learn; `sampling_strategy=` / `fit_resample` are the supported
# equivalents. 1.0 balances the two classes.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

## max_depth=4 was chosen after trials among [3, 4, 5].
## random_state pins the tree so the exported structure and the metrics are reproducible.
clf = tree.DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(X_res, y_res)
y_predict = clf.predict(X_test)
# With average=None the per-class scores come back in sorted label order (neg, pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))

0.7643467643467643
[0.46581876 0.81852279]
[0.31778742 0.89410652]


In [46]:
# Write the fitted tree in Graphviz DOT format.
# Use http://webgraphviz.com/ to see the tree graphic.
with open("decisionTree_classifier.txt", "w") as f:
    tree.export_graphviz(
        clf,
        out_file=f,
        feature_names=list(features),
        # class_names must be given in the same order as clf.classes_ (sorted
        # labels: 'neg' then 'pos'). A hard-coded ["pos", "neg"] would label
        # every node with the opposite class.
        class_names=[str(c) for c in clf.classes_],
    )

In [44]:
# Show the feature (topic) column names in the order used for training.
print(features.columns.tolist())

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']


In [None]:
"""the result gini impurity look at how well the group been splitted. If all the group are Pos, then GINI is 0. If the pos and neg 
is 50%,50%, then the gini is 0.5, which is not good.48.97% means the 48.97% chance of a new data point being incorrectly 
classified, based on the observed training data we have at our disposal. 