In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder #get dummy variables for categories
from sklearn.preprocessing import LabelEncoder #get dummy variables for categories  (the pandas get dummies is better)
from imblearn.over_sampling import SMOTE 
from imblearn.over_sampling import ADASYN 
%matplotlib inline

In [4]:
# graphviz and pydotplus are used to visualise the decision tree. Graphviz has to be
# installed both as a Windows program and as a Python package, and its bin folder
# must be on PATH; see
# https://stackoverflow.com/questions/18438997/why-is-pydot-unable-to-find-graphvizs-executables-in-windows-8
import os

import graphviz
import pydotplus
from IPython.display import Image

# Append the Graphviz executables folder to PATH for the current process.
os.environ["PATH"] = os.environ["PATH"] + os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

# 1. Prepare and clean data

In [5]:
# Working folder that holds complete_data.csv; the relative paths used by this
# notebook (the CSV read below, and any files written with relative names) resolve
# against it. The default is the original author's location; set the NLP_RESULT_DIR
# environment variable to run the notebook on another machine without editing this cell.
RESULT_DIR = os.environ.get('NLP_RESULT_DIR', r'D:\NDSUGoogle\Pyntebook\nlp analysis\wangruiziResult')
os.chdir(RESULT_DIR)
data=pd.read_csv("complete_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,season,year,month,rate,0,1,2,3,4,...,20,21,22,23,24,25,26,27,28,29
0,5748,sp,17,Mar,C,0.002381,0.57381,0.002381,0.002381,0.002381,...,0.002381,0.002381,0.002381,0.002381,0.002381,0.359524,0.002381,0.002381,0.002381,0.002381
1,7853,a,16,Nov,D,0.093939,0.366667,0.00303,0.00303,0.093939,...,0.00303,0.00303,0.00303,0.00303,0.00303,0.366667,0.00303,0.00303,0.00303,0.00303
2,20335,sp,12,Mar,B,0.003704,0.337037,0.003704,0.003704,0.225926,...,0.003704,0.003704,0.003704,0.003704,0.003704,0.337037,0.003704,0.003704,0.003704,0.003704
3,906,sp,18,Apr,A,0.001449,0.305797,0.001449,0.001449,0.001449,...,0.001449,0.001449,0.001449,0.001449,0.001449,0.0889,0.001449,0.001449,0.001449,0.001449
4,20401,w,11,Dec,A,0.00303,0.275758,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.00303,0.639394,0.00303,0.00303,0.00303,0.00303


In [6]:
# One-hot encode the season codes: one 0/1 indicator column per distinct value.
# sep='|' is the pandas default, written out explicitly.
dummies = data['season'].str.get_dummies(sep='|')
dummies.head()

Unnamed: 0,a,sm,sp,w
0,0,0,1,0
1,1,0,0,0
2,0,0,1,0
3,0,0,1,0
4,0,0,0,1


In [7]:
# Attach the season indicator columns to the raw table (side by side, aligned on the
# row index) and discard the leftover 'Unnamed: 0' column.
seasondata = pd.concat([data, dummies], axis="columns").drop(columns=['Unnamed: 0'])
seasondata.head()

Unnamed: 0,season,year,month,rate,0,1,2,3,4,5,...,24,25,26,27,28,29,a,sm,sp,w
0,sp,17,Mar,C,0.002381,0.57381,0.002381,0.002381,0.002381,0.002381,...,0.002381,0.359524,0.002381,0.002381,0.002381,0.002381,0,0,1,0
1,a,16,Nov,D,0.093939,0.366667,0.00303,0.00303,0.093939,0.00303,...,0.00303,0.366667,0.00303,0.00303,0.00303,0.00303,1,0,0,0
2,sp,12,Mar,B,0.003704,0.337037,0.003704,0.003704,0.225926,0.003704,...,0.003704,0.337037,0.003704,0.003704,0.003704,0.003704,0,0,1,0
3,sp,18,Apr,A,0.001449,0.305797,0.001449,0.001449,0.001449,0.001449,...,0.001449,0.0889,0.001449,0.001449,0.001449,0.001449,0,0,1,0
4,w,11,Dec,A,0.00303,0.275758,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.639394,0.00303,0.00303,0.00303,0.00303,0,0,0,1


In [8]:
# Keep the rating plus the numbered weight columns; season, year, month and the
# leftover 'Unnamed: 0' column are dropped.
newdata = data.drop(columns=['season', 'Unnamed: 0', 'year', 'month'])
newdata.head()

Unnamed: 0,rate,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,C,0.002381,0.57381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,...,0.002381,0.002381,0.002381,0.002381,0.002381,0.359524,0.002381,0.002381,0.002381,0.002381
1,D,0.093939,0.366667,0.00303,0.00303,0.093939,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.00303,0.366667,0.00303,0.00303,0.00303,0.00303
2,B,0.003704,0.337037,0.003704,0.003704,0.225926,0.003704,0.003704,0.003704,0.003704,...,0.003704,0.003704,0.003704,0.003704,0.003704,0.337037,0.003704,0.003704,0.003704,0.003704
3,A,0.001449,0.305797,0.001449,0.001449,0.001449,0.001449,0.001449,0.001449,0.001449,...,0.001449,0.001449,0.001449,0.001449,0.001449,0.0889,0.001449,0.001449,0.001449,0.001449
4,A,0.00303,0.275758,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.00303,0.639394,0.00303,0.00303,0.00303,0.00303


## Why recode the five rating levels (A–E) into two classes, pos and neg

In [9]:
## Recode the five rating letters into two sentiment classes.
## With five classes the accuracy is low and some classes have very few samples;
## what we care about is positive vs. negative, not "very positive" vs. "very negative".
rate_to_sentiment = {"A": 'pos', "B": 'pos', "C": 'neg', "D": 'neg', "E": 'neg'}
# Any rating not listed in the mapping is left as NaN in rate2.
newdata['rate2'] = data['rate'].map(rate_to_sentiment)
data2 = newdata.drop(columns=['rate'])
data2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,rate2
0,0.002381,0.57381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,...,0.002381,0.002381,0.002381,0.002381,0.359524,0.002381,0.002381,0.002381,0.002381,neg
1,0.093939,0.366667,0.00303,0.00303,0.093939,0.00303,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.366667,0.00303,0.00303,0.00303,0.00303,neg
2,0.003704,0.337037,0.003704,0.003704,0.225926,0.003704,0.003704,0.003704,0.003704,0.003704,...,0.003704,0.003704,0.003704,0.003704,0.337037,0.003704,0.003704,0.003704,0.003704,pos
3,0.001449,0.305797,0.001449,0.001449,0.001449,0.001449,0.001449,0.001449,0.001449,0.001449,...,0.001449,0.001449,0.001449,0.001449,0.0889,0.001449,0.001449,0.001449,0.001449,pos
4,0.00303,0.275758,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,0.00303,...,0.00303,0.00303,0.00303,0.00303,0.639394,0.00303,0.00303,0.00303,0.00303,pos


In [10]:
## Average every weight column within the pos and neg groups to estimate which
## direction each topic leans, then save the resulting table next to the data.
topicDirection = data2.groupby('rate2').mean()
topicDirection.to_csv('topicDirection.csv')

# 2. try random forest to explore data pre-processing

### A. use original data, without smote, accuracy is OK, but recall is too low

In [11]:
# Experiment A: random forest on the unmodified weights, no resampling.
features=data2.drop(['rate2'],axis=1)
label=data2.rate2
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
# Seed the forest so the printed scores are reproducible when the cell is re-run;
# an unseeded forest gives slightly different numbers on every run.
clf = RandomForestClassifier(random_state=42)
clf_fit=clf.fit(X_train, y_train)
y_predict=clf_fit.predict(X_test)
# Accuracy: fraction of test samples whose predicted label equals the true label.
print(accuracy_score(y_test, y_predict))
# Recall per class (average=None returns one value per label): the share of the
# samples truly in a class that the model found. The first value is for neg, the
# second for pos.
print(recall_score(y_test, y_predict, average=None))
# Precision per class, same order: the share of samples predicted as a class that
# really belong to it.
print(precision_score(y_test, y_predict, average=None))




0.8278388278388278
[0.17892977 0.93880469]
[0.33333333 0.86989931]


### B. use original data, with smote to increase sample size in the minority classes(neg), results are better in terms of recall

In [14]:
# Experiment B: same inputs as A, but the training split is rebalanced with SMOTE.
features=data2.drop(['rate2'],axis=1)
label=data2.rate2
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
# The data are imbalanced (many pos, few neg). SMOTE adds synthetic neg samples to the
# training split only, until the minority/majority ratio is 1.
# `sampling_strategy` and `fit_resample` are the current imbalanced-learn names for the
# former `ratio` and `fit_sample`, which recent releases no longer accept.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)
# Seeded so the reported scores are reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf_fit=clf.fit(X_res, y_res)
y_predict=clf_fit.predict(X_test)
# Evaluate the predictions on the untouched test split.
from sklearn.metrics import classification_report, confusion_matrix
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_predict))
print('\n')




=== Confusion Matrix ===
[[ 176  422]
 [ 472 3025]]


=== Classification Report ===
              precision    recall  f1-score   support

         neg       0.27      0.29      0.28       598
         pos       0.88      0.87      0.87      3497

    accuracy                           0.78      4095
   macro avg       0.57      0.58      0.58      4095
weighted avg       0.79      0.78      0.79      4095



0.7816849816849817
[0.29431438 0.86502717]
[0.27160494 0.8775747 ]


### C.1 set values < 0.01 to 0 and use SMOTE: overall accuracy (OA) decreases, but recall for neg increases further

In [16]:
### SMOTE adds synthetic samples (linear combinations of existing ones) so that the
### neg and pos sample counts become roughly equal.
## This is needed because most of our data is positive.
features=data2.drop(['rate2'],axis=1)
# Treat weights below 0.01 as noise and zero them out.
features[features<0.01]=0
label=data2.rate2
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
# `sampling_strategy` and `fit_resample` are the current imbalanced-learn names for the
# former `ratio` and `fit_sample`, which recent releases no longer accept.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

clf = RandomForestClassifier(random_state=42)
clf_fit=clf.fit(X_res, y_res)
y_predict=clf_fit.predict(X_test)
# Evaluate the predictions on the untouched test split.
from sklearn.metrics import classification_report, confusion_matrix
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_predict))
print('\n')



=== Confusion Matrix ===
[[ 210  388]
 [ 667 2830]]


=== Classification Report ===
              precision    recall  f1-score   support

         neg       0.24      0.35      0.28       598
         pos       0.88      0.81      0.84      3497

    accuracy                           0.74      4095
   macro avg       0.56      0.58      0.56      4095
weighted avg       0.79      0.74      0.76      4095





In [17]:
# Per-feature importance scores of the fitted classifier `clf`, one value per
# training column, in column order.
importance=clf.feature_importances_
print(importance)

[0.01626663 0.01401206 0.02289325 0.01449951 0.04851556 0.01449996
 0.01841391 0.01458192 0.01699438 0.0132302  0.02545352 0.04607786
 0.12169141 0.02518668 0.03357851 0.01346844 0.02044328 0.02118503
 0.02848735 0.02028432 0.0169261  0.01339858 0.01672614 0.05691043
 0.0147845  0.10984689 0.02338394 0.08878932 0.01760028 0.09187004]


In [18]:
## Names of the five most important features, ordered from least to most important.
## Use this to judge which features matter.
largest5 = np.argsort(importance)[-5:]
features.columns[largest5].tolist()

['23', '27', '29', '25', '12']

### C.2 set value <0.1 to 0, and use smote, OA decrease, but recall increase

In [25]:
# Experiment C.2: like C.1 but with a coarser threshold of 0.1.
features=data2.drop(['rate2'],axis=1)  # features is every column of data2 except the label
# Zero out weights below 0.1.
features[features<0.1]=0
label=data2.rate2
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
# `sampling_strategy` and `fit_resample` are the current imbalanced-learn names for the
# former `ratio` and `fit_sample`, which recent releases no longer accept.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

clf = RandomForestClassifier(random_state=42)
clf_fit=clf.fit(X_res, y_res)
y_predict=clf_fit.predict(X_test)
# Accuracy, then per-class recall and precision (first value neg, second pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))




0.6935286935286935
[0.44481605 0.73605948]
[0.22371741 0.88575361]



### D. set value <0.01 to 0, and others to 1, use smote, recall is too low (this is equal to binary independent variables)

In [19]:
# Experiment D: binarise the weights (0 below the threshold, 1 otherwise) before SMOTE.
features=data2.drop(['rate2'],axis=1)
features[features<0.01]=0
# Use >= so that a weight of exactly 0.01 is also mapped to 1; with a strict > it
# would stay at 0.01 and the inputs would not be truly binary.
features[features>=0.01]=1
label=data2.rate2
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
# `sampling_strategy` and `fit_resample` are the current imbalanced-learn names for the
# former `ratio` and `fit_sample`, which recent releases no longer accept.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Seeded so the reported scores are reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf_fit=clf.fit(X_res, y_res)
y_predict=clf_fit.predict(X_test)
# Accuracy, then per-class recall and precision (first value neg, second pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))



0.7821733821733822
[0.16722408 0.887332  ]
[0.20242915 0.86170508]


### I think both C.1 and C.2 are acceptable, depending on the focus. I use C.1, and the resulting most important features are ['23', '27', '29', '25', '12']

# 3. Implement decision tree following C.1 for interpretation purpose

In [20]:
# Rebuild the thresholded inputs: drop the label column, then replace every weight
# below 0.01 with 0.
features = data2.drop(columns=['rate2'])
features = features.mask(features < 0.01, 0)
label = data2['rate2']
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.0,0.57381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.359524,0.0,0.0,0.0,0.0
1,0.093939,0.366667,0.0,0.0,0.093939,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.366667,0.0,0.0,0.0,0.0
2,0.0,0.337037,0.0,0.0,0.225926,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.337037,0.0,0.0,0.0,0.0
3,0.0,0.305797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0889,0.0,0.0,0.0,0.0
4,0.0,0.275758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.639394,0.0,0.0,0.0,0.0


In [21]:
# Fit a shallow decision tree on the SMOTE-balanced training split so the splits can
# be read and interpreted.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
# `sampling_strategy` and `fit_resample` are the current imbalanced-learn names for the
# former `ratio` and `fit_sample`, which recent releases no longer accept.
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_res, y_res = sm.fit_resample(X_train, y_train)

## max_depth=4 was chosen after trials among [3,4,5]; the seed keeps the fitted tree
## (and therefore the exported diagram) the same from run to run.
clf=tree.DecisionTreeClassifier(max_depth=4, random_state=42)
clf=clf.fit(X_res, y_res)
y_predict=clf.predict(X_test)
# Accuracy, then per-class recall and precision (first value neg, second pos).
print(accuracy_score(y_test, y_predict))
print(recall_score(y_test, y_predict, average=None))
print(precision_score(y_test, y_predict, average=None))

0.6903540903540903
[0.53846154 0.71632828]
[0.24505327 0.90075512]


In [22]:
# Write the fitted tree as Graphviz DOT text.
# Use http://webgraphviz.com/ to see the tree graphic.
# export_graphviz expects class_names in the same ascending order as clf.classes_
# ('neg' before 'pos' for these labels). Taking the names from the classifier avoids
# the swapped labels that a hard-coded ["pos", "neg"] puts on every node.
with open("decisionTree_classifier.txt", "w") as f:
    tree.export_graphviz(
        clf,
        out_file=f,
        feature_names=list(features),
        class_names=[str(name) for name in clf.classes_],
    )

In [23]:
# Show the feature column names in the order the tree export uses them.
print(features.columns.tolist())

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']


In [None]:
"""the result gini impurity look at how well the group been splitted. If all the group are Pos, then GINI is 0. If the pos and neg 
is 50%,50%, then the gini is 0.5, which is not good.48.97% means the 48.97% chance of a new data point being incorrectly 
classified, based on the observed training data we have at our disposal. 