## Goal of this notebook is to classify review into correct categories, so this is a classification problem

In [1]:
import pandas as pd 
import numpy as np
import os
from srs.utilities import Sentence,loadUsefulTrainingData,loadTrainingData,tokenize
from srs.maxEntropyModel import loadWordListDict
from sklearn.metrics import accuracy_score,classification_report

In [2]:
static_traning_data_dir = os.path.abspath('../srs/static_training_data/')
sentences = loadUsefulTrainingData(static_traning_data_dir)

In [3]:
for s in sentences:
    s.tokens = tokenize(s.content)

In [4]:
wordlist_dict_path = "../srs/predictor_data/wordlist_dict_1.txt"
feature_dict = loadWordListDict(wordlist_dict_path)
feature_list = []
for k in feature_dict.keys():
    feature_list = feature_list + feature_dict[k]
feature_list = list(set(feature_list)) #remove duplicates if any
print feature_list

[u'shot', u'color', u'fit', u'cheap', u'focus', u'bulki', u'design', u'batteri', u'simpli', u'size', u'use', u'detect', u'easi', u'pictur', u'screen', u'carri', u'big', u'simpl', u'price', u'smart', u'mode', u'beatiful', u'afford', u'auto', u'imag', u'qualiti', u'len', u'cheaper', u'pocket', u'pretti', u'charger', u'clear', u'zoom', u'easili', u'expens', u'small', u'video', u'nice', u'display']


In [5]:
s1 = sentences[1]
present_list = np.zeros(len(feature_list))
for token in s1.tokens:
    if token in feature_list:
        idx = feature_list.index(token)
        present_list[idx] += 1
print present_list

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  2.
  0.  0.  0.]


### Create Panda framework

In [6]:
df = pd.DataFrame(columns=feature_list)
target = pd.DataFrame(columns=['Prod_Feat'])
for s in sentences:
    count_list = np.zeros(len(feature_list))
    for token in s.tokens:
        if token in feature_list:
            idx = feature_list.index(token)
            count_list[idx] += 1
    row = pd.DataFrame([count_list],columns =feature_list)
    df = df.append(row,ignore_index=True)
    row_target = pd.DataFrame([s.labeled_aspects],columns=['Prod_Feat'])
    target = target.append(row_target,ignore_index=True)

In [7]:
df.head(5)

Unnamed: 0,shot,color,fit,cheap,focus,bulki,design,batteri,simpli,size,...,pretti,charger,clear,zoom,easili,expens,small,video,nice,display
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
target.head(5)

Unnamed: 0,Prod_Feat
0,pictures
1,size
2,battery
3,battery
4,battery


In [33]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.index, target, test_size=0.25, random_state=42)
X_train  = df.iloc[X_train]
X_test = df.iloc[X_test]

### Decision tree model construction, prediction, and visualization

In [99]:
from sklearn import tree
# http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
model = tree.DecisionTreeClassifier(min_samples_leaf=5,max_depth=7)
model = model.fit(X_train, y_train)

Prediction and Analysis 

In [100]:
y_predicted = model.predict(X_test)
accuracy_score(y_test,y_predicted)

0.44571428571428573

In [36]:
target_names = target.Prod_Feat.unique()
print(classification_report(y_test, y_predicted, target_names=target_names))

             precision    recall  f1-score   support

   pictures       0.75      0.90      0.82        10
       size       0.00      0.00      0.00         8
    battery       0.00      0.00      0.00         6
     screen       0.38      0.42      0.40        19
       zoom       0.61      0.48      0.53        48
ease of use       0.00      0.00      0.00        10
  detection       0.00      0.00      0.00         3
    quality       0.71      0.71      0.71        17
      price       0.24      0.76      0.36        21
     design       0.00      0.00      0.00         6
      video       0.53      0.37      0.43        27

avg / total       0.43      0.45      0.42       175



  'precision', 'predicted', average, warn_for)


In [37]:
import pydot 
from IPython.display import Image  
from sklearn.externals.six import StringIO 
dot_data = StringIO()  
# tree.export_graphviz(model, out_file=dot_data,  
#                          feature_names=feature_list,  
#                          class_names=target_names,  
#                          filled=True, rounded=True,  
#                          special_characters=True)  
# graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())
with open("srs.dot", 'w') as f:
    f = tree.export_graphviz(model, out_file=f,  
                         feature_names=feature_list,  
                         class_names=target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)  
    # run on terminal 
    # $dot -Tpdf srs.dot -o srs.pdf

### Perform PCA dimension reduction 

In [67]:
from sklearn.decomposition import PCA
X_train_mat = X_train.values
pca = PCA(n_components=10)
pca.fit(X_train_mat)
print(pca.explained_variance_ratio_) 
X_train_pca = pca.transform(X_train_mat)
X_test_pca = pca.transform(X_test.values)

[ 0.17266077  0.13631522  0.09747978  0.08806372  0.06797617  0.05093011
  0.04555003  0.04183937  0.0350838   0.02646018]


Fit decision tree with PCA transformed data 

In [68]:
df_train_pca = pd.DataFrame(X_train_pca)
df_test_pca = pd.DataFrame(X_test_pca)

In [97]:
model_pca = tree.DecisionTreeClassifier(min_samples_leaf=5,max_depth=7)
model_pca = model.fit(df_train_pca, y_train)
y_predicted = model.predict(df_test_pca)
accuracy_score(y_test,y_predicted)

0.47428571428571431

In [70]:
print(classification_report(y_test, y_predicted, target_names=target_names))

             precision    recall  f1-score   support

   pictures       0.75      0.90      0.82        10
       size       0.12      0.12      0.12         8
    battery       0.00      0.00      0.00         6
     screen       0.46      0.63      0.53        19
       zoom       0.62      0.48      0.54        48
ease of use       0.67      0.40      0.50        10
  detection       0.00      0.00      0.00         3
    quality       0.69      0.65      0.67        17
      price       0.58      0.33      0.42        21
     design       0.00      0.00      0.00         6
      video       0.28      0.59      0.38        27

avg / total       0.49      0.47      0.46       175



###  Random Forest model 

In [78]:
from sklearn.ensemble import RandomForestClassifier
mdl_rf = RandomForestClassifier(n_estimators=5,min_samples_leaf=5,max_depth=7)
mdl_rf.fit(df_train_pca, y_train)
y_predicted = mdl_rf.predict(df_test_pca)
accuracy_score(y_test,y_predicted)

  app.launch_new_instance()


0.49714285714285716

### Decision Tree with Adaboost

In [89]:
from sklearn.ensemble import AdaBoostClassifier
mdl_ada = AdaBoostClassifier(n_estimators=20,learning_rate=0.1)
mdl_ada.fit(df_train_pca, y_train)
y_predicted = mdl_ada.predict(df_test_pca)
accuracy_score(y_test,y_predicted)

  y = column_or_1d(y, warn=True)


0.51428571428571423

### Neural Net Model 

In [91]:
from sklearn.neural_network import MLPClassifier
mdl_nn = MLPClassifier(hidden_layer_sizes=10,activation='logistic',algorithm='sgd')

ImportError: cannot import name MLPClassifier

### Naive Bayes 

In [96]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
mdl_GNB = GaussianNB()
mdl_GNB.fit(df_train_pca, y_train)
y_predicted = mdl_GNB.predict(df_test_pca)
accuracy_score(y_test,y_predicted)

  y = column_or_1d(y, warn=True)


0.50857142857142856