## The goal of this notebook is to classify reviews into the correct categories, so this is a classification problem

In [1]:
import pandas as pd 
import numpy as np
import os
from srs.utilities import Sentence,loadUsefulTrainingData,loadTrainingData,tokenize
from srs.maxEntropyModel import loadWordListDict
from srs.predictor import StaticPredictor
from sklearn.metrics import accuracy_score,classification_report

In [2]:
# Resolve the static training-data directory (given relative to the notebook's
# working directory) to an absolute path before passing it to the loader.
static_traning_data_dir = os.path.abspath('../srs/static_training_data/')
# Load the sentences used throughout the notebook; later cells read their
# `.content` and `.labeled_aspects` attributes.
# NOTE(review): what makes training data "useful" is decided inside
# srs.utilities.loadUsefulTrainingData and is not visible here.
sentences = loadUsefulTrainingData(static_traning_data_dir)

In [3]:
# Attach a `tokens` attribute to every sentence, computed from its raw text.
for sentence in sentences:
    sentence.tokens = tokenize(sentence.content)

In [4]:
# Load the aspect -> keyword-list mapping and flatten it into one vocabulary
# (`feature_list`) whose entries become the feature columns further down.
wordlist_dict_path = "../srs/predictor_data/wordlist_dict_1.txt"
feature_dict = loadWordListDict(wordlist_dict_path)
all_keywords = [word for k in feature_dict.keys() for word in feature_dict[k]]
feature_list = list(set(all_keywords))  # drop duplicates, if any
print(feature_list)

[u'shot', u'color', u'fit', u'cheap', u'focus', u'bulki', u'design', u'batteri', u'simpli', u'size', u'use', u'detect', u'easi', u'pictur', u'screen', u'carri', u'big', u'simpl', u'price', u'smart', u'mode', u'beatiful', u'afford', u'auto', u'imag', u'qualiti', u'len', u'cheaper', u'pocket', u'pretti', u'charger', u'clear', u'zoom', u'easili', u'expens', u'small', u'video', u'nice', u'display']


In [5]:
# Show the raw aspect -> keyword mapping for reference.
print(feature_dict)

{u'battery': [u'batteri', u'charger'], u'pictures': [u'pictur', u'imag', u'shot'], u'price': [u'cheap', u'cheaper', u'expens', u'afford', u'price'], u'zoom': [u'zoom', u'len'], u'ease of use': [u'easi', u'simpl', u'easili', u'simpli', u'use'], u'detection': [u'detect', u'auto', u'mode', u'smart', u'focus'], u'design': [u'design', u'nice', u'beatiful', u'color', u'pretti'], u'video': [u'clear', u'video'], u'quality': [u'qualiti'], u'screen': [u'screen', u'display'], u'size': [u'size', u'big', u'small', u'fit', u'carri', u'pocket', u'bulki']}


In [6]:
# Worked example: keyword-count vector for a single sentence. Each slot of
# `present_list` counts how often the matching feature_list entry occurs among
# the sentence's tokens; tokens outside the vocabulary are ignored.
s1 = sentences[1]
present_list = np.zeros(len(feature_list))
for token in s1.tokens:
    try:
        slot = feature_list.index(token)
    except ValueError:
        # token is not one of the tracked keywords
        continue
    present_list[slot] += 1
print(present_list)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  2.
  0.  0.  0.]


### Create pandas DataFrames

In [None]:
# Build the feature frame `df` (one row per sentence, one keyword-count column
# per entry of feature_list) and the label frame `target` (column 'Prod_Feat').
#
# Rows are collected in plain lists and each DataFrame is constructed once.
# Growing a frame with DataFrame.append inside the loop copies the whole frame
# on every iteration (quadratic cost), and DataFrame.append no longer exists in
# pandas >= 2.0. Building from numeric rows also gives `df` a float dtype.

# Keyword -> column position. setdefault keeps the first position of a keyword,
# matching what feature_list.index(token) would return.
column_of = {}
for pos, word in enumerate(feature_list):
    column_of.setdefault(word, pos)

count_rows = []
for s in sentences:
    count_list = np.zeros(len(feature_list))
    for token in s.tokens:
        idx = column_of.get(token)
        if idx is not None:  # tokens outside the vocabulary are ignored
            count_list[idx] += 1
    count_rows.append(count_list)

df = pd.DataFrame(count_rows, columns=feature_list)
target = pd.DataFrame([s.labeled_aspects for s in sentences], columns=['Prod_Feat'])

In [None]:
# Preview the first five feature rows (head() defaults to n=5).
df.head()

In [None]:
# Preview the first five labels (head() defaults to n=5).
target.head()

In [None]:
# train_test_split moved to sklearn.model_selection and the old
# sklearn.cross_validation module was later removed; fall back to the old
# location only on scikit-learn installs that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split the row index rather than the frames themselves: 25% is held out, with
# a fixed seed. The same index arrays are reused later to pick the matching
# entries of `sentences` for the max-entropy model.
train_idx, test_idx = train_test_split(df.index, test_size=0.25, random_state=42)
X_train  = df.iloc[train_idx]
X_test = df.iloc[test_idx]
y_train = target.iloc[train_idx]
y_test = target.iloc[test_idx]

### Decision tree model construction, prediction, and visualization

In [None]:
from sklearn import tree
# API reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
# Shallow tree: depth capped at 7, at least 5 training samples per leaf.
# fit() returns the fitted estimator, so construction and training are chained.
model = tree.DecisionTreeClassifier(max_depth=7, min_samples_leaf=5).fit(X_train, y_train)

Prediction and Analysis 

In [None]:
# Predict labels for the held-out rows; the accuracy is shown as the cell output.
y_predicted = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predicted)

In [None]:
# classification_report pairs `target_names` positionally with the *sorted*
# class labels, whereas Series.unique() returns labels in order of first
# appearance, which would attach the wrong name to each row of the report.
# Sort the names and also pass them as `labels`, so the rows stay aligned even
# when a class happens to be absent from the test split.
target_names = np.sort(target.Prod_Feat.unique())
print(classification_report(y_test, y_predicted, labels=target_names, target_names=target_names))

In [None]:
import pydot
from IPython.display import Image
from sklearn.externals.six import StringIO
dot_data = StringIO()

# Take the class names from the fitted model so they are in the order
# export_graphviz indexes them (the order of model.classes_).
tree_class_names = list(model.classes_)

# Optional inline rendering (needs pydot and a Graphviz install):
# tree.export_graphviz(model, out_file=dot_data,
#                          feature_names=feature_list,
#                          class_names=tree_class_names,
#                          filled=True, rounded=True,
#                          special_characters=True)
# graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())

# Write the tree in Graphviz DOT format. The call writes into the open handle,
# so its return value is deliberately not bound (the original rebound `f`).
with open("srs.dot", 'w') as f:
    tree.export_graphviz(model, out_file=f,
                         feature_names=feature_list,
                         class_names=tree_class_names,
                         filled=True, rounded=True)
# To render the file, run on a terminal:
# $dot -Tpdf srs.dot -o srs.pdf

### Max Entropy Model

Train the model on the training data (this takes some time; uncomment the cell below to proceed), or skip this step and use the pre-trained lambda parameters instead.

In [None]:
# Optional (slow): re-train the max-entropy predictor on the sentences selected
# by train_idx. Uncomment the lines below to run it; otherwise the next cell
# loads already-trained parameters from 'lambda_opt_75.txt'.
# ---------------------------------------------------------------------------
# staticPredictor = StaticPredictor()
# wordlist_filename = 'wordlist_dict_1.txt'
# lamda_opt_filename = 'lambda_opt_75.txt'
# training_set = []
# for idx in train_idx:
#     training_set.append(sentences[idx])
# staticPredictor.train(wordlist_filename, lamda_opt_filename,training_set)

In [None]:
#predict using trained 
staticPredictor = StaticPredictor()
wordlist_filename = 'wordlist_dict_1.txt'
# param_filename = 'lambda_opt_regu3.txt'
param_filename = 'lambda_opt_75.txt'
staticPredictor.loadParams(param_filename)
staticPredictor.loadWordListDict(wordlist_filename)

# example of prediction
predicted_aspect = staticPredictor.predict(sentences[1])
print predicted_aspect

In [None]:
#test for accuracy using trained lambda
correct = 0.0
for idx in test_idx:
    predicted_aspect = staticPredictor.predict(sentences[idx])
    if predicted_aspect == sentences[idx].labeled_aspects:
        correct +=1

class_accuracy = correct/len(test_idx)
print 'The classification accuracy is: %.2f' % (class_accuracy)

### Perform PCA dimension reduction 

In [None]:
from sklearn.decomposition import PCA
# Fit a 10-component PCA on the training features only, report the variance
# ratio of each component, then project both splits with the fitted transform.
X_train_mat = X_train.values
pca = PCA(n_components=10).fit(X_train_mat)  # fit() returns the estimator
print(pca.explained_variance_ratio_)
X_train_pca, X_test_pca = pca.transform(X_train_mat), pca.transform(X_test.values)

Fit a decision tree on the PCA-transformed data

In [None]:
# Wrap the projected arrays in DataFrames (default integer column names).
df_train_pca = pd.DataFrame(data=X_train_pca)
df_test_pca = pd.DataFrame(data=X_test_pca)

In [None]:
# Train a separate decision tree on the PCA features.
# The original cell called fit/predict on `model` (the tree trained on the raw
# keyword counts), which silently re-trained that earlier model and left
# `model_pca` as a mere alias of it. Use the new estimator throughout.
model_pca = tree.DecisionTreeClassifier(min_samples_leaf=5,max_depth=7)
model_pca = model_pca.fit(df_train_pca, y_train)
y_predicted = model_pca.predict(df_test_pca)
accuracy_score(y_test,y_predicted)

In [None]:
# classification_report pairs `target_names` positionally with the sorted class
# labels, so sort the names and pass them as `labels` as well; this keeps each
# row correctly named even if a class is missing from the test split.
report_labels = np.sort(target_names)
print(classification_report(y_test, y_predicted, labels=report_labels, target_names=report_labels))

###  Random Forest model 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Small random forest on the PCA features. A random forest is stochastic
# (bootstrap samples, feature sub-sampling), so the seed is fixed, using the
# same value as the train/test split, to make the reported accuracy
# reproducible across runs.
mdl_rf = RandomForestClassifier(n_estimators=5,min_samples_leaf=5,max_depth=7,random_state=42)
mdl_rf.fit(df_train_pca, y_train)
y_predicted = mdl_rf.predict(df_test_pca)
accuracy_score(y_test,y_predicted)

### Decision Tree with AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost (20 rounds, learning rate 0.1) on the PCA features. The seed is
# fixed, using the same value as the train/test split, so that repeated runs
# give the same accuracy.
mdl_ada = AdaBoostClassifier(n_estimators=20,learning_rate=0.1,random_state=42)
mdl_ada.fit(df_train_pca, y_train)
y_predicted = mdl_ada.predict(df_test_pca)
accuracy_score(y_test,y_predicted)

### Neural Net Model 

In [None]:
from sklearn.neural_network import MLPClassifier
# One hidden layer of 10 logistic units, trained with SGD.
# - The optimizer keyword in released scikit-learn is `solver`; `algorithm`
#   is not an accepted parameter there and makes the constructor fail.
# - hidden_layer_sizes is documented as a tuple, one entry per hidden layer.
# - random_state pins weight initialisation and shuffling for reproducibility.
# NOTE: the model is only constructed here; it is not fitted in this notebook.
mdl_nn = MLPClassifier(hidden_layer_sizes=(10,),activation='logistic',solver='sgd',random_state=42)

### Naive Bayes 

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
# Gaussian naive Bayes on the PCA features; fit() returns the fitted estimator.
mdl_GNB = GaussianNB().fit(df_train_pca, y_train)
y_predicted = mdl_GNB.predict(df_test_pca)
accuracy_score(y_true=y_test, y_pred=y_predicted)