In [9]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
# Raw string: the backslashes in this Windows path are literal separators.
# In a normal string, `\A`, `\P`, `\Y` and `\p` are invalid escape sequences
# that newer Python versions warn about; the resulting path is unchanged.
os.chdir(r'D:\APDS\Project\Yelp\py workspace')

In [5]:
# Load the two review sets; both paths are relative to the working directory.
df_recommended = pd.read_csv(
    filepath_or_buffer='..\\dataset\\RestaurantData\\recommended.csv'
)
df_not_recommended = pd.read_csv(
    filepath_or_buffer='..\\dataset\\RestaurantData\\not_recommended.csv'
)

In [6]:
# Class labels: 1 for rows from the recommended file, 0 for the not-recommended file.
df_recommended['label'] = 1
df_not_recommended['label'] = 0
frames = [df_recommended, df_not_recommended]
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# each source frame keeps its own 0-based labels, so the combined frame has
# duplicate index values and label-based lookups would match two rows.
df = pd.concat(frames, ignore_index=True)

In [7]:
# Preview the first rows of the combined, labelled frame.
df.head()

Unnamed: 0,SNo,Review_Id,User_Id,Review_Text,label
0,0,tfR6cUgapL4qa0ayD-MXJQ,YEDnTf7d6RyXKEHDGqi9tg,Toronto hidden gem alert! I am so glad that I ...,1
1,1,b9FySx_x4XeBXzlVU3OC9w,waaZROsfwU7mKcaW8cINtg,We came here in search for a new Italian resta...,1
2,2,ER-O3ogvbFnKq2TUWPxhxQ,6oRhkk76MBoG3ky3AaBMsA,I had the worst service I could ever had in to...,1
3,3,GLj88OeSVnOH48u0lQ7JBw,OucFS92pVsGBwzELoBnWvA,Great experience tucked away in a side street ...,1
4,4,bBYMeN4NZBhkhmrIPBVEBA,KgFOJnWXxVuz0b-xfo6jjw,I was looking for a new Italian restaurant aro...,1


In [8]:
def splitData(X, y, testSize):
    """Split samples and labels into one stratified train/test partition.

    A single ``StratifiedShuffleSplit`` draw with ``random_state=0`` is used,
    so the classes in ``y`` are distributed evenly across both parts and the
    same inputs always produce the same split.

    Parameters
    ----------
    X : array-like
        Samples. Coerced with ``np.asarray`` so that the integer positions
        produced by the splitter always index by position (a pandas Series
        would otherwise treat them as index labels).
    y : array-like
        Class labels aligned with ``X``; coerced the same way.
    testSize : float or int
        Passed through as ``test_size`` to ``StratifiedShuffleSplit``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` -- note that the training
        labels come second, before the test samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=testSize, random_state=0)

    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(sss.split(X, y))

    return X[train_index], y[train_index], X[test_index], y[test_index]

In [12]:
# Hold out 30% of all reviews as the test set, then hold out 30% of the
# remaining training portion as the validation set.
train_data, train_label, test_data, test_label = splitData(
    X=np.array(df['Review_Text']),
    y=np.array(df['label']),
    testSize=0.3,
)
train_data, train_label, val_data, val_label = splitData(
    X=train_data,
    y=train_label,
    testSize=0.3,
)

In [13]:
# Learn the vocabulary from the training reviews and build the
# document-term count matrix in one pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=train_data)
# (number of training documents, vocabulary size)
X_train_counts.shape

(2165, 10566)

In [14]:
# Look up the feature index of the word 'algorithm'; `.get` yields None when
# the word is not in the fitted vocabulary.
count_vect.vocabulary_.get('algorithm')

In [15]:
# Term-frequency weighting only: use_idf=False switches off the
# inverse-document-frequency factor.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2165, 10566)

In [16]:
# TF-IDF weighting with the transformer's default settings, fitted on and
# applied to the training count matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)
X_train_tfidf.shape

(2165, 10566)

### Naive Bayes Classifier

In [20]:
# Three-stage text classifier: token counts -> TF-IDF -> multinomial Naive Bayes.
clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [21]:
# Train the whole pipeline on the raw training reviews and their labels.
clf.fit(train_data, train_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [29]:
# Fraction of validation reviews whose predicted label equals the true label.
predicted = clf.predict(val_data)
print('Validation data prediction mean : ', (predicted == val_label).mean())

Validation data prediction mean :  0.7079741379310345


In [30]:
# Same accuracy measure, this time on the held-out test reviews.
predicted = clf.predict(test_data)
print('Test data prediction mean : ', (predicted == test_label).mean())

Test data prediction mean :  0.7126696832579186


### SVM Classifier

In [32]:
# Linear SVM variant of the pipeline: hinge loss with an L2 penalty, trained
# by SGD for exactly 5 passes (tol=None means no tolerance-based early stop).
clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                alpha=1e-3,
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)

clf.fit(train_data, train_label)

# Fraction of validation reviews classified correctly.
predicted = clf.predict(val_data)
print('Validation data prediction mean : ', (predicted == val_label).mean())

Validation data prediction mean :  0.7737068965517241


In [36]:
# Per-class precision / recall / F1 on the validation predictions.
# Label 0 is reported as 'Fake' and label 1 as 'Genuine'.
print(
    metrics.classification_report(
        y_true=val_label,
        y_pred=predicted,
        target_names=['Fake', 'Genuine'],
    )
)

             precision    recall  f1-score   support

       Fake       0.90      0.26      0.40       272
    Genuine       0.76      0.99      0.86       656

avg / total       0.80      0.77      0.73       928



In [35]:
# Confusion matrix for the validation predictions (rows: true label,
# columns: predicted label).
metrics.confusion_matrix(y_true=val_label, y_pred=predicted)

array([[ 70, 202],
       [  8, 648]], dtype=int64)

In [37]:
# Hyper-parameter grid for the pipeline; each key is `<step name>__<parameter>`.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),        # with / without IDF weighting
    clf__alpha=(1e-2, 1e-3),             # regularisation strength of the classifier
)

In [39]:
# 5-fold cross-validated grid search over `parameters`, using every CPU core.
# The former `iid=False` argument is dropped: `iid` was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
# Averaging fold scores without sample weighting (what `iid=False` requested)
# is the only behaviour in current releases.
gs_clf = GridSearchCV(clf, parameters, cv=5, n_jobs=-1)

In [40]:
# Run the search on the training split; fit() hands back the fitted search
# object, which is bound to the same name again.
gs_clf = gs_clf.fit(X=train_data, y=train_label)

In [41]:
# Mean cross-validated score of the best parameter combination.
gs_clf.best_score_

0.7565835874773892

In [42]:
# List the winning value of every searched parameter, in alphabetical order.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [43]:
# Turn the raw cross-validation results into a DataFrame for inspection.
resultsDf = pd.DataFrame(data=gs_clf.cv_results_)



In [44]:
# Display the cross-validation results table (one row per parameter combination).
resultsDf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_tfidf__use_idf,param_vect__ngram_range,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.419828,0.042235,0.089978,0.009424,0.01,True,"(1, 1)","{'clf__alpha': 0.01, 'tfidf__use_idf': True, '...",0.707373,0.706697,...,0.70716,0.000642,6,0.707683,0.707275,0.707275,0.707275,0.706867,0.707275,0.000258
1,1.452225,0.059152,0.206156,0.011553,0.01,True,"(1, 2)","{'clf__alpha': 0.01, 'tfidf__use_idf': True, '...",0.707373,0.706697,...,0.70716,0.000642,6,0.707106,0.707275,0.707275,0.707275,0.706867,0.707159,0.00016
2,0.43076,0.018591,0.101048,0.007042,0.01,False,"(1, 1)","{'clf__alpha': 0.01, 'tfidf__use_idf': False, ...",0.707373,0.709007,...,0.707622,0.000916,5,0.708839,0.70843,0.709007,0.709007,0.707444,0.708545,0.00059
3,1.319541,0.042333,0.18297,0.004718,0.01,False,"(1, 2)","{'clf__alpha': 0.01, 'tfidf__use_idf': False, ...",0.707373,0.706697,...,0.70716,0.000642,6,0.707106,0.707275,0.707275,0.707275,0.706867,0.707159,0.00016
4,0.410961,0.023825,0.093085,0.00591,0.001,True,"(1, 1)","{'clf__alpha': 0.001, 'tfidf__use_idf': True, ...",0.748848,0.750577,...,0.756584,0.008283,1,0.876372,0.858545,0.862587,0.853349,0.847663,0.859703,0.009725
5,1.41789,0.030012,0.192811,0.009279,0.001,True,"(1, 2)","{'clf__alpha': 0.001, 'tfidf__use_idf': True, ...",0.718894,0.720554,...,0.726101,0.006292,4,0.870017,0.854503,0.844111,0.84873,0.836122,0.850697,0.011379
6,0.449023,0.015017,0.086547,0.004927,0.001,False,"(1, 1)","{'clf__alpha': 0.001, 'tfidf__use_idf': False,...",0.741935,0.750577,...,0.746884,0.005296,2,0.822068,0.807159,0.800808,0.803695,0.790537,0.804854,0.010241
7,1.339426,0.0921,0.191633,0.029558,0.001,False,"(1, 2)","{'clf__alpha': 0.001, 'tfidf__use_idf': False,...",0.732719,0.743649,...,0.743656,0.006729,3,0.848065,0.831986,0.821594,0.83776,0.811887,0.830258,0.012561


In [45]:
# Validation accuracy of the grid-search model.
predicted = gs_clf.predict(val_data)
(predicted == val_label).mean()

0.7737068965517241

In [46]:
# Test-set accuracy of the grid-search model.
predicted = gs_clf.predict(test_data)
(predicted == test_label).mean()

0.7699849170437406