In [1]:
# imports and specifications
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
amazon = pd.read_csv('Amazon.csv')

In [4]:
print('amazon is', type(amazon) ) 
print(amazon.head(5))
amazon.shape

amazon is <class 'pandas.core.frame.DataFrame'>
   Unnamed: 0      Id   ProductId          UserId       ProfileName  \
0      138806  138807  B000E63LME  A1CQGW1AOD0LF2  Alena K. "Alena"   
1      469680  469681  B004ZIH4KM  A37S7U1OX2MCWI        Becky Cole   
2      238202  238203  B003ZXE9QA  A2OM6G73E64EQ9              jeff   
3      485307  485308  B001RVFERK  A25W349EE97NBK          Tangent4   
4      375283  375284  B000OQZNTS  A3CPPW0HUC07YS       Amy Nicolai   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       2      2  1294185600   
1                     0                       0      5  1349740800   
2                     0                       0      5  1329264000   
3                     1                       1      4  1248307200   
4                     0                       0      5  1333238400   

                     Summary  \
0           Not as pictured.   
1                      seeds   
2       

(455000, 13)

In [5]:
# create a subset of "amazon" that contains all the columns but only only the first 1000 rows
amazon_subset = amazon[:1000]
print(type(amazon_subset))
print(amazon_subset.shape)



<class 'pandas.core.frame.DataFrame'>
(1000, 13)


In [8]:
print(amazon.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455000 entries, 0 to 454999
Data columns (total 13 columns):
Unnamed: 0                455000 non-null int64
Id                        455000 non-null int64
ProductId                 455000 non-null object
UserId                    455000 non-null object
ProfileName               454985 non-null object
HelpfulnessNumerator      455000 non-null int64
HelpfulnessDenominator    455000 non-null int64
Score                     455000 non-null int64
Time                      455000 non-null int64
Summary                   454978 non-null object
Text                      455000 non-null object
helpScore                 238678 non-null float64
helpful                   455000 non-null bool
dtypes: bool(1), float64(1), int64(6), object(5)
memory usage: 42.1+ MB
None


### Pull out score time and text and then do tfidf on text

In [10]:
X = amazon_subset[["Score", "Time", "Text"]]
X.shape

(1000, 3)

### Load feature extraction to make a vector of text

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
vectorizer = CountVectorizer(min_df = 1)

In [14]:
text_col = amazon_subset.ix[:,'Text']

In [16]:
print(text_col.shape)

(1000,)


In [18]:
print(text_col.head(10))

0    I was looking forward to try cranberry apple f...
1    TY for everything.  The seeds arrived quickly,...
2    I've finally found the best cereal in the worl...
3    I originally bought these chips because I'd he...
4    Really excellent tea, flowers are visible in t...
5    I never was a big fan of tea but this is not l...
6    WOW WOW WOW WOW,WOW,truly great tasting Banana...
7    The words 'protein drink' don't generally conj...
8    We have 2 dogs, Dachshund and a German Shepher...
9    after trying most brands of coffee, I have set...
Name: Text, dtype: object


In [19]:
raw_array = vectorizer.fit_transform(text_col)

In [20]:
print(raw_array.toarray())

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [22]:
raw_array.shape

(1000, 6831)

### Get feature names (get_feature_names calls words uniquely)

In [23]:
unique_words = vectorizer.get_feature_names()

### Transform such that we ignore extremely frequent words

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer

In [26]:
transformer = TfidfTransformer(smooth_idf=False)

### The original array (raw_array) is the argument for tfidf that pops out with weird numbers

In [29]:
tfidf = transformer.fit_transform(raw_array)
tfidf.shape

(1000, 6831)

In [30]:
transformed_array = tfidf.toarray()

In [32]:
transformed_array.shape

(1000, 6831)

In [35]:
Score = amazon_subset.ix[:,'Score']

In [37]:
Time = amazon_subset.ix[:,'Time']

In [42]:
new_df = pd.DataFrame(transformed_array, columns = unique_words)

In [45]:
features_df = pd.concat([Score, Time, new_df], axis = 1 )

### Create the array of features

In [47]:
features_df.shape

(1000, 6833)

### Create Label

In [48]:
Label = amazon_subset["helpful"]

In [50]:
from sklearn.svm import SVC
clf = SVC() # accepting all the default parameters
clf.fit(features_df, Label)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [52]:
Y_pred = clf.predict(features_df)

In [53]:
bad_predictions = []
for i in range(len(Y_pred)):
    if(Y_pred[i] != Label[i]):
        bad_predictions.append(i)

In [54]:
print(bad_predictions)
len(bad_predictions)

[92, 288, 298, 358, 413, 422, 499, 560, 595, 685, 747, 759, 761, 781, 807, 809, 819, 853, 880, 921]


20

In [56]:
true_vals = []
pred_vals = []
for i in range(len(bad_predictions)):
    pred_vals.append (Y_pred[bad_predictions[i]])
    true_vals.append (Label[bad_predictions[i]])

In [57]:
test_df = pd.DataFrame(
    {'predictions':pred_vals, 
     'truth' : true_vals })

In [58]:
test_df

Unnamed: 0,predictions,truth
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True
5,False,True
6,False,True
7,False,True
8,False,True
9,False,True


### Check how many "Helpful" reviews are there

In [65]:
len((Label[Label == True]))

63

In [66]:
class BinaryClassificationPerformance():
    '''Performance measures to evaluate the fit of a binary classification model'''
    
    def __init__(self, predictions, labels, probabilities=None):
        '''Initialize attributes: predictions-vector of predicted values for Y, labels-vector of labels for Y'''
        '''probabilities-optional, probability that Y is equal to True'''
        self.probabilities = probabilities
        self.performance_df = pd.concat([pd.DataFrame(predictions), pd.DataFrame(labels)], axis=1)
        self.performance_df.columns = ['preds', 'labls']
        self.performance_measures = {}
  
    def compute_measures(self):
        '''Compute performance measures defined by Flach p. 57'''
        self.performance_measures['Pos'] = self.performance_df['preds'].sum()
        self.performance_measures['Neg'] = self.performance_df.shape[0] - self.performance_df['preds'].sum()
        self.performance_measures['TP'] = ((self.performance_df['preds'] == True) & (self.performance_df['labls'] == True)).sum()
        self.performance_measures['TN'] = ((self.performance_df['preds'] == False) & (self.performance_df['labls'] == False)).sum()
        self.performance_measures['FP'] = ((self.performance_df['preds'] == True) & (self.performance_df['labls'] == False)).sum()
        self.performance_measures['FN'] = ((self.performance_df['preds'] == False) & (self.performance_df['labls'] == True)).sum()

In [68]:
tc = BinaryClassificationPerformance(Y_pred, Label)
tc.compute_measures()
print(tc.performance_measures)

{'Neg': 957, 'FP': 0, 'TP': 43, 'FN': 20, 'Pos': 43, 'TN': 937}
