## Review: what we did in Week 3: Amazon data
* Read Amazon.csv
* Get to know the data
* Create a smaller subset of the data
## [Jump to Week 4 material](#thisWeek)

In [1]:
# imports and specifications
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### read Amazon.csv

In [2]:
# Location of the Amazon reviews CSV. Set the AMAZON_CSV environment variable
# to point at your own copy of the file; when it is unset, the original
# author's local path is used so existing behavior is unchanged.
amazon_csv_path = os.environ.get('AMAZON_CSV', '/Users/Aaron_hill/Dropbox/data/Amazon.csv')
amazon = pd.read_csv(amazon_csv_path)

### get to know the data

In [3]:
# Quick orientation on the loaded data: the object's type, its dimensions,
# the dtype of each column, and a preview of the leading rows.
n_rows, n_cols = amazon.shape
preview_rows = 10

print("amazon is:", type(amazon))
print("amazon has", n_rows, "rows and", n_cols, "columns", "\n")

print("the data types for each of the columns in amazon:")
print(amazon.dtypes, "\n")

print("the first 10 rows in amazon:")
print(amazon.head(preview_rows))

amazon is: <class 'pandas.core.frame.DataFrame'>
amazon has 455000 rows and 13 columns 

the data types for each of the columns in amazon:
Unnamed: 0                  int64
Id                          int64
ProductId                  object
UserId                     object
ProfileName                object
HelpfulnessNumerator        int64
HelpfulnessDenominator      int64
Score                       int64
Time                        int64
Summary                    object
Text                       object
helpScore                 float64
helpful                      bool
dtype: object 

the first 10 rows in amazon:
   Unnamed: 0      Id   ProductId          UserId       ProfileName  \
0      138806  138807  B000E63LME  A1CQGW1AOD0LF2  Alena K. "Alena"   
1      469680  469681  B004ZIH4KM  A37S7U1OX2MCWI        Becky Cole   
2      238202  238203  B003ZXE9QA  A2OM6G73E64EQ9              jeff   
3      485307  485308  B001RVFERK  A25W349EE97NBK          Tangent4   
4      375283  3752

### create a smaller subset of the data

In [4]:
# Build a smaller working copy of "amazon": every column is kept, but only
# the first 1000 rows (selected by position).
subset_size = 1000
amazon_subset = amazon.iloc[:subset_size]
for summary in (type(amazon_subset), amazon_subset.shape):
    print(summary)

<class 'pandas.core.frame.DataFrame'>
(1000, 13)


### create an ndarray for `L`

In [5]:
# Take the "helpful" column of the subset as the labels. `L` itself is a
# pandas Series; `L.values` exposes the NumPy array underneath it.
L = amazon_subset["helpful"]
for summary in (type(L), type(L.values), L.shape):
    print(summary)

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
(1000,)


### create an ndarray for `X`
Use only "Score" and "Time" as features, for now.

In [6]:
# Select the two feature columns from the subset. `X` is a DataFrame;
# `X.values` is the NumPy array underneath it.
feature_columns = ["Score", "Time"]
X = amazon_subset[feature_columns]
for summary in (type(X), type(X.values), X.shape):
    print(summary)

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
(1000, 2)


### using the `SVC` class in `sklearn.svm`, fit the SVM model according to the given training data
For now, accept all the default parameters in `SVC`

In [7]:
from sklearn.svm import SVC

# Construct a support vector classifier with every parameter left at its
# default and train it on the features X against the labels L. fit() returns
# the estimator itself, so construction and fitting can be chained.
clf = SVC().fit(X, L)
clf  # display the fitted estimator and its parameters

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### how well did we do? compare the model's predictions for `Y` to the labels `L`


In [8]:
# Number of accurate predictions: rows where the fitted SVC's prediction
# equals the label. np.count_nonzero tallies the True entries of the boolean
# comparison inside NumPy, instead of walking the array element by element
# with Python's built-in sum().
# Note: X is the same data clf was fitted on, so this is a training-set count.
svc_predictions = clf.predict(X)
print(np.count_nonzero(svc_predictions == L.values))

986


In [9]:
import my_measures

# Pair the SVC's predictions on X with the true labels L in a performance
# object labelled 'clf', compute its measures, and print the resulting dict.
svc_predictions = clf.predict(X)
clf_pm = my_measures.BinaryClassificationPerformance(svc_predictions, L, 'clf')
clf_pm.compute_measures()
print(clf_pm.performance_measures)

{'Precision': 1.0, 'TP': 49, 'Pos': 63, 'Accuracy': 0.98599999999999999, 'Neg': 937, 'FN': 14, 'FP': 0, 'Recall': 0.77777777777777779, 'TN': 937}


## <a name='thisWeek'></a>Week 4: fit SVM using gradient descent and assess the fit of the model

### using the `SGDClassifier` class in `sklearn.linear_model`, fit the model according to the given training data

In [10]:
from sklearn import linear_model

# SGDClassifier shuffles the training data (shuffle=True by default), so with
# random_state left at None the fitted model can differ on every run. Pin the
# random state so that Restart & Run All reproduces the same classifier.
# NOTE(review): "Score" and "Time" are passed in unscaled; scikit-learn's SGD
# guidance recommends standardizing features -- worth trying if the fit is poor.
sgd = linear_model.SGDClassifier(random_state=0)
sgd.fit(X, L)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

### how well did we do? compare the model's predictions for `Y` to the labels `L`
We'll start with the first few measures in Flach, p. 57

In [11]:
import my_measures

# Pair the SGD classifier's predictions on X with the true labels L in a
# performance object labelled 'sgd', compute its measures, and print them.
sgd_predictions = sgd.predict(X)
sgd_pm = my_measures.BinaryClassificationPerformance(sgd_predictions, L, 'sgd')
sgd_pm.compute_measures()
print(sgd_pm.performance_measures)

{'Precision': nan, 'TP': 0, 'Pos': 63, 'Accuracy': 0.93700000000000006, 'Neg': 937, 'FN': 63, 'FP': 0, 'Recall': 0.0, 'TN': 937}


  self.performance_measures['Precision'] = self.performance_measures['TP'] / (self.performance_measures['TP'] + self.performance_measures['FP'])


### now try on full set of Amazon data

In [12]:
# Repeat the SGD fit on every row of `amazon` instead of the 1000-row subset.
# NOTE: this rebinds L and X to the full data, so any earlier cell that uses
# L or X will operate on the full data if it is re-run after this one.
L = amazon["helpful"]
X = amazon[["Score", "Time"]]

# Pin the random state: SGDClassifier shuffles the training data, so leaving
# random_state at None makes the fitted model vary from run to run.
sgd_full = linear_model.SGDClassifier(random_state=0)
sgd_full.fit(X, L)

# Score the full-data model against the labels it was trained on.
sgd_full_pm = my_measures.BinaryClassificationPerformance(sgd_full.predict(X), L, 'sgd')
sgd_full_pm.compute_measures()
print(sgd_full_pm.performance_measures)

{'Precision': nan, 'TP': 0, 'Pos': 33235, 'Accuracy': 0.92695604395604392, 'Neg': 421765, 'FN': 33235, 'FP': 0, 'Recall': 0.0, 'TN': 421765}


  self.performance_measures['Precision'] = self.performance_measures['TP'] / (self.performance_measures['TP'] + self.performance_measures['FP'])
