In [169]:
# imports and specifications
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Read in the data

In [80]:
# Load the review data; the relative path means 'Amazon.csv' must be in the notebook's working directory.
amazon = pd.read_csv('Amazon.csv')

### Explore data, type, dimensions etc.

In [81]:
# First look at the loaded frame: its type, its first five rows, and
# (as the cell's echoed value) its (rows, columns) shape.
print('amazon is', type(amazon))
print(amazon.head())  # head() returns the first 5 rows by default
amazon.shape

amazon is <class 'pandas.core.frame.DataFrame'>
   Unnamed: 0      Id   ProductId          UserId       ProfileName  \
0      138806  138807  B000E63LME  A1CQGW1AOD0LF2  Alena K. "Alena"   
1      469680  469681  B004ZIH4KM  A37S7U1OX2MCWI        Becky Cole   
2      238202  238203  B003ZXE9QA  A2OM6G73E64EQ9              jeff   
3      485307  485308  B001RVFERK  A25W349EE97NBK          Tangent4   
4      375283  375284  B000OQZNTS  A3CPPW0HUC07YS       Amy Nicolai   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       2      2  1294185600   
1                     0                       0      5  1349740800   
2                     0                       0      5  1329264000   
3                     1                       1      4  1248307200   
4                     0                       0      5  1333238400   

                     Summary  \
0           Not as pictured.   
1                      seeds   
2       

(455000, 13)

### subset the data

In [82]:
# Build a subset of "amazon" that keeps every column but only the first 1000 rows.
amazon_subset = amazon.iloc[:1000]  # positional row slice
print(type(amazon_subset))
print(amazon_subset.shape)



<class 'pandas.core.frame.DataFrame'>
(1000, 13)


### List the column names

In [83]:
# Column labels of the subset as a plain Python list.
amazon_subset.columns.tolist()

['Unnamed: 0',
 'Id',
 'ProductId',
 'UserId',
 'ProfileName',
 'HelpfulnessNumerator',
 'HelpfulnessDenominator',
 'Score',
 'Time',
 'Summary',
 'Text',
 'helpScore',
 'helpful']

### Create the Labels

In [84]:
# The prediction target is the "helpful" column; peek at its first ten values.
Label = amazon_subset.loc[:, "helpful"]
Label.head(10)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: helpful, dtype: bool

### `Label` is a pandas Series; its underlying NumPy ndarray can be accessed with `.values`

In [87]:
# Only the last expression of a cell is echoed, so the shape on the next line is
# evaluated but not displayed; the cell's output is the type of Label.values.
Label.shape
type(Label.values)

numpy.ndarray

### Creating the features (or selecting them from amazon_subset)

In [90]:
# Feature matrix: the "Score" and "Time" columns of the subset.
X = amazon_subset.loc[:, ["Score", "Time"]]
X.shape

(1000, 2)

### Support Vector Machines implementation 

In [94]:
from sklearn.svm import SVC
# Build the classifier without overriding any hyper-parameter.
clf = SVC() # accepting all the default parameters
# Fit on the two selected feature columns (X) against the "helpful" labels;
# the value returned by fit() is what the cell echoes.
clf.fit(X, Label)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Find the bad predictions and record which rows they are

In [97]:
# Predict on the same X the classifier was fitted on, so any mismatch with
# Label is a training-set error, not a held-out error.
Y_pred = clf.predict(X)
len(Y_pred)

1000

In [103]:
# Show the range of row positions covered by the predictions.
print(range(len(Y_pred)))

range(0, 1000)


In [111]:
# Row positions where the prediction disagrees with the true label.
# Comparing against Label.values matches rows by position, so the result does
# not depend on the Series index labels happening to run 0..n-1, and the
# comparison is done in one vectorized step instead of a Python loop.
bad_predictions = np.flatnonzero(Y_pred != Label.values).tolist()

In [114]:
# Print the mismatching row positions; the cell's echoed value is how many there are.
print(bad_predictions)
len(bad_predictions)

[92, 288, 298, 358, 413, 560, 747, 761, 781, 809, 819, 853, 880, 921]


14

### A false positive is when the truth is false but prediction is true 

### A false negative is when the truth is true but prediction is false

### Check for false positives and false negatives

In [139]:
# For every mispredicted row, collect the predicted value and the true label,
# in the same order as bad_predictions.
# bad_predictions holds row positions, so the label is looked up positionally
# with .iloc rather than by index label.
pred_vals = [Y_pred[row] for row in bad_predictions]
true_vals = [Label.iloc[row] for row in bad_predictions]

In [148]:
# Check the container type of pred_vals.
print(type(pred_vals))

<class 'list'>


In [141]:
# Print the true labels of the mispredicted rows.
print(true_vals)

[True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [150]:
# Put the predicted and true values of the mispredicted rows side by side.
test_df = pd.DataFrame(data={'predictions': pred_vals, 'truth': true_vals})

### Display the DataFrame: every mistake is a false negative (prediction False, truth True)

In [151]:
# Echo the frame as the cell's output.
test_df

Unnamed: 0,predictions,truth
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True
5,False,True
6,False,True
7,False,True
8,False,True
9,False,True


### END OF WEEK 3 - "use scikit learn, SVM to predict helpful scores in Amazon dataset"

### START OF WEEK 4

### Implementation of gradient descent

### 1. enumerate()
### 2. yield
### 3. zip
### 4. What is going on with his for loops?

In [153]:
# adapted from https://github.com/joelgrus/data-science-from-scratch
import math, random

In [155]:
def in_random_order(data):
    """Generator that yields the elements of data in a random order.

    data must be indexable by the positions 0..n-1 (for example a list); it is
    not modified. The order is produced by random.shuffle, so seed the random
    module beforehand if a reproducible order is needed.
    """
    # enumerate() yields (position, element) pairs, so both must be unpacked;
    # only the position is kept.
    indexes = [i for i, _ in enumerate(data)]  # create a list of indexes
    random.shuffle(indexes)                    # shuffle the positions, not the data
    for i in indexes:                          # yield the data in that order
        yield data[i]

In [159]:
# linear algebra
def vector_subtract(v, w):
    """Return the componentwise difference v - w as a new list.

    Components are paired with zip, so the result is as long as the shorter input.
    """
    difference = []
    for left, right in zip(v, w):
        difference.append(left - right)
    return difference

def scalar_multiply(c, v):
    """Return a new list holding c times each component of v."""
    scaled = []
    for component in v:
        scaled.append(c * component)
    return scaled

In [162]:
#support for target and gradient functions
def predict(alpha, beta, x_i):
    """Value at x_i of the line with intercept alpha and slope beta."""
    slope_term = beta * x_i
    return slope_term + alpha

def error(alpha, beta, x_i, y_i):
    """Residual for one point: the observed y_i minus the prediction at x_i."""
    predicted = predict(alpha, beta, x_i)
    return y_i - predicted

#target and gradient functions
def squared_error(x_i, y_i, theta):
    """Squared residual of the point (x_i, y_i) under theta = (alpha, beta)."""
    # theta is a two-item sequence; unpacking splits it into intercept and slope
    alpha, beta = theta
    residual = error(alpha, beta, x_i, y_i)
    return residual ** 2

def squared_error_gradient(x_i, y_i, theta):
    """Gradient of the squared error at (x_i, y_i) with respect to theta.

    theta is unpacked as (alpha, beta); the result is the two-element list
    [partial w.r.t. alpha, partial w.r.t. beta].
    """
    alpha, beta = theta
    residual = error(alpha, beta, x_i, y_i)
    d_alpha = -2 * residual        # partial derivative w.r.t. alpha
    d_beta = -2 * residual * x_i   # partial derivative w.r.t. beta
    return [d_alpha, d_beta]


In [164]:
def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0 = 0.01):
    """Minimize sum(target_fn(x_i, y_i, theta)) over theta by stochastic gradient descent.

    target_fn(x_i, y_i, theta)   -> error contributed by one data point
    gradient_fn(x_i, y_i, theta) -> gradient of that error with respect to theta
    x, y     paired observations (zipped together point by point)
    theta_0  initial guess for theta
    alpha_0  initial step size; it shrinks by 10% after each pass that does not
             improve on the best total error and resets after a pass that does

    Returns the theta with the lowest total error seen. Stops once 100
    consecutive passes bring no improvement.
    """
    data = list(zip(x,y))
    theta = theta_0                           #initial guess
    alpha = alpha_0                           #initial step size
    min_theta, min_value = None, float("inf") #the minimum so far
    iterations_with_no_improvement = 0        #must exist before the loop tests it

    #if we ever go 100 iterations with no improvement, stop
    while iterations_with_no_improvement < 100:
        value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data)

        if value < min_value:
            #if we've found a new minimum remember it
            #and go back to the original step size
            min_theta, min_value = theta, value
            iterations_with_no_improvement = 0
            alpha = alpha_0
        else:
            #otherwise we are not improving so shrink the step size
            iterations_with_no_improvement += 1
            alpha *= 0.9

        #and take a gradient step for each of the data points
        for x_i, y_i in in_random_order(data):
            gradient_i = gradient_fn(x_i, y_i, theta)
            theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i))

    #return only after the loop has stopped, not at the end of the first pass
    return min_theta

In [172]:
# data
# Five paired observations: x[i] goes with y[i]. They feed the slope/intercept calculation.
x = np.array([1,2,4,3,5])
y = np.array([1,3,3,2,5])

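### First, calculate the slope and intercept directly from the closed-form least-squares formulas: $b = \sum_i (x_i - \bar{x})(y_i - \bar{y}) \,/\, \sum_i (x_i - \bar{x})^2$ and $a = \bar{y} - b\,\bar{x}$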

In [185]:
# Closed-form least-squares slope:
#   b = sum((x - Xbar) * (y - Ybar)) / sum((x - Xbar)**2)
Xbar = x.mean()
Ybar = y.mean()
X_minus_Xbar = x - Xbar   # deviations of x from its mean
Y_minus_Ybar = y - Ybar   # deviations of y from its mean
X_minus_Xbar_squared = np.power(X_minus_Xbar, 2)
sum_X_minus_Xbar_squared = X_minus_Xbar_squared.sum()
sum_X_minus_Xbar_times_Y_minus_Ybar = np.vdot(X_minus_Xbar, Y_minus_Ybar)
b = sum_X_minus_Xbar_times_Y_minus_Ybar / sum_X_minus_Xbar_squared

In [189]:
# Intercept: chosen so the fitted line passes through the point of means (Xbar, Ybar).
a = Ybar - (b*Xbar)

In [194]:
# Report the fitted coefficients; the extra "\n" argument leaves a blank line between the two messages.
print("The intercept is", a, "\n")
print("The slope is", b)

The intercept is 0.4 

The slope is 0.8
