In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# Load the review table, the list of tracked words, and the row ids used
# to select the training and validation rows.
products = pd.read_csv('../data/amazon_baby_subset.csv')
# read_json returns a DataFrame; column 0 is turned into a plain list.
important_words = list(pd.read_json('../data/important_words.json')[0])
train_id = pd.read_json('../data/module-4-assignment-train-idx.json')
validation_id = pd.read_json('../data/module-4-assignment-validation-idx.json')


In [26]:
# Show the column names; print() with one argument prints the same text on
# Python 2 and Python 3, whereas the print statement is Python 2 only.
print(list(products))

['name', 'review', 'rating', 'sentiment']


In [27]:
# Replace missing values in the 'review' column with empty strings so the
# string processing applied to that column later does not receive NaN.
products = products.fillna({'review':''})


In [28]:
def remove_punctuation(word):
    """Return ``word`` with every ASCII punctuation character removed.

    The two-argument ``str.translate(None, chars)`` form only exists for
    Python 2 byte strings; it raises ``TypeError`` for ``unicode`` objects
    and for Python 3 ``str``. A regular-expression substitution gives the
    same result and works on all of them.

    Parameters
    ----------
    word : str
        Text to clean. The notebook applies this to whole reviews, not
        only single words.

    Returns
    -------
    str
        ``word`` without any character listed in ``string.punctuation``.
    """
    # Kept as local imports so this cell does not depend on the import cell.
    import re
    import string
    # re.escape makes characters such as ']' , '\\' and '-' safe inside [...].
    pattern = '[%s]' % re.escape(string.punctuation)
    return re.sub(pattern, '', word)

In [29]:
# Store a punctuation-free version of every review in a new 'review_clean' column.
products['review_clean'] = products['review'].apply(remove_punctuation)

In [30]:
# Split each cleaned review once; the original re-split every review for
# every important word, repeating identical work len(important_words) times.
review_tokens = products['review_clean'].apply(lambda text: text.split(" "))

# Add one column per important word holding its number of occurrences
# (exact token matches after splitting on single spaces) in each review.
for word in important_words:
    # `w=word` binds the current word at definition time.
    products[word] = review_tokens.apply(lambda tokens, w=word: tokens.count(w))

In [49]:
# `.ix` is deprecated and removed in pandas 1.0. With an integer index and a
# list of integers `.ix` selects by label, so `.loc` is the exact replacement.
# `.copy()` makes each subset an independent frame, so columns added to it
# later do not trigger SettingWithCopyWarning.
data_train = products.loc[list(train_id[0])].copy()
data_validation = products.loc[list(validation_id[0])].copy()


In [51]:
def get_numpy_data(data_frame,features,sentiment):
    """Build the feature matrix (with a leading constant column) and label array.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the feature and label columns. NOTE: a column named
        ``'ones'`` filled with 1 is added to (or overwritten in) this frame
        in place; that side effect is kept for compatibility.
    features : sequence of str
        Names of the feature columns, in the order they should appear
        after the constant column.
    sentiment : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(feature_matrix, sentiment_matrix)``: the values of the
        ``['ones'] + features`` columns as a 2-D array, and the values of
        the ``sentiment`` column.
    """
    data_frame['ones'] = 1
    # list(...) lets callers pass any sequence of names, not only a list.
    columns = ['ones'] + list(features)
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    feature_matrix = data_frame[columns].values
    sentiment_matrix = data_frame[sentiment].values
    return feature_matrix,sentiment_matrix

In [52]:
# Convert the training and validation frames into (feature matrix, label array)
# pairs, using the important-word count columns as features.
feature_matrix_train,sentiment_train = get_numpy_data(data_train,
                                                     important_words,
                                                     'sentiment')
feature_matrix_valid,sentiment_valid = get_numpy_data(data_validation,
                                                     important_words,
                                                     'sentiment')

In [57]:
# Report the matrix shapes. A single pre-formatted string prints the same
# text on Python 2 and Python 3; the print statement is Python 2 only.
print("feature_matrix_train %s" % (feature_matrix_train.shape,))
print("feature_matrix_valid %s" % (feature_matrix_valid.shape,))

feature_matrix_train (42361, 194)
feature_matrix_valid (10711, 194)


In [58]:
def get_prob(feature_matrix,weight):
    """Apply the logistic function to the linear score of each row.

    Computes ``1 / (1 + exp(-score))`` where ``score`` is the dot product
    of ``feature_matrix`` and ``weight``.
    """
    negated_score = -np.dot(feature_matrix,weight)
    denominator = 1+np.exp(negated_score)
    return 1/denominator

In [61]:
def feature_derivative(errors,feature):
    """Return the dot product of the transposed ``errors`` with ``feature``."""
    transposed_errors = errors.transpose()
    derivative = np.dot(transposed_errors,feature)
    return derivative

In [87]:
def feature_derivative_with_L2(errors,feature,coeff,l2_penalty,
                              feature_is_constant):
    """Return the feature derivative minus the L2 penalty term.

    The penalty term is ``2 * l2_penalty * coeff`` with entry 0 set to 0,
    so the first coefficient is never penalized (presumably the intercept;
    confirm against how the feature matrix is built).

    NOTE(review): ``feature_is_constant`` is accepted but never read; entry 0
    is exempted from the penalty regardless of its value.
    """
    unregularized = feature_derivative(errors,feature)
    penalty_gradient = 2*l2_penalty*coeff
    penalty_gradient[0] = 0
    return unregularized - penalty_gradient

In [88]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    """Return the L2-penalized log likelihood of the coefficients.

    The value is ``sum((1[y == +1] - 1) * score - log(1 + exp(-score)))``
    minus ``l2_penalty * sum(coefficients[1:] ** 2)``, where ``score`` is
    ``feature_matrix . coefficients``. The first coefficient is excluded
    from the penalty.

    Parameters
    ----------
    feature_matrix : array-like, 2-D
        One row per example.
    sentiment : array-like
        Labels; entries equal to +1 count as positive.
    coefficients : numpy.ndarray
        One coefficient per column of ``feature_matrix``.
    l2_penalty : float
        Weight of the squared-coefficient penalty.

    Returns
    -------
    float
        The penalized log likelihood.
    """
    # asarray: with a plain list, `sentiment == +1` would be a single False
    # instead of an element-wise comparison.
    indicator = (np.asarray(sentiment)==+1)
    scores = np.dot(feature_matrix, coefficients)

    # log(1 + exp(-s)) == logaddexp(0, -s); the direct form overflows to inf
    # (making the whole likelihood -inf) when a score is very negative.
    log_terms = (indicator-1)*scores - np.logaddexp(0, -scores)
    penalty = l2_penalty*np.sum(coefficients[1:]**2)

    return np.sum(log_terms) - penalty

In [89]:
def logistic_regression_with_l2(feature_matrix,sentiment,initial_coeff,
                               step_size,l2_penalty,max_iter):
    """Fit coefficients by gradient ascent on the L2-penalized log likelihood.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        One row per example.
    sentiment : array-like
        Labels; entries equal to 1 are treated as the positive class.
    initial_coeff : numpy.ndarray
        Starting coefficients; not modified in place.
    step_size : float
        Multiplier applied to the gradient at every step.
    l2_penalty : float
        L2 penalty weight passed to the gradient and likelihood helpers.
    max_iter : int
        Number of update steps to run.

    Returns
    -------
    numpy.ndarray
        Coefficients after ``max_iter`` updates (``initial_coeff`` itself
        when ``max_iter`` is 0).

    Every 50th iteration the current penalized log likelihood is printed.
    """
    # 0/1 indicator of the positive class. It does not change between
    # iterations, so it is built once; as an array it also works on Python 3,
    # where map() returns an iterator that cannot be subtracted from.
    y = np.where(np.asarray(sentiment) == 1, 1, 0)
    coeff = initial_coeff
    for i in range(max_iter):
        prediction = get_prob(feature_matrix,coeff)
        error = y - prediction
        grad = feature_derivative_with_L2(error,feature_matrix,coeff,l2_penalty,
                                         'ones')
        coeff = coeff + step_size * grad
        if i%50 == 0:
            # Only evaluate the likelihood when it is actually reported.
            likelihood = compute_log_likelihood_with_L2(feature_matrix,sentiment,
                                                        coeff,l2_penalty)
            # Single pre-formatted string: same text on Python 2 and 3.
            print("l2_penalty :  %s Iteration :  %s Likelihood :  %s"
                  % (l2_penalty, i, likelihood))
    return coeff
        

In [90]:
# Train one model per L2 penalty on the training data.
feature_matrix = feature_matrix_train
sentiment = sentiment_train
# One zero per feature column, taken from the matrix instead of a hard-coded
# 194, and float-typed so gradient updates never need a dtype change.
initial_coeff = np.zeros(feature_matrix.shape[1])
step_size = 5e-6
max_iter = 501
l2 = [0,4,10,1e2,1e3,1e5]
# W[k] holds the coefficients learned with penalty l2[k].
W = [logistic_regression_with_l2(feature_matrix,sentiment,initial_coeff,
                                step_size,penalty,max_iter) for penalty in l2]

l2_penalty :  0 Iteration :  0 Likelihood :  -29179.391383
l2_penalty :  0 Iteration :  50 Likelihood :  -24509.6359003
l2_penalty :  0 Iteration :  100 Likelihood :  -22794.9097492
l2_penalty :  0 Iteration :  150 Likelihood :  -21870.7744764
l2_penalty :  0 Iteration :  200 Likelihood :  -21283.2952735
l2_penalty :  0 Iteration :  250 Likelihood :  -20873.799416
l2_penalty :  0 Iteration :  300 Likelihood :  -20570.9748547
l2_penalty :  0 Iteration :  350 Likelihood :  -20337.6054165
l2_penalty :  0 Iteration :  400 Likelihood :  -20152.2146694
l2_penalty :  0 Iteration :  450 Likelihood :  -20001.4705739
l2_penalty :  0 Iteration :  500 Likelihood :  -19876.6233341
l2_penalty :  4 Iteration :  0 Likelihood :  -29179.3950818
l2_penalty :  4 Iteration :  50 Likelihood :  -24517.5209198
l2_penalty :  4 Iteration :  100 Likelihood :  -22813.4484458
l2_penalty :  4 Iteration :  150 Likelihood :  -21899.39001
l2_penalty :  4 Iteration :  200 Likelihood :  -21321.1416479
l2_penalty :  4 It

# Quiz 1)

## Likelihood increase

In [142]:
# Re-read the word list and prepend a label for the constant-column
# coefficient, so the labels line up with the learned coefficient vectors.
important_words = list(pd.read_json('../data/important_words.json')[0])
important_words = ['intercept'] + important_words
Coefficients = pd.DataFrame()
Coefficients['words'] = important_words
# One column per L2 penalty, keyed by the penalty value itself; zip pairs
# each penalty with its coefficients without a hand-maintained counter.
for penalty, weights in zip(l2, W):
    Coefficients[penalty] = weights
Coefficients

Unnamed: 0,words,0,4,10,100.0,1000.0,100000.0
0,intercept,-0.063742,-0.063143,-0.062256,-0.050438,0.000054,0.011362
1,baby,0.074073,0.073994,0.073877,0.072360,0.059752,0.001784
2,one,0.012753,0.012495,0.012115,0.007247,-0.008761,-0.001827
3,great,0.801625,0.796897,0.789935,0.701425,0.376012,0.008950
4,love,1.058554,1.050856,1.039529,0.896644,0.418354,0.009042
5,use,-0.000104,0.000163,0.000556,0.005481,0.017326,0.000418
6,would,-0.287021,-0.286027,-0.284564,-0.265993,-0.188662,-0.008127
7,like,-0.003384,-0.003442,-0.003527,-0.004635,-0.007043,-0.000827
8,easy,0.984559,0.977600,0.967362,0.838245,0.401904,0.008808
9,little,0.524419,0.521385,0.516917,0.460235,0.251221,0.005941


In [144]:
# Order the words by their coefficient under l2_penalty = 0 (ascending).
# DataFrame.sort is deprecated (it emits a warning here) and was removed in
# pandas 0.20; sort_values is its replacement.
Coefficients = Coefficients.sort_values(by=[0])
Coefficients

  if __name__ == '__main__':


Unnamed: 0,words,0,4,10,100.0,1000.0,100000.0
106,disappointed,-0.955437,-0.946980,-0.934518,-0.775625,-0.266095,-0.004013
97,money,-0.768793,-0.762734,-0.753818,-0.641406,-0.275883,-0.005487
114,return,-0.742085,-0.735502,-0.725807,-0.602646,-0.215199,-0.003730
113,waste,-0.617809,-0.612475,-0.604620,-0.505189,-0.190631,-0.003345
169,returned,-0.572707,-0.567518,-0.559870,-0.462056,-0.150021,-0.002225
172,broke,-0.555195,-0.550209,-0.542861,-0.448989,-0.148726,-0.002182
78,work,-0.526716,-0.522912,-0.517312,-0.446447,-0.207047,-0.004781
100,thought,-0.477856,-0.473951,-0.468200,-0.395172,-0.157221,-0.003079
176,idea,-0.465370,-0.461130,-0.454879,-0.374890,-0.118469,-0.001627
134,cheap,-0.458912,-0.454909,-0.449010,-0.373794,-0.131506,-0.002066


In [151]:
# The table is sorted ascending by the l2_penalty = 0 column, so the first
# five rows are the most negative words and the last five the most positive.
negative_words_table = Coefficients[['words',0]][0:5]
positive_words_table = Coefficients[['words',0]][-5:]
# print() with one argument behaves identically on Python 2 and Python 3.
print(positive_words_table)
print(negative_words_table)

      words         0
3     great  0.801625
34  perfect  0.835693
8      easy  0.984559
23    loves  1.052484
4      love  1.058554
            words         0
106  disappointed -0.955437
97          money -0.768793
114        return -0.742085
113         waste -0.617809
169      returned -0.572707


In [152]:
# Extract just the 'words' column from each of the two tables.
positive_words = positive_words_table['words']
negative_words = negative_words_table['words']

In [None]:
for i in l2:
    data_train['prediction'+str(i)] = np.do