# W2_Implementing Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import string

In [2]:
data = pd.read_csv('amazon_baby_subset.csv')
data.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [3]:
# explore more of this dataset
list(data.iloc[:10]['name'])

["Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book",
 "Nature's Lullabies Second Year Sticker Calendar",
 "Nature's Lullabies Second Year Sticker Calendar",
 'Lamaze Peekaboo, I Love You',
 "SoftPlay Peek-A-Boo Where's Elmo A Children's Book",
 'Our Baby Girl Memory Book',
 'Hunnt&reg; Falling Flowers and Birds Kids Nursery Home Decor Vinyl Mural Art Wall Paper Stickers',
 'Blessed By Pope Benedict XVI Divine Mercy Full Color Medal',
 'Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)',
 'Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)']

In [4]:
print('# positive:', sum(data['sentiment']==1))
print('# negative:', sum(data['sentiment']==-1))

# positive: 26579
# negative: 26493


In [5]:
important_words = list(pd.read_json('important_words.json')[0])
important_words[:5] # 193 words

['baby', 'one', 'great', 'love', 'use']

In [6]:
# text cleaning
data = data.fillna({'review':''})

In [7]:
def remove_punctuation(text):
    table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(table)

data['cleaned_review'] = data['review'].apply(remove_punctuation)
data.head()

Unnamed: 0,name,review,rating,sentiment,cleaned_review
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


In [8]:
for word in important_words:
    data[word] = data['cleaned_review'].apply(lambda s: s.split().count(word))

In [9]:
data.head(1)

Unnamed: 0,name,review,rating,sentiment,cleaned_review,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
(data['perfect']>0).sum()

2955

In [11]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    feature_matrix = dataframe[features].values
    label_array = dataframe[label].values
    return(feature_matrix, label_array)

In [12]:
feature_matrix, label_array = get_numpy_data(data, important_words, 'sentiment')
feature_matrix.shape

(53072, 194)

In [13]:
# estimate conditional probability with link function
def predict_probability(feature_matrix, weights):
    score = np.dot(feature_matrix, weights)
    prob_predictions = 1 / (1 + np.exp(-score))
    return prob_predictions

In [14]:
# compute derivative of log likelihood with respect to a single coefficient
def feature_derivative(errors, feature):    
    derivative = np.dot(errors, feature)
    return derivative

In [15]:
# use log-likelihood to assess algorithm
def compute_log_likelihood(feature_matrix, label_array, weights):
    indicator = (label_array==1)
    scores = np.dot(feature_matrix, weights)
    lp = np.sum((indicator-1) * scores - np.log(1. + np.exp(-scores)))
    return lp

In [16]:
# implement gradient ascent
def logistic_regression(feature_matrix, label_array, init_w, step_size, iteration):
    weights = init_w
    indicator = (label_array==1)    
    itr = 0
    while itr < iteration:
        prob_predicions = predict_probability(feature_matrix, weights)
        errors = indicator - prob_predicions

        for index in range(len(weights)):
            derivative = feature_derivative(errors, feature_matrix[:, index])
            weights[index] = weights[index] + step_size * derivative
        itr += 1
        
        # check whether log likelihood is increasing
        if (itr <= 15) or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, label_array, weights)
            print('iteration {}: {}'.format(itr, lp))

    return weights

In [17]:
# run a logistic regression solver 
weights_1 = logistic_regression(feature_matrix, label_array, np.zeros(194), 1e7, 301)

  """
  after removing the cwd from sys.path.


iteration 1: -inf
iteration 2: -4760059840000000.0
iteration 3: -inf
iteration 4: -inf
iteration 5: -inf
iteration 6: -inf
iteration 7: -inf
iteration 8: -inf
iteration 9: -inf
iteration 10: -inf
iteration 11: -inf
iteration 12: -inf
iteration 13: -inf
iteration 14: -inf
iteration 15: -inf
iteration 20: -inf
iteration 30: -inf
iteration 40: -inf
iteration 50: -inf
iteration 60: -inf
iteration 70: -inf
iteration 80: -inf
iteration 90: -inf
iteration 100: -inf
iteration 200: -inf
iteration 300: -inf


In [18]:
# predict sentiments
scores = np.dot(feature_matrix, weights_1)
v_fun = np.vectorize(lambda x: 1 if x>=0 else -1)
estimated_sentiment = v_fun(scores)
# (estimated_sentiment == 1).sum()
print('# total:', feature_matrix.shape[0])
print('# positive:', sum(estimated_sentiment == 1))
print('# negative:', sum(estimated_sentiment == -1))

# total: 53072
# positive: 13590
# negative: 39482


In [19]:
# measure accuracy
accuracy = sum(estimated_sentiment == label_array) / len(label_array)
accuracy

0.71489674404582459

In [20]:
# find which words contribute most to positive & negative sentiments
weights_1_list = list(weights_1[1:]) # exclude intercept
words_weights = [(word, weight) for word, weight in zip(important_words, weights_1_list)]
words_weights = sorted(words_weights, key=lambda x:x[1], reverse=True)

In [21]:
# ten "most positive" words
words_weights[:10]

[('loves', 178740000000.0),
 ('perfect', 167770000000.0),
 ('love', 155535000000.0),
 ('best', 152965000000.0),
 ('easy', 150350000000.0),
 ('happy', 144005000000.0),
 ('fits', 138775000000.0),
 ('great', 110845000000.0),
 ('comfortable', 92660000000.0),
 ('bit', 87685000000.0)]

In [22]:
# ten "most negative" words
words_weights[-10:]

[('maybe', -101350000000.0),
 ('difficult', -108600000000.0),
 ('cheap', -111260000000.0),
 ('money', -111875000000.0),
 ('idea', -147615000000.0),
 ('waste', -163665000000.0),
 ('broke', -166175000000.0),
 ('returned', -226200000000.0),
 ('return', -229915000000.0),
 ('disappointed', -266500000000.0)]