In [18]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0. `read_csv` with
# index_col=0 keeps the first CSV column (the product name) as the index, as before.
# from_csv also tried to parse the index as dates; the index here holds product names,
# so that step is intentionally not reproduced.
products = pd.read_csv('amazon_baby_subset.csv', index_col=0)

In [4]:
# Preview the first ten reviews.
products.head(10)

Unnamed: 0_level_0,review,rating,sentiment
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book,All of my kids have cried non-stop when I trie...,5,1
Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
SoftPlay Peek-A-Boo Where's Elmo A Children's Book,Very cute interactive book! My son loves this ...,5,1
Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
Hunnt&reg; Falling Flowers and Birds Kids Nursery Home Decor Vinyl Mural Art Wall Paper Stickers,"Try this out for a spring project !Easy ,fun a...",5,1
Blessed By Pope Benedict XVI Divine Mercy Full Color Medal,very nice Divine Mercy Pendant of Jesus now on...,5,1
Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black),We bought the pins as my 6 year old Autistic s...,4,1
Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black),It has been many years since we needed diaper ...,5,1


In [10]:
# (rows, columns) of the reviews labelled positive (+1).
products.loc[products['sentiment'] == 1].shape

(26579, 3)

In [11]:
# (rows, columns) of the reviews labelled negative (-1).
products.loc[products['sentiment'] == -1].shape

(26493, 3)

In [13]:
# Vocabulary used to build the word-count features; the words are read from column 0 below.
important_words = pd.read_json(path_or_buf='important_words.json')

In [14]:
# Data cleaning: replace missing review text with an empty string so the
# string operations applied to 'review' later never see NaN.
products = products.fillna(value={'review': ''})

In [19]:
# Character class matching any single character listed in string.punctuation.
regex = re.compile('[%s]' % re.escape(string.punctuation))


def remove_punctuation(text):
    """Return `text` with every character from string.punctuation deleted."""
    stripped = regex.sub('', text)
    return stripped
# Punctuation-free copy of every review, stored alongside the original text.
products['clean_review'] = products['review'].map(remove_punctuation)

In [29]:
# Feature extraction: one column per important word holding how many times that
# word occurs (as a whitespace-delimited token) in each cleaned review.
# Each review is tokenized once up front instead of being re-split for every word.
review_tokens = products['clean_review'].apply(str.split)
for word in important_words[0]:
    # Bind `word` as a default so the lambda does not depend on the loop variable's later value.
    products[word] = review_tokens.apply(lambda tokens, word=word: tokens.count(word))

In [36]:
# Presence indicators: contains_<word> is 1 when the word occurs at least once
# in the review and 0 otherwise.
for word in important_words[0]:
    has_word = products[word] >= 1
    products['contains_{}'.format(word)] = has_word.astype('int64')

In [45]:
# Quiz Question. How many reviews contain the word perfect?
# Summing the indicator over the rows where it equals 1 counts those rows.
products.loc[products['contains_perfect'] == 1, 'contains_perfect'].sum()

2955

In [85]:
# convert dataframe to multi-dimensional array
def get_numpy_data(dataframe, features, label):
    """Build the float feature matrix and label vector used for model fitting.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table; it is read but not modified.
    features : sequence of column names
        Columns to use as features, in order.
    label : column name
        Column holding the target values.

    Returns
    -------
    (feature_matrix, label_array) : tuple of numpy.ndarray
        `feature_matrix` has one row per dataframe row; column 0 is a constant 1.0
        (intercept term) followed by the requested feature columns as floats.
        `label_array` is the label column as a 1-D float array.
    """
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    feature_values = dataframe[list(features)].values.astype(float)
    # Prepend the intercept column here rather than writing a 'constant' column
    # into the caller's dataframe.
    intercept = np.ones((feature_values.shape[0], 1))
    feature_matrix = np.hstack([intercept, feature_values])
    label_array = dataframe[label].values.astype(float)
    return (feature_matrix, label_array)

In [86]:
# Plain Python list of the important words, used as the feature column names.
features = important_words[0].tolist()

In [87]:
# Feature matrix (with intercept column) and sentiment labels for all reviews.
feature_matrix, label_array = get_numpy_data(
    dataframe=products,
    features=features,
    label='sentiment',
)

In [88]:
# Quiz Question: How many features are there in the feature_matrix?
# The second axis of the matrix is the feature (column) axis.
np.shape(feature_matrix)[1]

194

In [57]:
# Quiz Question: Assuming that the intercept is present, how does the number of features in feature_matrix relate to the number of features in the logistic regression model?

In [155]:
def predict_probability(feature_matrix, coefficients):
    """Apply the logistic (sigmoid) link to the linear scores.

    Computes score = feature_matrix . coefficients and returns
    1 / (1 + exp(-score)) for each score.
    """
    linear_scores = np.dot(feature_matrix, coefficients)
    denominator = np.exp(-linear_scores) + 1
    return 1 / denominator

In [156]:
def feature_derivative(errors, feature):
    """Return the dot product of one feature column with the error vector.

    This is the partial derivative of the log likelihood with respect to the
    coefficient of that feature.
    """
    return np.dot(feature, errors)

In [157]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Return the log likelihood of the labels under the logistic model.

    Parameters
    ----------
    feature_matrix : array-like, one row per example
    sentiment : array-like of labels; entries equal to 1 are the positive class
    coefficients : array-like of model weights, one per feature column

    Returns
    -------
    float
        sum over examples of (1[y == +1] - 1) * score - log(1 + exp(-score)),
        where score = feature_matrix . coefficients.
    """
    indicators = (np.asarray(sentiment) == 1)
    scores = np.dot(feature_matrix, coefficients)
    # log(1 + exp(-s)) evaluated as logaddexp(0, -s): the naive form overflows to
    # +inf for very negative scores, which would make the whole sum -inf.
    log_one_plus_exp = np.logaddexp(0, -scores)
    lp = np.sum((indicators - 1) * scores - log_one_plus_exp)
    return lp

In [158]:
def logistic_regression(feature_matrix, sentiment, inital_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Parameters
    ----------
    feature_matrix : 2-D numpy array, one row per example and one column per feature
    sentiment : numpy array of labels; entries equal to 1 are the positive class
    inital_coefficients : array-like starting weights, one per feature column.
        (The misspelled name is kept so existing keyword callers continue to work.)
    step_size : multiplier applied to each partial derivative
    max_iter : number of gradient-ascent iterations to run

    Returns
    -------
    numpy.ndarray
        The coefficients after `max_iter` updates. The input array is not modified.

    Progress (the current log likelihood) is printed every 100 iterations,
    starting with the first.
    """
    D = feature_matrix.shape[1]
    # Use the argument that was passed in (previously a same-named global was read
    # instead), and take a float copy so the in-place updates below neither touch
    # the caller's array nor get truncated by an integer dtype.
    coefficients = np.array(inital_coefficients, dtype=float)
    # The positive-class indicator does not change between iterations.
    indicators = (sentiment == 1)
    for n in range(max_iter):
        # Predictions and errors are computed once per iteration, so every
        # coefficient is updated from the same snapshot of the model.
        predictions = predict_probability(feature_matrix, coefficients)
        errors = indicators - predictions
        for i in range(D):
            feature = feature_matrix[:, i]
            derivative = feature_derivative(errors, feature)
            coefficients[i] += step_size*derivative
        if n%100 == 0:
            print('Running {}th iteration, current log-likehood is {}'.format(
                    n+1, compute_log_likelihood(feature_matrix, sentiment, coefficients)))
    return coefficients

In [159]:
# Inputs for gradient ascent. `feature_matrix` is used as already built above;
# re-assigning it to itself had no effect, so that statement is omitted.
sentiment = label_array
# Start from all-zero weights, one per feature column (including the intercept).
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 1e-7
max_iter = 301

In [160]:
# Quiz question: As each iteration of gradient ascent passes, does the log likelihood increase or decrease?
coefficients = logistic_regression(
    feature_matrix,
    sentiment,
    initial_coefficients,
    step_size=step_size,
    max_iter=max_iter,
)

Running 1th iteration, current log-likehood is -36780.91768478126
Running 101th iteration, current log-likehood is -36230.46102346926
Running 201th iteration, current log-likehood is -35728.89418769386
Running 301th iteration, current log-likehood is -35268.51212682766


In [161]:
# Quiz question: How many reviews were predicted to have positive sentiment?
# A review is predicted positive when its linear score is strictly above zero.
(np.dot(feature_matrix, coefficients) > 0).sum()

25126

In [162]:
# 1 where the linear score is strictly positive, 0 otherwise.
predictions = np.greater(np.dot(feature_matrix, coefficients), 0).astype(int)

In [163]:
# Map the 0/1 predictions onto the -1/+1 label convention, in place.
# Entries equal to 1 already carry the positive label, so only the zeros change.
predictions[predictions == 0] = -1

In [164]:
# Quiz question: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)
# Accuracy is the fraction of predictions that equal the true label.
np.mean(predictions == label_array)

0.7518653904130238

In [165]:
# Pair every important word with its learned weight, most positive first.
# coefficients[0] is the intercept, so the word weights start at index 1. The slice is
# taken inline rather than rebinding `coefficients`: re-running this cell then yields the
# same pairing, and `coefficients` still lines up with the columns of feature_matrix.
word_coefficient_tuple = sorted(
    zip(important_words[0], coefficients[1:]),
    key=lambda pair: pair[1],
    reverse=True,
)

In [166]:
# Ten words with the largest (most positive) coefficients.
word_coefficient_tuple[0:10]

[('great', 0.066546084170457695),
 ('love', 0.065890762922123258),
 ('easy', 0.06479458680257838),
 ('little', 0.045435626308421372),
 ('loves', 0.044976401394906038),
 ('well', 0.030135001092107074),
 ('perfect', 0.029739937104968462),
 ('old', 0.020077541034775381),
 ('nice', 0.018408707995268992),
 ('daughter', 0.01770319990570169)]

In [168]:
# Ten words with the smallest (most negative) coefficients, most negative first.
word_coefficient_tuple[-10:][::-1]

[('would', -0.053860148445203121),
 ('product', -0.041511033392108897),
 ('money', -0.038982037286487109),
 ('work', -0.033069515294752737),
 ('even', -0.030051249236035808),
 ('disappointed', -0.028978976142317068),
 ('get', -0.028711552980192588),
 ('back', -0.027742697230661327),
 ('return', -0.026592778462247283),
 ('monitor', -0.024482100545891717)]