# Need to find out what leads to the quite different coefficients produced by the two packages

# Simple logistic regression for product reviews

In [110]:
## Load data
import pandas as pd
import sframe

In [111]:
from sklearn.cross_validation import train_test_split

In [112]:
import json

In [167]:
from sklearn.feature_extraction.text import CountVectorizer

In [154]:
# Load the reviews into a pandas DataFrame (the pandas/scikit-learn side of the comparison)
records = pd.read_csv('./amazon_baby.csv')

In [114]:
# Load the reviews into an SFrame from the './amazon_baby.gl/' directory
products = sframe.SFrame('./amazon_baby.gl/')

## 1. Data pre-processing

In [115]:
def remove_puncs(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Parameters
    ----------
    text : str
        Raw review text. Anything that is not a string (``None``, or the
        float NaN that pandas uses for a missing value) is treated as an
        empty review.

    Returns
    -------
    str
        ``text`` without any character listed in ``string.punctuation``;
        letters, digits and whitespace are left untouched. An empty string
        is returned for non-string input.
    """
    import string

    try:
        text_types = (basestring,)  # Python 2: covers both str and unicode
    except NameError:
        text_types = (str,)  # Python 3
    if not isinstance(text, text_types):
        return ''
    # A per-character filter works for byte and unicode strings alike, unlike
    # the two-argument ``str.translate(None, chars)`` form, which only exists
    # for Python 2 byte strings.
    return ''.join(ch for ch in text if ch not in string.punctuation)

In [116]:
# Add a punctuation-free copy of each review to the SFrame
products['review_clean'] = products['review'].apply(remove_puncs)

In [155]:
# Fill missing values in the review column, then strip punctuation from the text.
# fillna returns a new Series rather than modifying in place, so assign it back.
records['review'] = records['review'].fillna('')
# Clean the pandas frame's own reviews (not the SFrame's) so rows stay aligned.
records['review_clean'] = records['review'].apply(remove_puncs)

In [156]:
# Preview the first rows to check the cleaned review text
records.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


In [157]:
# Eliminate the reviews whose rating equals 3 since they tend to be neutral.
# .copy() makes the result an independent frame, so columns added to it later
# do not trigger pandas' SettingWithCopyWarning.
records = records[records['rating'] != 3].copy()

In [158]:
# Label every review: +1 when the rating is above 3, -1 otherwise
records['sentiment'] = records['rating'].map(
    lambda rating: 1 if rating > 3 else -1
)

## 2. Dataset split

In [160]:
# Row indices of the predefined train / test split, each read from a JSON file
train_idx = list(
    pd.read_json('module-2-assignment-train-idx.json', typ='series')
)
test_idx = list(
    pd.read_json('module-2-assignment-test-idx.json', typ='series')
)

In [163]:
# Renumber the rows 0..n-1 after filtering so the index values loaded into
# train_idx / test_idx can be matched against records.index.
# drop=True discards the old index instead of adding and then dropping an 'index' column.
records = records.reset_index(drop=True)

In [164]:
# Select the rows whose index label appears in the predefined split lists
train_data = records.loc[records.index.isin(train_idx)]
test_data = records.loc[records.index.isin(test_idx)]

In [255]:
# Majority classifier: number of positive (+1) reviews
len(records[records['sentiment'] == 1])

140259

In [256]:
# Number of negative (-1) reviews
len(records[records['sentiment'] == -1])

26493

## 3. Build the word count vector

In [169]:
# Initialize the word-count vectorizer.
# The token pattern matches any run of one or more word characters between
# word boundaries, so single-character tokens are counted as well.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

In [175]:
# Fit the vocabulary on the training reviews and build their word-count matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])

In [176]:
# Encode the test reviews with the already-fitted vocabulary (transform only, no refit)
test_matrix = vectorizer.transform(test_data['review_clean'])

## 4. Build a LogisticRegression Classifier

In [177]:
from sklearn.linear_model import  LogisticRegression

In [371]:
# Only fit_intercept and n_jobs are set here; every other hyper-parameter keeps
# its default. The fitted-model repr in this notebook shows C=1.0, penalty='l2'
# and solver='liblinear'.
# NOTE(review): L2 regularisation is therefore active - possibly one reason the
# coefficients differ from the other package's; confirm by comparing penalties.
sentiment_model = LogisticRegression(fit_intercept=True,n_jobs=-1)

In [372]:
# Train on the training word counts against the +1 / -1 sentiment labels
sentiment_model.fit(train_matrix,train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [383]:
# Intercept term of the fitted model
sentiment_model.intercept_

array([ 1.37952885])

In [386]:
# Flatten the fitted coefficient array into a one-column DataFrame
coef = pd.DataFrame(
    sentiment_model.coef_.ravel(),
    columns=['Coef'],
)

In [387]:
# (rows, columns) of the strictly positive coefficients; zero-valued weights are excluded
coef[coef['Coef'] > 0].shape

(86834, 1)

In [389]:
# Coefficients ordered from largest to smallest.
# ascending is passed by keyword: the positional form (.., 0, 0) is unreadable
# and is rejected by current pandas, where these arguments are keyword-only.
coef.sort_values('Coef', ascending=False)

Unnamed: 0,Coef
80934,2.014953
10112,2.011975
81177,1.993692
40372,1.950822
36155,1.950596
91891,1.905790
89901,1.865395
78982,1.865357
76453,1.835489
47504,1.834627


## 5. Make small sample predictions

In [217]:
# Take three test reviews (positions 10, 11 and 12) as a small sample
sample_test_data = test_data.iloc[10:13]
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
53,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
64,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
82,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [216]:
# Encode the sample reviews with the fitted vocabulary and compute their
# decision-function scores (positive score -> +1, negative score -> -1).
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
# Function-call form prints the same on Python 2 and is valid on Python 3.
print(scores)

[  5.60132837  -3.16941194 -10.42411065]


In [218]:
# Predicted class labels for the three sample reviews
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1], dtype=int64)

In [247]:
# Class labels in the order the model stores them; predict_proba columns follow this order
sentiment_model.classes_

array([-1,  1], dtype=int64)

In [267]:
# Class probabilities for the sample reviews, one row per review and one column
# per entry of sentiment_model.classes_.
# NOTE(review): `a` is not referenced again in the visible cells - consider a
# descriptive name or displaying the result.
a = sentiment_model.predict_proba(sample_test_matrix)

## 6. Apply model to predict on the whole test set

In [334]:
# Replace any missing value with an empty string (applies to every column, not only the text ones)
test_data = test_data.fillna('')

In [335]:
# Per-column count of missing values in the test set
test_data.isnull().sum()

name            0
review          0
rating          0
review_clean    0
sentiment       0
dtype: int64

In [336]:
# Rebuild the test word-count matrix from the current test_data using the fitted vocabulary
test_matrix = vectorizer.transform(test_data['review_clean'])

In [337]:
# Class probabilities for every test review. The column names assume the
# class order [-1, 1] shown by sentiment_model.classes_ in this notebook:
# first column = negative, second column = positive.
sentiment_predc = pd.DataFrame(sentiment_model.predict_proba(test_matrix),columns=['Neg','Pos'])

In [342]:
# Renumber the test rows 0..n-1 so they line up with the default integer index
# of sentiment_predc when its column is assigned by index alignment.
# drop=True discards the old index instead of adding and then dropping an 'index' column.
test_data = test_data.reset_index(drop=True)

In [343]:
# Attach the positive-class probability to each test review (pandas aligns the rows on index)
test_data['prob_positive'] = sentiment_predc['Pos']

In [346]:
# The 20 reviews with the highest predicted positive probability, shown by
# rating in descending order. sort_values replaces the deprecated
# DataFrame.sort, and ascending is passed by keyword instead of as (.., 0, 0).
(
    test_data
    .sort_values('prob_positive', ascending=False)
    .head(20)
    .sort_values('rating', ascending=False)
)

  if __name__ == '__main__':


Unnamed: 0,name,review,rating,review_clean,sentiment,prob_positive
11923,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1,1.0
30634,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1,1.0
30076,Ikea 36 Pcs Kalas Kids Plastic BPA Free Flatwa...,For the price this set is unbelievable- and tr...,5,For the price this set is unbelievable and tru...,1,1.0
17558,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1,1.0
25554,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,1.0
9555,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1,1.0
20743,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1,1.0
21531,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1,1.0
18112,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,1.0
26830,"Baby Jogger City Mini GT Single Stroller, Shad...","Amazing, Love, Love, Love it !!! All 5 STARS a...",5,Amazing Love Love Love it All 5 STARS all the...,1,1.0


In [345]:
# Per-column count of missing values after adding prob_positive
test_data.isnull().sum()

name             0
review           0
rating           0
review_clean     0
sentiment        0
prob_positive    0
dtype: int64