In [1]:
from __future__ import division
import graphlab
import math
import string

In [2]:
# Load the review data as an SFrame from the relative path 'amazon_baby.gl/'.
products = graphlab.SFrame('amazon_baby.gl/')

2016-04-07 12:20:24,327 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1460046022.log


This non-commercial license of GraphLab Create is assigned to jy14f@my.fsu.edu and will expire on March 28, 2017. For commercial licensing options, visit https://dato.com/buy/.


In [3]:
def remove_punctuation(text):
    """Return `text` with every character found in `string.punctuation` removed.

    Works character by character, so it accepts both byte strings and
    unicode strings; the previous ``text.translate(None, ...)`` form is only
    valid for Python 2 byte strings.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        `text` without ASCII punctuation; all other characters are kept in
        their original order.
    """
    # `string` is imported once at the top of the notebook; no local import needed.
    return ''.join(ch for ch in text if ch not in string.punctuation)
# Apply remove_punctuation to every entry of the 'review' column, then store
# the result of graphlab.text_analytics.count_words as the 'word_count' column.
# NOTE(review): 'puctuation' in the variable name is a typo for 'punctuation';
# kept unchanged because the name is a notebook-global.
review_without_puctuation = products['review'].apply(remove_punctuation)
products['word_count'] = graphlab.text_analytics.count_words(review_without_puctuation)

In [4]:
# Keep only the rows whose rating is not 3, then display the remaining row count.
# Note: this rebinds `products`, so the unfiltered frame is no longer reachable
# under that name.
products = products[products['rating'] != 3]
len(products)

166752

In [5]:
# Binary label per review: +1 when rating > 3, otherwise -1.
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [6]:
# Split the labelled reviews into a training and a test frame; .8 is the
# fraction passed to random_split and seed=1 pins the random split.
train_data, test_data = products.random_split(.8, seed=1)
# Row counts of each part.
print len(train_data)
print len(test_data)

133416
33336


In [7]:
# Fit a logistic classifier on the training frame that predicts 'sentiment'
# using the 'word_count' column as its only feature column.
# validation_set=None is passed explicitly — presumably to skip any automatic
# hold-out split; confirm against the GraphLab Create docs.
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                      target = 'sentiment',
                                                      features=['word_count'],
                                                      validation_set=None)

In [8]:
# Keep a handle on the fitted model's coefficient table and list its columns.
weights = sentiment_model.coefficients
weights.column_names()

['name', 'index', 'class', 'value', 'stderr']

In [9]:
# Count coefficients by sign. Note the comparison is >= 0, so coefficients that
# are exactly zero are counted as "positive" here.
num_positive_weights = (weights['value']>=0).sum()
num_negative_weights = (weights['value']<0).sum()

# Report both counts.
print "Number of positive weights: %s " % num_positive_weights
print "Number of negative weights: %s " % num_negative_weights

Number of positive weights: 68419 
Number of negative weights: 53294 


In [10]:
import numpy as np

In [11]:
# Predict a class label for every test review with a single batched call.
# The earlier form invoked sentiment_model.predict once per row from a Python
# lambda, which does the same work with far more per-call overhead.
test_data['scores'] = sentiment_model.predict(test_data)
# Display the frame so the new 'scores' column can be inspected.
test_data

name,review,rating,word_count,sentiment,scores
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,"{'all': 1, 'standarad': 1, 'another': 1, 'when': ...",1,1.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4.0,"{'all': 2, 'nannys': 1, 'just': 1, 'food': 1, ...",1,1.0
Nature's Lullabies First Year Sticker Calendar ...,"I love this little calender, you can keep ...",5.0,"{'and': 1, 'babys': 1, 'love': 1, 'like': 1, ...",1,-1.0
Nature's Lullabies Second Year Sticker Calendar ...,"I had a hard time finding a second year calendar, ...",5.0,"{'and': 3, 'all': 1, 'later': 1, 'reference': ...",1,1.0
"Lamaze Peekaboo, I Love You ...","One of baby's first and favorite books, and i ...",4.0,"{'and': 2, 'because': 1, 'just': 1, 'less': 1, ...",1,1.0
"Lamaze Peekaboo, I Love You ...",My son loved this book as an infant. It was ...,5.0,"{'infant': 1, 'being': 1, 'all': 1, 'course': 1, ...",1,1.0
"Lamaze Peekaboo, I Love You ...",Our baby loves this book & has loved it for a ...,5.0,"{'and': 1, 'own': 1, 'it': 3, 'our': 1, 'f ...",1,1.0
"SoftPlay Giggle Jiggle Funbook, Happy Bear ...",This bear is absolutely adorable and I would ...,2.0,"{'and': 3, 'rating': 1, 'have': 1, 'just': 1, ...",-1,1.0
SoftPlay Peek-A-Boo Where's Elmo A Childr ...,I bought two for recent baby showers! The book ...,5.0,"{'and': 2, 'beautiful': 1, 'love': 1, 'elmo': 1, ...",1,1.0
Baby's First Year Undated Wall Calendar with ...,I searched high and low for a first year cale ...,5.0,"{'remembering': 1, 'and': 4, 'year': 1, 'am': 1, ...",1,1.0


In [12]:
# Plain assignment: `tmp` is a second name for the same object as test_data,
# not a copy.
tmp=test_data

In [13]:
# Convert the frame with SFrame.to_dataframe (called through the class, with
# the frame passed explicitly as the argument).
pd_test=graphlab.SFrame.to_dataframe(tmp)

In [14]:
# Accuracy on the converted frame: number of rows where sentiment equals scores,
# divided by the test row count. True division comes from the
# `from __future__ import division` in the first cell.
(pd_test.sentiment==pd_test.scores).sum()/test_data.shape[0]

0.91453683705303579

In [15]:
# Same accuracy computed directly on the test_data columns, without the
# DataFrame conversion.
(test_data['scores']==test_data['sentiment']).sum()/test_data.shape[0]

0.9145368370530358

In [17]:
def get_classification_accuracy(model, data, true_labels):
    """Return the fraction of rows in `data` that `model` labels correctly.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(data)`` that returns one predicted label
        per row of `data`.
    data : table-like
        The rows to classify. Must support ``len()`` and be non-empty.
        It is not modified by this function.
    true_labels : column-like
        The expected labels, aligned row-for-row with `data`; must support
        element-wise ``==`` against the predictions.

    Returns
    -------
    float
        Number of predictions equal to `true_labels`, divided by ``len(data)``.
    """
    # One batched predict call; the result stays local so the caller's frame
    # does not silently gain or overwrite a 'scores' column.
    predictions = model.predict(data)

    # Element-wise comparison, then count the matches.
    num_correct = (predictions == true_labels).sum()

    # float() keeps this a true division even where
    # `from __future__ import division` is not in effect.
    return float(num_correct) / len(data)

In [24]:
# Accuracy of the full word-count model on the test set.
get_classification_accuracy(sentiment_model, test_data, test_data['sentiment'])

0.9145368370530358

In [18]:
# The 20 words used as the reduced vocabulary for the simpler model.
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [19]:
# Derive a 'word_count_subset' column for both splits by trimming each
# word_count dict against significant_words. exclude=False is passed so the
# listed keys are the ones retained (the fitted model below reports 20
# unpacked features, matching the 20 words).
train_data['word_count_subset'] = train_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)
test_data['word_count_subset'] = test_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)

In [20]:
# Fit a second logistic classifier on the same training rows, this time using
# only the 'word_count_subset' column as the feature column.
simple_model = graphlab.logistic_classifier.create(train_data,
                                                   target = 'sentiment',
                                                   features=['word_count_subset'],
                                                   validation_set=None)
# Display the model summary.
simple_model

Class                         : LogisticClassifier

Schema
------
Number of coefficients        : 21
Number of examples            : 133416
Number of classes             : 2
Number of feature columns     : 1
Number of unpacked features   : 20

Hyperparameters
---------------
L1 penalty                    : 0.0
L2 penalty                    : 0.01

Training Summary
----------------
Solver                        : newton
Solver iterations             : 6
Solver status                 : SUCCESS: Optimal solution found.
Training time (sec)           : 1.0865

Settings
--------
Log-likelihood                : 44323.7254

Highest Positive Coefficients
-----------------------------
word_count_subset[loves]      : 1.6773
word_count_subset[perfect]    : 1.5145
word_count_subset[love]       : 1.3654
(intercept)                   : 1.2995
word_count_subset[easy]       : 1.1937

Lowest Negative Coefficients
----------------------------
word_count_subset[disappointed]: -2.3551
word_count_subset[ret

In [30]:
# Accuracy of the reduced-vocabulary model on the same test set.
get_classification_accuracy(simple_model, test_data, test_data['sentiment'])

0.8693004559635229

In [31]:
# Display the simple model's coefficient table.
simple_model.coefficients

name,index,class,value,stderr
(intercept),,1,1.2995449552,0.0120888541331
word_count_subset,disappointed,1,-2.35509250061,0.0504149888557
word_count_subset,love,1,1.36543549368,0.0303546295109
word_count_subset,little,1,0.520628636025,0.0214691475665
word_count_subset,loves,1,1.67727145556,0.0482328275384
word_count_subset,product,1,-0.320555492996,0.0154311321362
word_count_subset,well,1,0.504256746398,0.021381300631
word_count_subset,great,1,0.94469126948,0.0209509926591
word_count_subset,easy,1,1.19366189833,0.029288869202
word_count_subset,work,1,-0.621700012425,0.0230330597946


In [32]:
# Print all 21 coefficient rows ordered from the largest value to the smallest.
simple_model.coefficients.sort('value', ascending=False).print_rows(num_rows=21)

+-------------------+--------------+-------+-----------------+-----------------+
|        name       |    index     | class |      value      |      stderr     |
+-------------------+--------------+-------+-----------------+-----------------+
| word_count_subset |    loves     |   1   |  1.67727145556  | 0.0482328275384 |
| word_count_subset |   perfect    |   1   |  1.51448626703  |  0.049861952294 |
| word_count_subset |     love     |   1   |  1.36543549368  | 0.0303546295109 |
|    (intercept)    |     None     |   1   |   1.2995449552  | 0.0120888541331 |
| word_count_subset |     easy     |   1   |  1.19366189833  |  0.029288869202 |
| word_count_subset |    great     |   1   |  0.94469126948  | 0.0209509926591 |
| word_count_subset |    little    |   1   |  0.520628636025 | 0.0214691475665 |
| word_count_subset |     well     |   1   |  0.504256746398 |  0.021381300631 |
| word_count_subset |     able     |   1   |  0.191438302295 | 0.0337581955697 |
| word_count_subset |     ol

In [33]:
# Element-wise flag over the sorted coefficients: marks the entries whose
# value is <= 0.
(simple_model.coefficients.sort('value', ascending=False)['value']<=0)

dtype: int
Rows: 21
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [34]:
# Rows of the simple model's coefficient table with a strictly positive value;
# `l` holds the 'index' entry of each of those rows.
a=simple_model.coefficients[simple_model.coefficients['value']>0]
l=a['index']

In [35]:
# Same selection for the full model: strictly positive coefficient rows, with
# `l2` holding their 'index' entries.
b=sentiment_model.coefficients[sentiment_model.coefficients['value']>0]
l2=b['index']

In [36]:
# For each positive-coefficient entry of the simple model, print whether it is
# also among the positive-coefficient entries of the full model.
for i in l:
    print  i in l2

True
True
True
True
True
True
True
True
True
True
True


In [37]:
# Class balance of the training set: how many rows are labelled +1 and -1.
num_positive  = (train_data['sentiment'] == +1).sum()
num_negative = (train_data['sentiment'] == -1).sum()
print num_positive
print num_negative

112164
21252


In [38]:
# Class balance of the test set. Note: this rebinds num_positive and
# num_negative, replacing the training-set counts held under the same names.
num_positive  = (test_data['sentiment'] == +1).sum()
num_negative = (test_data['sentiment'] == -1).sum()
print num_positive
print num_negative

28095
5241
