Sentiment analysis via logistic regression

In [None]:
import pandas as pd
import json, gzip
from urllib.request import urlopen
import multiprocessing as mp
from cytoolz import *
from ftfy import fix_text

## Download and clean up text

In [None]:
url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Patio_Lawn_and_Garden_5.json.gz'

In [None]:
# Fetch the gzipped review dump and split it into one raw JSON document per line.
# The `with` block closes the HTTP response as soon as the payload has been read.
with urlopen(url) as response:
    data = gzip.decompress(response.read())
data = data.splitlines()

In [None]:
# Each line is a standalone JSON object; decode them all, then build the frame in one go.
records = [json.loads(line) for line in data]
df = pd.DataFrame(records)

In [None]:
df.groupby('overall').size()

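Get rid of four-star reviews (too wishy-washy to label confidently). What remains is split into five-star reviews ("good") and one- to three-star reviews ("bad").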

In [None]:
# Keep every rating except 4; .copy() yields an independent frame so the column
# assignments in later cells write to it directly rather than to a slice of the original.
df = df[df['overall']!=4].copy()

In [None]:
# Binary label: only ratings above 4 count as 'good', everything else is 'bad'.
df['sentiment'] = df['overall'].map(lambda stars: 'good' if stars > 4 else 'bad')

In [None]:
# Class balance of the binary labels. Uses the Series method because the top-level
# pd.value_counts() function is deprecated in recent pandas releases.
df['sentiment'].value_counts()

In [None]:
# Run ftfy's fix_text over every review text, spreading the work across a
# pool of worker processes; the pool is torn down when the block exits.
with mp.Pool() as pool:
    cleaned = pool.map(fix_text, df['reviewText'])
df['reviewText'] = cleaned

## Parse review texts

In [None]:
import spacy
from spacy import displacy
from spacy.tokens import Token
# Load the English pipeline with the named-entity recognizer disabled; the cells below
# only use tokens and dependency labels.
# NOTE(review): the 'en' shortcut name only resolves on spaCy 2.x; spaCy 3+ needs a full
# package name such as 'en_core_web_sm' — confirm which spaCy version this targets.
nlp = spacy.load('en', disable=['ner'])

In [None]:
# Parse every review once up front; list() materializes what nlp.pipe yields so the
# parsed documents can be stored in a column and reused by the feature extractors below.
df['doc'] = list(nlp.pipe(df['reviewText']))

## Baseline classifier

In [None]:
from sklearn.feature_extraction.text import *
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.dummy import *

In [None]:
# Split the row labels (df.index), not the rows themselves, 80/20. Stratifying on
# sentiment keeps the good/bad ratio the same in both parts, and the fixed
# random_state makes the split reproducible.
train,test = train_test_split(df.index,
                             test_size=0.2,
                             stratify=df['sentiment'],
                             random_state=42)

In [None]:
# Majority-class baseline: it always predicts the most frequent training label, so its
# test accuracy (in percent) is the floor that any real model has to beat.
# `strategy` is passed by keyword because scikit-learn >= 1.0 rejects it positionally.
model0 = make_pipeline(CountVectorizer(analyzer=identity),
                       DummyClassifier(strategy='most_frequent'))
model0.fit(df.loc[train, 'reviewText'], df.loc[train, 'sentiment'])
model0.score(df.loc[test, 'reviewText'], df.loc[test, 'sentiment']) * 100.

## Logistic regression

In [None]:
def tokens(doc):
    """Return the lowercased form (``lower_``) of every token in ``doc``, in order."""
    return [token.lower_ for token in doc]

# Bag-of-words input for the classifier: one list of lowercased tokens per review.
df['tokens'] = df['doc'].apply(tokens)

In [None]:
# Pipeline being tuned. The inputs are already token lists, so the vectorizer's analyzer
# is the identity function and it only has to count the tokens it is given.
# NOTE(review): `lr` was used here without being defined in any earlier cell, which fails
# with NameError on a fresh kernel. This definition is reconstructed from the step names
# referenced in `params` — confirm it matches the originally intended estimator settings.
lr = make_pipeline(CountVectorizer(analyzer=identity), LogisticRegression())

# Search regularization strength and document-frequency cut-offs with 3-fold CV,
# using every available core.
params = {'logisticregression__C': [0.01, 0.1, 1.0],
          'countvectorizer__min_df': [1, 2, 5],
          'countvectorizer__max_df': [0.5, 0.75]}
grid = GridSearchCV(lr, params, n_jobs=-1, cv=3)
grid.fit(df.loc[train]['tokens'], df.loc[train]['sentiment'])

In [None]:
grid.best_params_

In [None]:
# Apply the winning hyperparameters to `lr`, refit it on the training split, and
# report accuracy on the held-out test split in percent.
lr.set_params(**grid.best_params_)
lr.fit(df.loc[train]['tokens'], df.loc[train]['sentiment'])
lr.score(df.loc[test]['tokens'], df.loc[test]['sentiment']) * 100.

Extract features (i.e., words) with the strongest sentiment associations

In [None]:
def print_top_feats(M, k=0):
    """Print the k most positive and k most negative features of a fitted pipeline.

    M is expected to expose ``named_steps['countvectorizer']`` (the vocabulary) and
    ``named_steps['logisticregression']`` (whose first row of ``coef_`` is used).
    Each printed line pairs the i-th largest coefficient (left) with the i-th
    smallest coefficient (right). With the default ``k=0`` nothing is printed.
    """
    vectorizer = M.named_steps['countvectorizer']
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); prefer the new name and fall back for old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        V = vectorizer.get_feature_names_out()
    else:
        V = vectorizer.get_feature_names()
    coef = M.named_steps['logisticregression'].coef_[0]
    order = coef.argsort()  # feature indices sorted by ascending coefficient
    # Right-hand slice is empty when k == 0, so zip() yields no rows in that case.
    for w1, w2 in zip(order[-k:][::-1], order[:k]):
        print(f'{V[w1]:20s} {coef[w1]:6.3f} | {V[w2]:20s} {coef[w2]:6.3f}')

In [None]:
print_top_feats(lr, 20)

Check out a review that the model gets wrong

In [None]:
df.loc[8181]['sentiment'], lr.predict([df.loc[8181]['tokens']])[0]

In [None]:
df.loc[8181]['reviewText']

In [None]:
print(sorted(set(df.loc[8181]['tokens'])))

## Handling negation

In [None]:
s1 = nlp('I would never recommend this product to anyone.')
displacy.render(s1, style='dep', jupyter=True)

Tag tokens that are modified by negative words (e.g., *not*, *never*)

In [None]:
# Register a custom per-token attribute, read and written as `tok._.neg`, defaulting to
# False. force=True overwrites an existing registration so the cell can be re-run.
Token.set_extension('neg', default=False, force=True)

In [None]:
# A token attached with the 'neg' dependency label (e.g. "not", "never") negates its
# syntactic head, so set the flag on the head rather than on the negator itself.
for doc in df['doc']:
    for tok in doc:
        if tok.dep_ != 'neg':
            continue
        tok.head._.neg = True

In [None]:
def negtokens(doc):
    """Return lowercased tokens of ``doc``, prefixing those flagged ``_.neg`` with 'NOT:'."""
    tagged = []
    for tok in doc:
        prefix = 'NOT:' if tok._.neg else ''
        tagged.append(prefix + tok.lower_)
    return tagged
# One negation-aware token list per review, derived from the already-parsed docs.
df['negtokens'] = df['doc'].apply(negtokens)

In [None]:
print(df.loc[8181]['negtokens'])

In [None]:
# Refit the same pipeline (its hyperparameters were tuned on the plain tokens) on the
# negation-tagged tokens and report test accuracy in percent.
lr.fit(df.loc[train]['negtokens'], df.loc[train]['sentiment'])
lr.score(df.loc[test]['negtokens'], df.loc[test]['sentiment']) * 100.

In [None]:
lr.predict([df.loc[8181]['negtokens']])[0]

In [None]:
print_top_feats(lr, 20)

## Adding other modifiers (adjectives, adverbs)

In [None]:
displacy.render(df.loc[8181]['doc'], style='dep', jupyter=True)

In [None]:
def negify(tok):
    """Return ``tok``'s lowercased text, prefixed with 'NOT:' when ``tok._.neg`` is set."""
    if tok._.neg:
        return 'NOT:' + tok.lower_
    return tok.lower_

def mods(doc):
    """Return modifier-pair features followed by the negation-aware unigrams of ``doc``.

    For every token whose dependency label is 'amod' or 'advmod', a 'head_modifier'
    feature is emitted (both parts passed through ``negify``); the negified form of
    every token is then appended after those pairs.
    """
    modifier_deps = ['amod', 'advmod']
    pairs = [f'{negify(w.head)}_{negify(w)}' for w in doc if w.dep_ in modifier_deps]
    unigrams = [negify(w) for w in doc]
    return pairs + unigrams

# One feature list per review: adjective/adverb modifier pairs plus negation-aware tokens.
df['mods'] = df['doc'].apply(mods)

In [None]:
print(df.loc[8181]['mods'])

In [None]:
# Refit the same pipeline on the modifier-pair features and report test accuracy in percent.
lr.fit(df.loc[train]['mods'], df.loc[train]['sentiment'])
lr.score(df.loc[test]['mods'], df.loc[test]['sentiment']) * 100.

In [None]:
print_top_feats(lr, 25)

## Add in **all** dependency pairs

In [None]:
def everything(doc):
    """Return head/dependent pair features for all arcs, then the negation-aware unigrams.

    A 'head_dependent' feature (both parts passed through ``negify``) is emitted for
    every token whose head is not the token itself; the negified form of every token
    is then appended after those pairs.
    """
    dep_pairs = [f'{negify(w.head)}_{negify(w)}' for w in doc if w.head != w]
    unigrams = [negify(w) for w in doc]
    return dep_pairs + unigrams

# One feature list per review: every head/dependent pair plus negation-aware tokens.
df['everything'] = df['doc'].apply(everything)

In [None]:
# Refit the same pipeline on the all-dependency-pair features and report test accuracy in percent.
lr.fit(df.loc[train]['everything'], df.loc[train]['sentiment'])
lr.score(df.loc[test]['everything'], df.loc[test]['sentiment']) * 100.

In [None]:
print_top_feats(lr, 25)