# DS-SF-30 | Codealong 18: Natural Language Processing

## >>> One-time setup

In [1]:
'''
import nltk
nltk.download()
'''
#click on book and download that one specifically in the window that appears
pass

## <<< One-time setup

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem #these are new

from sklearn import feature_extraction, linear_model, ensemble, cross_validation, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [3]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words aka very common words that don't add value
    #if we remove this line, we'll get every word back
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens
#if you have an empty string, return false

In [4]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [15]:
#need to add some stemming. we're gonna stem: ['sentence', 'wait', 'another', 'third']
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [16]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', u'wait', u'anoth', u'third']

## Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)

Our dataset is a subset of http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz.

In [32]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-18-reviews.csv'))

In [33]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2016-12-11,R3SH1N77GNTD9K,Stefi,Great read,Very moving story. Great effortless writing wh...,5.0
1,2016-12-11,RVOEQK3JK4LY2,Amazon Customer,Great book! Does not disappoint,Great book! Does not disappoint. Wonderful c...,5.0
2,2016-12-11,RCU7OTNRDJBOS,Priscilla Seaton,Disturbing in its accurate reflection of human...,A very absorbing book. Not at all what I expec...,4.0
3,2016-12-10,R257OLQTPXYQ82,J,Superb,"Lives intertwined, humor,sadness, superior sto...",5.0
4,2016-12-10,R1LNKO30KAXCUM,Roberta L. Sherrill,One Star,Disappointing..... finally quit reading it. S...,1.0
...,...,...,...,...,...,...
5796,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5.0
5798,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5.0
5799,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5.0


In [None]:
df.body[0]

In [34]:
df.drop(['date', 'id', 'author', 'title'],
    axis = 1,
    inplace = True)

In [35]:
df

Unnamed: 0,body,star_rating
0,Very moving story. Great effortless writing wh...,5.0
1,Great book! Does not disappoint. Wonderful c...,5.0
2,A very absorbing book. Not at all what I expec...,4.0
3,"Lives intertwined, humor,sadness, superior sto...",5.0
4,Disappointing..... finally quit reading it. S...,1.0
...,...,...
5796,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,The depth of character development and storyli...,5.0
5798,The book was great and I will love to re-read ...,5.0
5799,I started to order the kindle edition and than...,5.0


### `NaN`

In [36]:
df.isnull().sum()

body           3
star_rating    0
dtype: int64

In [37]:
df.dropna(inplace = True)
#let's get rid of na

In [38]:
df.star_rating.value_counts()
#we have class imbalance here so algo will predict more frequent class

5.0    1497
1.0    1184
4.0    1178
2.0     972
3.0     967
Name: star_rating, dtype: int64

### Positive, neutral, and negatives reviews

In [41]:
# make a dictionary first to convert star ratings to our new stystem
df['polarity'] = df.star_rating.map({1: -1 , 2: -1 , 3: 0, 4: 1 ,5: 1 })
df['polarity']

0       1
1       1
2       1
3       1
4      -1
       ..
5796   -1
5797    1
5798    1
5799    1
5800   -1
Name: polarity, dtype: int64

### Feature matrix and response vector

In [47]:
# TODO
df.polarity.value_counts()
#here we see class imbalance. so how do we fix it? one method is upsampling to get up to 2675
#alternatively, we can randomply sample  967 out from the 1 group and from the -1 group
ns = df.polarity.value_counts()


In [48]:
ns.min()
#find how many you want ot remove

967

In [50]:
for polarity in [-1, 0, 1]:
    n = ns[polarity] - ns.min()
    index = df[df.polarity == polarity].sample(n = n, random_state = 0).index 
    #we do .index because
    df.drop(index, inplace = True)

In [52]:
df.polarity.value_counts()
#now we see the counts are equal

 1    967
-1    967
 0    967
Name: polarity, dtype: int64

In [58]:
X = df.body
c = df.polarity

In [59]:
X

0       Very moving story. Great effortless writing wh...
3       Lives intertwined, humor,sadness, superior sto...
4       Disappointing..... finally quit reading it.  S...
9       I feel as though JK Rowling was trying a bit t...
10      Very slow reading and depressing. The book doe...
                              ...                        
5784    Thanks a lot, folks. I looked at the reviews t...
5787    excellent character portrayal,insightful,and a...
5793    I'm confused by people who complain about kind...
5795    When I pay more the $10.00 for a book it had b...
5798    The book was great and I will love to re-read ...
Name: body, dtype: object

In [60]:
#we need ot vectorize the text
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

### Train/test sets

In [61]:
train_X, test_X, train_c, test_c = cross_validation.train_test_split(X, c, stratify = c, train_size = .6, random_state = 0)

### TF-IDF and `TfidfVectorizer`

In [68]:
# TODO
#we need to do some stemming too
#vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

#we're gonna apply stemmer to every possible document
class CustomTokenizer(object):
    def __init__(self):
        self.stemmer = stem.porter.PorterStemmer()
        
    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return 
    
vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer(), ngram_range = (1,3), min_df = 3)


#need to make 1,2, and 3 gram combinations
#we have a minimum threshold of 3 occurences required

In [66]:
vectorizer.fit(train_X)

TypeError: 'NoneType' object is not iterable

In [67]:
vectorizer.get_feature_names()

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

### Bag-of-words

In [None]:
vectorizer.get_feature_names()

### Transformed feature matrix `X`

In [69]:
# remember fit for min/max scaling
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

### Machine Learning Modeling

> # TODO...

In [70]:
#let's do a Logistic regresion model fit
model = linear_model.LogisticRegression()


In [71]:
cross_validation.cross_val_score(model, train_X, train_c, cv = 5).mean()

ValueError: could not convert string to float: Started very slow almost gave up several times. Characters as well developed, so much so that you know them almost personally by the end.

In [None]:
train_c_hat = cross_validation.cross_val_predict(model, train_X, train_c, cv = 5)

In [72]:
metrics.accuracey_score(train_c, train_c_hat)

AttributeError: 'module' object has no attribute 'accuracey_score'

In [None]:
#interpretting the resulting table: As you move form -1 to pos 1, your error goes down to 0

In [73]:
model.fit(train_X, train_c)

ValueError: could not convert string to float: Started very slow almost gave up several times. Characters as well developed, so much so that you know them almost personally by the end.

In [None]:
model.score(test_X, test_c) #answer should be 60. We overfit a little tiny bit, compare this to 89 we sar previously

In [None]:
model = ensemble.RandomForestClassifier(n_estimaros = 100) #we're building 100 trees per random forest --> 
#this is not deterministic 


In [None]:
#even with random forest, we overfit. We created a lot of features
#feature engineering is the key here. random forrests assumes distributions whereas logsitic regression determines some sort of
#lineartiy