# DS-SF-27 | Codealong 16 | Introduction to Natural Language Processing

## >>> One-time setup

In [1]:
'''
import nltk
nltk.download()
'''

pass

## <<< One-time setup

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, cross_validation, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [4]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens

In [5]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [6]:
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [7]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', u'wait', u'anoth', u'third']

## Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)

Our dataset is a subset of http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz.

In [59]:
df = pd.read_csv(os.path.join('..', 'datasets', 'reviews_Books_5-0316228532.csv'))

In [60]:
df

Unnamed: 0,asin,reviewer_id,reviewer_name,summary,review_text,overall,review_time,unix_review_time,helpful
0,316228532,AY2UIGHCB4VPB,,but a good read!,"A departure for her, but a good read!",5,"07 12, 2014",1405123200,"[0, 0]"
1,316228532,A2L17U0TWH9UWS,1075,Not worth the time,I had a hard time remembering who each charact...,2,"11 12, 2013",1384214400,"[0, 1]"
2,316228532,A2R63TBVG5OAF6,12121,The Casual Vacancy,This is the only review I have ever written. ...,1,"10 1, 2012",1349049600,"[13, 25]"
3,316228532,ACU39L9G696US,123esmo,Expecting more from J.K. Rowling,"I was expecting more from J.K. Rowling, it's a...",2,"01 10, 2013",1357776000,"[0, 1]"
4,316228532,A3N7KY1PBMF880,&#34;Bad Cat!&#34;,Sorry That I Bought It.,As big a fan as I am of J K Rowling's Harry Po...,1,"05 11, 2013",1368230400,"[0, 3]"
...,...,...,...,...,...,...,...,...,...
2045,316228532,A1SCYWLS37YR50,ZC,Spectacular prose in a rambling story,Spectacular prose in a rambling story that see...,5,"02 12, 2014",1392163200,"[1, 1]"
2046,316228532,A1POFVVXUZR3IQ,Z Hayes,"Difficult to get into, but has its moments",Although I am a great fan of the Harry Potter ...,3,"07 18, 2013",1374105600,"[1, 1]"
2047,316228532,A1YSU2VSUJZAR5,zolteg59,The Casual Vacancy,"While the story was intriguing, and I am a hug...",1,"11 11, 2012",1352592000,"[0, 1]"
2048,316228532,A2ZF888HX9YR8E,Zoobeefoo,A better read for Brits perhaps?,What an odd book! The adolescent characters a...,3,"12 30, 2012",1356825600,"[2, 3]"


In [61]:
df.drop(['asin', 'reviewer_id', 'reviewer_name', 'summary', 'review_time', 'unix_review_time', 'helpful'],
    axis = 1,
    inplace = True)

In [62]:
df

Unnamed: 0,review_text,overall
0,"A departure for her, but a good read!",5
1,I had a hard time remembering who each charact...,2
2,This is the only review I have ever written. ...,1
3,"I was expecting more from J.K. Rowling, it's a...",2
4,As big a fan as I am of J K Rowling's Harry Po...,1
...,...,...
2045,Spectacular prose in a rambling story that see...,5
2046,Although I am a great fan of the Harry Potter ...,3
2047,"While the story was intriguing, and I am a hug...",1
2048,What an odd book! The adolescent characters a...,3


In [63]:
df.overall.value_counts(dropna = False)

4    464
5    457
3    397
2    373
1    359
Name: overall, dtype: int64

In [64]:
df.isnull().sum()

review_text    0
overall        0
dtype: int64

In [65]:
X = df.review_text
c = df.overall.map({1:-1, 2:-1, 3:0, 4:1, 5:1})

## Train/test sets

In [66]:
train_X, test_X, train_c, test_c = cross_validation.train_test_split(X, c, train_size = .6, random_state = 0)

## TF-IDF and `TfidfVectorizer`

In [40]:
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

## Bag-of-words

In [41]:
vectorizer.get_feature_names()

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [69]:
vectorizer.fit(train_X)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.CustomTokenizer object at 0x1192ecd90>,
        use_idf=True, vocabulary=None)

In [70]:
vectorizer.get_feature_names()

[u'1',
 u'1 star',
 u'10',
 u'100',
 u'100 page',
 u'12',
 u'12 star',
 u'13',
 u'14',
 u'15',
 u'150',
 u'150 page',
 u'16',
 u'1799',
 u'18',
 u'1984',
 u'19th',
 u'1star',
 u'1star review',
 u'2',
 u'2 star',
 u'20',
 u'200',
 u'200 page',
 u'2012',
 u'23',
 u'25',
 u'3',
 u'3 star',
 u'30',
 u'300',
 u'300 page',
 u'34',
 u'34 34',
 u'34 adult',
 u'34 adult 34',
 u'34 bad',
 u'34 book',
 u'34 casual',
 u'34 casual vacanc',
 u'34 charact',
 u'34 enjoy',
 u'34 f',
 u'34 f 34',
 u'34 get',
 u'34 good',
 u'34 harri',
 u'34 harri potter',
 u'34 novel',
 u'34 plot',
 u'34 real',
 u'34 town',
 u'34 word',
 u'35',
 u'35 star',
 u'3rd',
 u'4',
 u'4 5',
 u'4 5 star',
 u'4 letter',
 u'4 letter word',
 u'4 star',
 u'40',
 u'400',
 u'400 page',
 u'45',
 u'5',
 u'5 star',
 u'50',
 u'50 page',
 u'500',
 u'500 page',
 u'503',
 u'503 page',
 u'512',
 u'6',
 u'60',
 u'7',
 u'70',
 u'8',
 u'80',
 u'8211',
 u'8217',
 u'8217 one',
 u'8217 read',
 u'8220',
 u'8221',
 u'abandon',
 u'abbey',
 u'abil',
 u'

## Transform the feature matrix `X`

In [71]:
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

In [72]:
train_X

<1230x6272 sparse matrix of type '<type 'numpy.float64'>'
	with 79917 stored elements in Compressed Sparse Row format>

In [73]:
train_X.todense()

matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.06615307],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.23061146, ...,  0.        ,
          0.        ,  0.        ]])

In [74]:
model = linear_model.LogisticRegression().\
    fit(train_X, train_c)

In [52]:
model = ensemble.RandomForestClassifier(n_estimators = 100)

In [48]:
model.score(train_X, train_c)

0.80894308943089432

In [75]:
cross_validation.cross_val_score(model, train_X, train_c,cv = 5).mean()

0.67479799751585789

In [76]:
train_c.value_counts()

 1    548
-1    438
 0    244
Name: overall, dtype: int64

In [77]:
model.score(test_X, test_c)

0.698780487804878

In [54]:
train_c_hat = cross_validation.cross_val_predict(model, train_X, train_c,cv = 5)

In [78]:
metrics.accuracy_score(train_c,train_c_hat)

0.64552845528455283

In [79]:
pd.crosstab(train_c_hat, train_c, rownames = ['Predicted'], colnames = ['True'])

True,-1,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,305,99,62
0,1,4,1
1,132,141,485


In [None]:
# Random Forests

In [67]:
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

class CustomTokenizer(object):
    def __init__(self):
        self.stemmer = stem.porter.PorterStemmer()

    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return tokens
    
vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer(), ngram_range = (1, 3), min_df = 3)

## Machine Learning Modeling

> # TODO...