# DS-SF-34 | 18 | Natural Language Processing | Codelong | Starter Code

## >>> One-time setup

In [1]:
# '''
# import nltk
# nltk.download()
# '''

# pass

## <<< One-time setup

## Part A | Tokenization and Stemming

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, model_selection, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
len(corpus.stopwords.words('english'))

153

In [14]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens

In [15]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [20]:
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [21]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', u'wait', u'anoth', u'third']

## Part B | Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)  We scrapped this dataset during class 3.

In [52]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-18-reviews.csv'))

In [53]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2017-04-21,R3TUANQ2EB3ECB,MichaelMichaels,Skip it. Life is too short.,I've never read any of the Harry Potter books ...,1
1,2017-04-20,R2DD03ZZ4218VW,Frans van Wyk,Four Stars,Excellent Read with a lot of real life values ...,4
2,2017-04-20,R296NVKLH5QS4W,Sabina Duke,Characters,Hard to keep the characters straight,4
3,2017-04-05,R3MP7W8LH6VHU8,Jen Blau,GIVE IT A CHANCE!,I almost put this book down. I'm new to Rowlin...,5
4,2017-04-04,RZWP48RKJCXT1,Lilith Eleanor,Frighteningly good,Amazing. Rowling combines fantastic writing wi...,5
...,...,...,...,...,...,...
5856,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1
5857,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5
5858,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5
5859,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5


In [54]:
df.drop(['date', 'id', 'author', 'title'],
    axis = 1,
    inplace = True)

In [55]:
df

Unnamed: 0,body,star_rating
0,I've never read any of the Harry Potter books ...,1
1,Excellent Read with a lot of real life values ...,4
2,Hard to keep the characters straight,4
3,I almost put this book down. I'm new to Rowlin...,5
4,Amazing. Rowling combines fantastic writing wi...,5
...,...,...
5856,Premise sounds dull as dirt. For $17 for a co...,1
5857,The depth of character development and storyli...,5
5858,The book was great and I will love to re-read ...,5
5859,I started to order the kindle edition and than...,5


### `NaN`

In [56]:
# TODO
df.isnull().sum()

body           3
star_rating    0
dtype: int64

In [57]:
df.dropna(inplace=True)

### Positive, neutral, and negatives reviews

In [58]:
# TODO
df['polarity'] = df.star_rating.map({1: -1, 2: -1, 3: 0, 4: 1, 5: 1})
ns = df.polarity.value_counts()
ns

 1    2711
-1    2177
 0     970
Name: polarity, dtype: int64

**Downsampling**: take the lowest count and make the others match (970 of each category)

In [59]:
for polarity in [-1,0,1]:
    n = ns[polarity] - ns.min()
    index = df[df.polarity == polarity].sample(n = n, random_state = 0).index
    df.drop(index, inplace = True)

**Upsampling**: similar but with replication of the lower represented classes
- if you do upsampling you must split first!!!

In [61]:
# TODO
df['polarity'] = df.star_rating.map({1: -1, 2: -1, 3: 0, 4: 1, 5: 1})
ns = df.polarity.value_counts()
ns

 1    970
-1    970
 0    970
Name: polarity, dtype: int64

### Feature matrix and response vector

In [62]:
# TODO
X = df['body']
c = df.polarity

### Train/test sets

In [63]:
train_X, test_X, train_c, test_c = model_selection.train_test_split(X, c, stratify = c, train_size = .6, random_state = 0)
#stratify: just in case you have something very imbalanced, this will keep the proportions between train and test 
#random_state: means that the results will be replicable each time you run it because it will use the same randomization
#if you do upsampling you must split first!!!

### TF-IDF and `TfidfVectorizer`
Term Frequency (TF) Inverse Document Frequency (IDF)

`tf(t,d) = number of occurences of term t in document d / number of terms in document d`

In [65]:
# TODO
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

In [72]:
class CustomTokenizer(object):
    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return tokens

In [73]:
vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer(), )

In [78]:
vectorizer.fit(train_X)

### Bag-of-words

In [76]:
vectorizer.get_feature_names()

[u'012315',
 u'08',
 u'1',
 u'10',
 u'100',
 u'1000',
 u'1012',
 u'105',
 u'11',
 u'110',
 u'12',
 u'120',
 u'13',
 u'130',
 u'132',
 u'14',
 u'142',
 u'143',
 u'149',
 u'1495',
 u'1499',
 u'15',
 u'150',
 u'17',
 u'170',
 u'175',
 u'1799',
 u'18',
 u'1860',
 u'18th',
 u'1950',
 u'1960',
 u'1984',
 u'19th',
 u'1star',
 u'2',
 u'20',
 u'200',
 u'2012',
 u'2015',
 u'2016',
 u'21',
 u'21st',
 u'23',
 u'230am',
 u'236',
 u'24',
 u'25',
 u'250',
 u'27',
 u'28',
 u'2nd',
 u'3',
 u'30',
 u'300',
 u'3000',
 u'31',
 u'32',
 u'34',
 u'35',
 u'355',
 u'380',
 u'3d',
 u'3rd',
 u'4',
 u'40',
 u'400',
 u'40ish',
 u'412star',
 u'44',
 u'45',
 u'450',
 u'5',
 u'50',
 u'500',
 u'500th',
 u'503',
 u'505',
 u'50th',
 u'56',
 u'57',
 u'6',
 u'60',
 u'600',
 u'6080',
 u'62',
 u'6th',
 u'7',
 u'70',
 u'72',
 u'75',
 u'77',
 u'8',
 u'80',
 u'800',
 u'89',
 u'90',
 u'92',
 u'93',
 u'98',
 u'9997',
 u'aand',
 u'aback',
 u'abandon',
 u'abil',
 u'abject',
 u'abl',
 u'abnorm',
 u'abort',
 u'abound',
 u'aboutth',


In [77]:
len(vectorizer.get_feature_names())

4621

### Transformed feature matrix `X`

In [79]:
# TODO
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

In [83]:
train_X

<1746x4621 sparse matrix of type '<type 'numpy.float64'>'
	with 44401 stored elements in Compressed Sparse Row format>

In [130]:
train_X.shape

(1746, 4621)

In [None]:
#OVERFIT because too many degrees of freedom!!!! Too many variables to small number of samples!!!

In [82]:
train_X.todense() # to show a sparse matrix

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

### Machine Learning Modeling

## Random Forest

In [123]:
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# Train uncalibrated random forest classifier on whole train and validation
# data and evaluate on test data
clf = RandomForestClassifier(n_estimators=400,)
clf.fit(train_X, train_c)

#

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [124]:
print "My training score is", clf.score(train_X, train_c)
print "My generalization score is", clf.score(test_X, test_c)

My training score is 0.997136311569
My generalization score is 0.550687285223


In [125]:
model_selection.cross_val_score(clf, train_X, train_c, cv = 10).mean()

0.58989869472043643

## Logistic Regression

In [126]:
model = linear_model.LogisticRegression(penalty='l2').fit(train_X, train_c)

In [127]:
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [128]:
print "My training score is", model.score(train_X, train_c)
print "My generalization score is", model.score(test_X, test_c)

My training score is 0.862542955326
My generalization score is 0.591065292096


In [89]:
c_hat = model.predict(test_X)

pd.crosstab(c_hat,
    test_c,
    rownames = ['Hypothesized Class'],
    colnames = ['True Class'])

True Class,-1,0,1
Hypothesized Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,255,98,36
0,72,163,82
1,61,127,270


In [129]:
model_selection.cross_val_score(model, train_X, train_c, cv = 10).mean()

0.60890317553087869