# DS-SF-30 | Codealong 18: Natural Language Processing

## >>> One-time setup

In [1]:
# One-time setup, intentionally left disabled so that "Run All" never
# triggers the NLTK downloader.  To fetch the NLTK data used by the cells
# below (the tokenizer and the English stop-word corpus), run these two
# lines once in a scratch cell:
#
#     import nltk
#     nltk.download()

## <<< One-time setup

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import re
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, cross_validation, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')



## Tokenization

In [3]:
def tokenize_text(document):
    """Turn a raw document into a list of cleaned, lowercase content tokens.

    Processing steps, in order:

    1. encode the text as UTF-8 and lowercase it;
    2. split it into tokens with ``nltk.tokenize.word_tokenize``;
    3. strip ``java:<digits>`` fragments (e.g. the line-number part of
       ``Book.java:16`` in a stack trace) from every token;
    4. delete all ``string.punctuation`` characters from every token;
    5. discard tokens that are now empty or are English stop words.

    Note: the two-argument ``str.translate(None, deletechars)`` call below is
    the Python 2 byte-string form, so this function targets Python 2.

    Parameters
    ----------
    document : str or unicode
        The text to tokenize.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Build the stop-word lookup once per call.  Previously the corpus list
    # was re-read and linearly scanned for every single token.
    stop_words = set(corpus.stopwords.words('english'))

    # Remove stack-trace line references and punctuation inside tokens
    tokens = [re.sub(r'java:\d+', '', token) for token in tokens]
    tokens = [token.translate(None, string.punctuation) for token in tokens]

    # Drop tokens emptied by the cleanup above, and stop words
    return [token for token in tokens if token and token not in stop_words]

In [4]:
# Smoke-test the tokenizer on a short sample with punctuation and stop words.
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")
# Alternative sample (disabled): a Java stack trace, which exercises the
# `java:<digits>` clean-up inside tokenize_text.  Uncomment to try it.
# x = '''
# Exception in thread "main" java.lang.NullPointerException
#         at com.example.myproject.Book.getTitle(Book.java:16)
#         at com.example.myproject.Author.getBookTitles(Author.java:25)
#         at com.example.myproject.Bootstrap.main(Bootstrap.java:14)
# '''
# tokens = tokenize_text(x)

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [5]:
class Stemmer:
    """Namespace around a single shared Porter stemmer instance."""

    # Created once at class-definition time and reused by every call.
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        """Return a new list holding the Porter stem of each token, in order."""
        stemmed = []
        for word in tokens:
            stemmed.append(Stemmer.stemmer.stem(word))
        return stemmed

In [6]:
# Replace the tokens with their stems.  Note that `tokens` is overwritten, so
# re-running this cell stems the already-stemmed output again.
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', 'wait', u'anoth', 'third']

## Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)

Our dataset is a subset of http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz.

In [7]:
# Load the reviews CSV; the path is relative to the current working
# directory (../datasets/dataset-18-reviews.csv).
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-18-reviews.csv'))

In [8]:
# Preview the raw frame (display is truncated by the `display.max_rows`
# option set in the imports cell).
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2016-12-11,R3SH1N77GNTD9K,Stefi,Great read,Very moving story. Great effortless writing wh...,5.0
1,2016-12-11,RVOEQK3JK4LY2,Amazon Customer,Great book! Does not disappoint,Great book! Does not disappoint. Wonderful c...,5.0
2,2016-12-11,RCU7OTNRDJBOS,Priscilla Seaton,Disturbing in its accurate reflection of human...,A very absorbing book. Not at all what I expec...,4.0
3,2016-12-10,R257OLQTPXYQ82,J,Superb,"Lives intertwined, humor,sadness, superior sto...",5.0
4,2016-12-10,R1LNKO30KAXCUM,Roberta L. Sherrill,One Star,Disappointing..... finally quit reading it. S...,1.0
...,...,...,...,...,...,...
5796,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5.0
5798,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5.0
5799,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5.0


In [9]:
# Discard the metadata columns that are not used below, leaving the review
# text (`body`) and the `star_rating`.
# errors='ignore' keeps this cell re-runnable: the drop is in place, so on a
# second run the columns are already gone and would otherwise raise.
df.drop(['date', 'id', 'author', 'title'],
    axis = 1,
    inplace = True,
    errors = 'ignore')

In [10]:
# Preview the frame after the in-place column drop above.
df

Unnamed: 0,body,star_rating
0,Very moving story. Great effortless writing wh...,5.0
1,Great book! Does not disappoint. Wonderful c...,5.0
2,A very absorbing book. Not at all what I expec...,4.0
3,"Lives intertwined, humor,sadness, superior sto...",5.0
4,Disappointing..... finally quit reading it. S...,1.0
...,...,...
5796,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,The depth of character development and storyli...,5.0
5798,The book was great and I will love to re-read ...,5.0
5799,I started to order the kindle edition and than...,5.0


### `NaN`

In [11]:
# Remove reviews with any missing value so every row has a rating to label.
df.dropna(inplace = True)

# Polarity is the sign of (rating - 3): -1 below 3 stars, 0 at exactly
# 3 stars, +1 above.  np.sign is vectorized and, unlike the builtin `cmp`,
# is not limited to Python 2; the cast keeps the integer dtype.
df['polarity'] = np.sign(df.star_rating - 3).astype('int64')
print(df.polarity.value_counts())

# Balance the classes by downsampling each one to the size of the smallest.
ns = df.polarity.value_counts()

# Iterate over the classes actually present so a missing class cannot raise
# a KeyError on the lookup below.
for polarity in ns.index:
    # how many to remove
    n = ns[polarity] - ns.min()
    # which ones to remove (fixed seed so the selection is reproducible)
    index = df[df.polarity == polarity].sample(n, random_state = 0).index
    # remove
    df.drop(index, inplace = True)

print(df.polarity.value_counts())

 1    2675
-1    2156
 0     967
Name: polarity, dtype: int64
 1    967
-1    967
 0    967
Name: polarity, dtype: int64


### Positive, neutral, and negative reviews

In [12]:
# TODO

### Feature matrix and response vector

In [13]:
# TODO

### Train/test sets

In [14]:
# Stratified 60/40 train/test split with a fixed seed.
# `X` and `c` must be defined first (see the "Feature matrix and response
# vector" cell); until then this cell raises a NameError.
train_X, test_X, train_c, test_c = cross_validation.train_test_split(
    X,
    c,
    stratify = c,
    train_size = .6,
    random_state = 0
)

NameError: name 'X' is not defined

### TF-IDF and `TfidfVectorizer`

In [None]:
# TODO

### Bag-of-words

In [None]:
# NOTE(review): `vectorizer` is not defined in any cell shown here; presumably
# it is the TfidfVectorizer to be created in the TODO cell above — confirm.
vectorizer.get_feature_names()

### Transformed feature matrix `X`

In [None]:
# TODO

### Machine Learning Modeling

> # TODO...