# Applying Machine Learning to Sentiment Analysis

Sentiment Analysis is a subfield of Natural Language Processing (NLP) to classify documents based on their polarity; the attitude of the writer.

In this example, we will be applying sentiment analysis to movie reviews.

In [1]:
import pyprind
import pandas as pd
import os

basepath = 'aclImdb'

labels = {'pos' : 1, 'neg' : 0}
pbar = pyprind.ProgBar(50000) # initialize progress bar with number of tasks
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path,file),
                     'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]],
                          ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:16


Since the class labels are sorted, lets shuffle the index and save as a new csv file.

In [2]:
import numpy as np

np.random.seed(0)

df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head()

assert df.shape == (50000, 2)

## Bag of Words Model

The **bag-of-words** model allows us to represent text as numerical feature vectors. The basic idea is

1. Create a vocabulary of unique tokens (words from the entire set of documents)
2. Construct a feature vector for each document that contains the counts of how often each word occurs in a particular document

Let's walk through a simple example.

Note this example follows the *uni-gram* model, using each word as its own feature. This can be extended to any *n-gram* model. Practically, we would just change the way we initialize our CountVectorizer object.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array(['The sun is shining',
                'The weather is sweet',
                'The sun is shining, the weather is sweet',
                'and one and one is two'])
bag = count.fit_transform(docs)
print (count.vocabulary_)
print (bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [0 2 0 1 1 1 2 0 1]
 [2 1 2 0 0 0 0 1 0]]


## Assessing Word Relevancy via Frequency-Inverse Document Frequency

Frequently occurring words typically don't contain useful or discriminatory information.

Scikit-learn has another transformer implementation that takes the CountVectorizer inputs and transforms them into **term frequency-inverse document frequency (tf-idf)**.

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True,
                        norm='l2', # divides each feature vector by its l2 norm vector
                        smooth_idf=True)
np.set_printoptions(precision=2)
print (tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.38 0.   0.57 0.57 0.   0.46 0.   0.  ]
 [0.   0.38 0.   0.   0.   0.57 0.46 0.   0.57]
 [0.   0.46 0.   0.35 0.35 0.35 0.56 0.   0.35]
 [0.66 0.17 0.66 0.   0.   0.   0.   0.33 0.  ]]


## Cleaning Text Data

A critical step before implementing bag-of-words or any other model is stripping our text of unwanted characters.

We will use Python's regular expressions library in this simple example.

In [5]:
import re

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

df['review'] = df['review'].apply(preprocessor)

## Processing Documents into Tokens

In [6]:
# tokenizer
def tokenizer(text):
    return text.split()

# stemming - transforming word into its root form
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [7]:
# stop words removal
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/zach/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Training A Logistic Regression Model for Document Classification

In [8]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [10]:
# using grid search to find optimal hyperparameters

# model imports
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None)

param_grid = [{'vect__ngram_range' : [(1,1)],
              'vect__stop_words' : [stop, None],
              'vect__tokenizer' : [tokenizer, tokenizer_porter],
              'clf__penalty' : ['l1', 'l2'],
              'clf__C' : [1.0, 10.0, 100.0]},
             {'vect__ngram_range' : [(1,1)],
              'vect__stop_words' : [stop, None],
              'vect__tokenizer' : [tokenizer, tokenizer_porter],
              'vect__use_idf' : [False],
              'vect__norm' : [None],
              'clf__penalty' : ['l1', 'l2'],
              'clf__C' : [1.0, 10.0, 100.0]}
             ]

lr_tfidf = Pipeline([('vect', tfidf),
                    ('clf', LogisticRegression(random_state=0))])

In [13]:
# send it
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring='accuracy',
                          cv=5,
                          verbose=1,
                          n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 33.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 171.8min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 215.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=T

In [14]:
# reporting results
print ('Best parameter set: %s' % gs_lr_tfidf.best_params_)
print ('Cross fold validation accuracy: %.3f' % gs_lr_tfidf.best_score_)
print ('Testing accuracy: %.3f' % gs_lr_tfidf.score(X_test, y_test))

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x10fb17ea0>}
Cross fold validation accuracy: 0.897
Testing accuracy: 0.899


## Working with Bigger Data - Online Algorithms and Out of Core Learning

Out of Core Learning allows us to work with larger data by fitting the classifier in incremental batches.

In [16]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

# generator function that reads in and returns one document at a time
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header line
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
            
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [20]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21, # note this also increases model coefficients
                        preprocessor=None,
                        tokenizer=tokenizer)
clf = SGDClassifier(loss='log',
                   random_state=1)
doc_stream = stream_docs('movie_data.csv')

In [21]:
# training the model using out of core learning
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:25


In [22]:
# evaluate accuracy
X_test, y_test = get_minibatch(doc_stream, size=1000)
X_test = vect.transform(X_test)
print ('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.879
