In [1]:
import pyprind
import pandas as pd
import numpy as np
import os
import re

from nltk.corpus import stopwords

In [2]:
basepath = 'Data_IMDB/'
labels = {'pos':1, 'neg':0}
pbar = pyprind.ProgBar(50000)

# Assemble the individual text documents into a single DataFrame.
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call (quadratic time)
# and is no longer available in pandas 2.0+.
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            # One row per review: [raw text, 0/1 label taken from the folder name]
            records.append([txt, labels[l]])
            pbar.update()

# Columns keep their default integer names (0 and 1), exactly as before
df = pd.DataFrame(records)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:02:01


In [3]:
# Name the two columns: the review text and its 0/1 sentiment label
df.columns = ['review', 'sentiment']

In [4]:
# Shuffle the rows with a fixed seed so the permutation is repeatable,
# then write the shuffled frame to disk without the index column
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [5]:
# Confirm the data was saved in the expected format by reloading the CSV
# and previewing the first rows
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [6]:
# Sample data: the last 50 characters of the first review
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [7]:
# Preprocessing the data using Regex
def preprocessor(text):
    """Clean a raw review string.

    Steps, in order:
      1. Remove complete HTML tags.
      2. Collect emoticons from the tag-free text.
      3. Lowercase the text and replace every run of non-word characters
         with a single space.
      4. Append the collected emoticons (with any '-' nose removed) after
         the cleaned text, separated by single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text, followed by the emoticons if any were found.
    """
    # The closing '>' is part of the pattern so only complete tags are
    # removed; without it a stray '<' (e.g. in "<3") would swallow the rest
    # of the review up to the next '>' or the end of the text.
    text = re.sub(r'<[^>]*>', '', text)

    # Emoticons: eyes (: ; =), optional nose (-), mouth () ( D P).
    # They are collected before lowercasing and before punctuation is dropped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # Lowercase and collapse every run of non-word characters to one space
    cleaned = re.sub(r'\W+', ' ', text.lower())

    if emoticons:
        # Always put exactly one space before the emoticons so they are never
        # glued to the last word (":) great" used to become " great:)").
        cleaned = cleaned.rstrip() + ' ' + ' '.join(emoticons).replace('-', '')

    return cleaned

In [8]:
# Run the preprocessor on the last 50 characters of the first review
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [9]:
# Run the preprocessor on a short string that contains a tag and emoticons
preprocessor("</a>This :) is :( a test :-)!")

' this is a test :) :( :)'

In [10]:
# Apply preprocessor to every movie review in our DataFrame.
# Only the in-memory frame is changed here; movie_data.csv was written
# in an earlier cell and is not rewritten by this one.
df['review'] = df['review'].apply(preprocessor)

In [11]:
# Stop words are kept in a set: tokenizer tests every token for membership,
# and a set lookup is constant-time where scanning a list is linear.
stop = set(stopwords.words('english'))

def tokenizer(text):
    """Clean a raw review and split it into tokens, dropping stop words.

    Complete HTML tags are removed, emoticons are collected, the text is
    lowercased and split on runs of non-word characters, and the emoticons
    (with any '-' nose removed) are appended as separate tokens. Tokens
    found in the module-level ``stop`` collection are filtered out.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Word tokens in order of appearance, followed by emoticon tokens.
    """
    # The closing '>' is part of the pattern so only complete tags are
    # removed; without it a stray '<' (e.g. in "<3") would swallow the rest
    # of the review up to the next '>' or the end of the text.
    text = re.sub(r'<[^>]*>', '', text)

    # Emoticons: eyes (: ; =), optional nose (-), mouth () ( D P).
    # They are collected before lowercasing and before punctuation is dropped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # Lowercase, turn every run of non-word characters into a space, split
    words = re.sub(r'\W+', ' ', text.lower()).split()

    # Emoticons are added as tokens of their own so they can never be fused
    # with the last word of the text.
    candidates = words + [emo.replace('-', '') for emo in emoticons]

    tokenized = [w for w in candidates if w not in stop]

    return tokenized

In [12]:
# Generator function, that reads in & returns 1 document at a time
def stream_docs(path):
    """Lazily yield ``(text, label)`` tuples from a file, one per line.

    The first line is skipped as a header. Every remaining non-empty line is
    split at its last comma: the part before it is yielded as the text,
    exactly as stored in the file (any CSV quoting is left in place), and
    the part after it is parsed as an integer label.

    Parameters
    ----------
    path : str
        Path of the file to read (opened as UTF-8 text).

    Yields
    ------
    tuple of (str, int)
        The text and its label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as infile:

        # Skip header. The default value keeps an empty file from raising
        # StopIteration inside the generator, which Python turns into a
        # RuntimeError (PEP 479).
        next(infile, None)

        for line in infile:
            line = line.rstrip('\r\n')
            if not line:
                continue
            # Splitting at the last comma (rather than slicing fixed
            # offsets from the end) also works for a final line that has
            # no trailing newline.
            text, _, label = line.rpartition(',')
            yield (text, int(label))

In [13]:
# Sanity check: read the first (text, label) tuple from movie_data.csv
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [14]:
# Method that takes a document stream from stream_docs & returns
# a particular number of documents specified by size
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator producing ``(text, label)`` tuples.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)``: two parallel lists with the texts and their labels.
        If the stream runs out part-way, the documents read so far are
        returned as a shorter batch instead of being discarded.
        ``(None, None)`` is returned only when the stream was already
        exhausted and nothing could be read.
    """
    docs, y = [], []
    exhausted = False

    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)

    # Signal "no more data" only when nothing at all was read; a request for
    # size=0 on a live stream still returns two empty lists.
    if exhausted and not docs:
        return (None, None)

    return (docs, y)

In [15]:
# CountVectorizer & TfidfVectorizer need to hold the whole vocabulary in memory,
# so we use HashingVectorizer instead

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing-based vectorizer with 2**21 feature slots; no separate
# preprocessor is set, cleaning and splitting are left to `tokenizer`
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

# NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
# confirm against the installed version before changing it.
clf = SGDClassifier(random_state=1, max_iter=1, loss='log')

# One generator over the saved CSV, shared by the cells that follow
doc_stream = stream_docs('movie_data.csv')

In [16]:
# Progress bar sized for the 45 training mini-batches
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])

# Incremental training: up to 45 mini-batches of 1000 documents each.
# The loop ends early as soon as get_minibatch hands back nothing.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if X_train:
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        break

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:30


In [17]:
# Evaluate on the next 5000 documents the stream yields, i.e. the ones
# that come after the documents already consumed by the training loop
X_test, y_test = get_minibatch(doc_stream, size=5000)

X_test = vect.transform(X_test)

print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.867
