In [72]:
import pyprind
import pandas as pd
import os
import io

basepath = './data/aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
# Collect one [text, label] row per review and build the DataFrame once at
# the end. Growing a DataFrame with df.append inside the loop copies all
# existing rows on every call (quadratic overall), and DataFrame.append was
# removed in pandas 2.0.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order (and so any seeded shuffle applied afterwards) identical
        # from run to run.
        for file in sorted(os.listdir(path)):
            # For Python 2, use 'io.open'; for Python 3, plain 'open' also works.
            with io.open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:36


In [73]:
# View the entire frame (all rows, all columns) as a NumPy array and show
# its dimensions.
X = df.values
X.shape

(50000, 2)

In [103]:
import numpy as np
# Seed NumPy's global random generator before shuffling.
# NOTE(review): presumably df.sample draws from this global state when no
# random_state is passed, which is why the seed is set first -- confirm.
np.random.seed(0)

# frac=1 samples every row, i.e. shuffles the frame; the old index is dropped
# so rows are renumbered from 0 in their new order.
df = df.sample(frac=1).reset_index(drop=True)
# Persist the shuffled reviews without the index column.
df.to_csv('./movie_data.csv', index=False, encoding='utf-8')

# Reload from disk so the rest of the notebook works from the saved file.
df = pd.read_csv('./movie_data.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,it s hard to put your finger on this one basic...,1
1,i mean nothing happens 5 dumb kids go to oklah...,0
2,så som in himmelen was probably one of the 3 ...,1
3,this noir may not be the best remembered film ...,1
4,i ll be honest with yall i was a junior in hig...,1


In [104]:
# Preview the data
df.loc[0, 'review'][-500:]

'ho falls in love with a comparatively poor girl whom he wants to marry at the risk of being disowned by his family it has funny moments romantic moments and touching moments dudley moore is funny and somehow makes his self centred character endearing liza minelli is a convincing foil as the the feisty opposite he attracts but john gielgud steals the show as arthur s wonderfully sarcastic butler it s corny but great fun with a memorable soundtrack and ran for nearly 3 months at our local fleapit '

In [105]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [106]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    ]

In [107]:
bag = vectorizer.fit_transform(corpus)
bag.shape, bag

((4, 9), <4x9 sparse matrix of type '<class 'numpy.int64'>'
 	with 19 stored elements in Compressed Sparse Row format>)

In [108]:
bag.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [109]:
vectorizer.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [110]:
vectorizer.vocabulary_.get('first')

2

In [111]:
vectorizer.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [48]:
# Tf-idf with a corpus document
from sklearn.feature_extraction.text import TfidfTransformer
# Tf-idf transformer with IDF smoothing switched off.
tfidf = TfidfTransformer(smooth_idf=False)
# Print NumPy arrays with two decimals; this is a global NumPy setting, so it
# also affects arrays printed by later cells.
np.set_printoptions(precision=2)
# Re-fit the CountVectorizer on `corpus` to get raw term counts, fit the
# tf-idf transformer on those counts, and show the result as a dense array.
tfidf.fit_transform(vectorizer.fit_transform(corpus)).toarray()

array([[0.  , 0.43, 0.57, 0.43, 0.  , 0.  , 0.34, 0.  , 0.43],
       [0.  , 0.24, 0.  , 0.24, 0.  , 0.89, 0.19, 0.  , 0.24],
       [0.56, 0.  , 0.  , 0.  , 0.56, 0.  , 0.24, 0.56, 0.  ],
       [0.  , 0.43, 0.57, 0.43, 0.  , 0.  , 0.34, 0.  , 0.43]])

In [49]:
# The weights of each feature computed by the fit method call are stored in a model attribute
tfidf.idf_

array([2.39, 1.29, 1.69, 1.29, 2.39, 2.39, 1.  , 2.39, 1.29])

In [50]:
# Tf-idf with a counts example
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

In [51]:
# Hand-written 6x3 table of raw counts.
# NOTE(review): presumably one row per document and one column per term --
# confirm against the TfidfTransformer documentation.
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
# NOTE: this rebinds `tfidf`, which an earlier cell bound to a
# TfidfTransformer instance, to the transformed matrix returned here.
tfidf = transformer.fit_transform(counts)
tfidf

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [52]:
tfidf.toarray()

array([[0.82, 0.  , 0.57],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.47, 0.88, 0.  ],
       [0.58, 0.  , 0.81]])

In [53]:
transformer = TfidfTransformer()
transformer.fit_transform(counts).toarray()

array([[0.85, 0.  , 0.52],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.55, 0.83, 0.  ],
       [0.63, 0.  , 0.78]])

In [54]:
transformer.idf_

array([1.  , 2.25, 1.85])

In [112]:
# regex
import re
def preprocessor(text):
    """Normalize a raw review into lowercase words followed by its emoticons.

    Steps:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(`` or ``=D`` from the tag-free
         text *before* lowercasing, so the upper-case ``D``/``P`` variants
         are still recognised.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with any ``-`` nose removed), separated from
         the words and from each other by single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. When no emoticon is found this is just the
        lowercased, space-collapsed text (which may end in a space).
    """
    # Raw strings: '\)', '\(' and '\W' are not valid string escapes, and
    # newer Python versions warn about them in ordinary literals.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # Always put exactly one space between the words and the emoticons.
    # Plain concatenation glued the first emoticon onto the last word
    # whenever the text ended in a word character (':) great' -> ' great:)').
    return cleaned.rstrip() + ' ' + ' '.join(emoticons).replace('-', '')

In [113]:
preprocessor(df.loc[0, 'review'][-200:])

'ty opposite he attracts but john gielgud steals the show as arthur s wonderfully sarcastic butler it s corny but great fun with a memorable soundtrack and ran for nearly 3 months at our local fleapit '

In [114]:
# apply the preprocess function to all reviews
df['review'] = df['review'].apply(preprocessor)

In [115]:
def tokenizer(text):
    """Break ``text`` into words at runs of whitespace and return them as a list."""
    words = text.split()
    return words

tokenizer('running like running and thus they run')

['running', 'like', 'running', 'and', 'thus', 'they', 'run']

In [116]:
# Stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every word."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

tokenizer_porter('running like running and thus they run')

['run', 'like', 'run', 'and', 'thu', 'they', 'run']

In [117]:
# Stop-word removal
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

# English stop-word list from the NLTK corpus downloaded above.
stop = stopwords.words('english')
# Stem the sample sentence, keep at most its last ten tokens, and drop every
# token that appears in the stop-word list.
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

In [118]:
import io
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    """Clean a raw review and return its tokens with stop words removed.

    HTML-like tags are stripped, emoticons (``:)``, ``;-(``, ``=D`` ...) are
    collected, the text is lowercased with every run of non-word characters
    collapsed to one space, the emoticons (minus any ``-`` nose) are appended
    as separate tokens, and finally every token found in the module-level
    ``stop`` collection is dropped.

    NOTE: this definition replaces the simpler whitespace-only ``tokenizer``
    defined in an earlier cell.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining tokens in their original order, emoticons last.
    """
    # Raw strings: '\)', '\(' and '\W' are not valid string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # Search for emoticons before lowercasing, as `preprocessor` does: the
    # pattern lists upper-case 'D' and 'P', which can never match once the
    # text has been lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Keep one space between the last word and the first emoticon so
        # split() yields them as separate tokens instead of e.g. 'great:)'.
        text = text.rstrip() + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The file is expected to have a header row followed by records whose
    first field is the review text and whose last field is an integer label
    (the layout written by ``df.to_csv(..., index=False)`` in this notebook).

    Records are parsed with the ``csv`` module rather than by slicing the raw
    line, so quoted fields come back without their CSV quoting, a final
    record without a trailing newline is read correctly, and labels are not
    limited to a single digit.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The review text and its label, one record at a time.
    """
    import csv  # local import: only this function needs it

    # newline='' lets the csv module handle line endings itself.
    with io.open(path, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.reader(handle)
        # Skip the header. The default avoids StopIteration on an empty file,
        # which would surface as RuntimeError inside a generator.
        next(reader, None)
        for row in reader:
            if not row:  # tolerate blank lines
                continue
            text, label = row[0], int(row[-1])
            yield text, label

In [119]:
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Logistic-regression classifier trained with stochastic gradient descent.
# The previous `< '0.18'` check had two identical branches, so it is gone.
# The check that matters is the loss name: scikit-learn 1.1 renamed 'log' to
# 'log_loss' and later releases reject the old spelling.
clf = SGDClassifier(
    loss='log' if Version(sklearn_version) < Version('1.1') else 'log_loss',
    random_state=1, max_iter=1, tol=1e-3)

# Lazy stream over the saved reviews, plus the stop-word list that the
# `tokenizer` function reads from module scope.
doc_stream = stream_docs(path='movie_data.csv')
stop = stopwords.words('english')

In [120]:
# Split by position: the first 25,000 rows are for training, the rest are
# held out for testing. Label-based slicing with df.loc[0:25000] includes its
# end label, so with the default integer index it returned 25,001 rows and
# put row 25000 in both the training and the test set.
X_train = df.iloc[:25000]['review'].values
y_train = df.iloc[:25000]['sentiment'].values
X_test = df.iloc[25000:]['review'].values
y_test = df.iloc[25000:]['sentiment'].values

In [121]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs, advanced with ``next``.
    size : int
        Maximum number of documents to read.

    Returns
    -------
    tuple
        ``(docs, y)``: two parallel lists of texts and labels. If the stream
        runs out part-way through, the documents read so far are returned as
        a shorter batch instead of being thrown away. ``(None, None)`` is
        returned only when the stream was already exhausted and nothing
        could be read.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    # Signal "no more data" only when nothing at all was read. Returning
    # None for a partly filled batch silently dropped the trailing documents.
    if exhausted and not docs:
        return None, None
    return docs, y

In [122]:
# Hashing vectorizer: 2**21 feature slots, undecodable bytes ignored, no
# custom preprocessor, and tokens produced by the `tokenizer` function.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

In [123]:
# Start from an untrained classifier and rewind the document stream.
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
# later releases reject the old spelling, so pick the name by version.
clf = SGDClassifier(
    loss='log' if Version(sklearn_version) < Version('1.1') else 'log_loss',
    random_state=1, max_iter=1, tol=1e-3)
doc_stream = stream_docs(path='./movie_data.csv')

In [124]:
import pyprind
# Progress bar with one tick per mini-batch.
pbar = pyprind.ProgBar(45)

# The full set of labels is passed on every partial_fit call below.
classes = np.array([0, 1])
# Out-of-core training: 45 mini-batches of 1000 documents each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    # get_minibatch returned no documents: the stream is used up, stop early.
    if not X_train:
        break
    # Replace the raw texts with their hashed feature matrix.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:22


In [125]:
# Take the next batch (size=5000) from the stream as the held-out test set
# and vectorize it the same way as the training batches.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)

# Report the classifier's score on the held-out batch to three decimals.
print('Accuracy: %.3f' % (clf.score(X_test, y_test),))

Accuracy: 0.884


In [69]:
import pickle
import os
from nltk.corpus import stopwords 

# Load the English stop-word list
stop = stopwords.words('english')
# Create the directory that will hold the pickled objects. exist_ok=True
# makes this safe to re-run and avoids the gap between checking for the
# directory and creating it.
dest = os.path.join('movieclassifier', 'pkl_objects')
os.makedirs(dest, exist_ok=True)

In [70]:
# Serialize the stop-word list. The context manager guarantees the file is
# flushed and closed; passing a bare open(...) to pickle.dump left the handle
# open until garbage collection.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)

In [71]:
# Serialize the trained classifier. The context manager guarantees the file
# is flushed and closed; passing a bare open(...) to pickle.dump left the
# handle open until garbage collection.
# NOTE(review): 'classifier.plk' looks like a typo for '.pkl' (compare
# 'stopwords.pkl' and the 'pkl_objects' directory). The name is kept as-is
# here; confirm what the loading code expects before renaming.
with open(os.path.join(dest, 'classifier.plk'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)