In [3]:
import pyprind
import pandas as pd
import numpy as np
import os

In [4]:
pbar = pyprind.ProgBar(50000)
labels = {'pos': 1, 'neg': 0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = f'../data/IMDb/aclImdb/{s}/{l}'
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
    
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:15


In [5]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('../data/IMDb/movie_data.csv', index=False)

In [6]:
df = pd.read_csv('../data/IMDb/movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


## Bag of Words (CountVectorizer example)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vectorizer = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet, and one and one is two',
])
bag = vectorizer.fit_transform(docs)

In [9]:
vectorizer.vocabulary_

{'the': 6,
 'sun': 4,
 'is': 1,
 'shining': 3,
 'weather': 8,
 'sweet': 5,
 'and': 0,
 'one': 2,
 'two': 7}

In [10]:
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [3, 3, 2, 1, 1, 1, 2, 1, 1]], dtype=int64)

### Term Frequency - Inverse Document Frequency (TfIdf example)

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

In [12]:
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
tfidf.fit_transform(vectorizer.fit_transform(docs)).toarray()

array([[0.  , 0.43, 0.  , 0.56, 0.56, 0.  , 0.43, 0.  , 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.  , 0.56, 0.43, 0.  , 0.56],
       [0.66, 0.39, 0.44, 0.17, 0.17, 0.17, 0.26, 0.22, 0.17]])

### Data cleaning

In [13]:
df.loc[3, 'review'][-50:]

". <br /><br />This is one I'd recommend to anyone."

In [14]:
import re

In [15]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

In [16]:
preprocessor(df.loc[3, 'review'][-50:])

' this is one i d recommend to anyone '

In [17]:
preprocessor('</a>This :) is :( a test :-)!')

'this is a test :) :( :)'

In [18]:
df['review'] = df['review'].apply(preprocessor)

### Tokenizer

In [19]:
from nltk.stem.porter import PorterStemmer

In [20]:
porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def porter_tokenizer(text):
    return [porter.stem(word) for word in text.split()]

In [21]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [22]:
porter_tokenizer('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

### Stop words

In [23]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/tbb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
from nltk.corpus import stopwords

In [25]:
stop = stopwords.words('english')
[word for word in porter_tokenizer('runners like running and thus they run')
 if word not in stop]

['runner', 'like', 'run', 'thu', 'run']

### Classification

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [36]:
tfidf?

In [38]:
from tokenizer import tokenizer, porter_tokenizer

In [39]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, porter_tokenizer],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, porter_tokenizer],
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    }
]

lr_tfidf_pipe = Pipeline([('vect', tfidf),
                          ('clf', LogisticRegression(random_state=11))])
gs_lr_tfidf = GridSearchCV(lr_tfidf_pipe, param_grid,
                           scoring='accuracy', cv=5,
                           verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 57.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 248.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 317.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,..., penalty='l2', random_state=11, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=T

In [43]:
f'Best param set: {gs_lr_tfidf.best_params_}'

"Best param set: {'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function porter_tokenizer at 0x12a9577b8>}"

In [44]:
f'Best param set: {gs_lr_tfidf.best_params_}'

"Best param set: {'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function porter_tokenizer at 0x12a9577b8>}"

In [42]:
gs_lr_tfidf

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,..., penalty='l2', random_state=11, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=T

In [45]:
f'Cross-validation accuracy: {gs_lr_tfidf.best_score_:.3f}'

'Cross-validation accuracy: 0.876'

In [46]:
clf = gs_lr_tfidf.best_estimator_
f'Hold-out set score: {clf.score(X_test, y_test):.2f}'

'Hold-out set score: 0.88'