In [1]:
import os
import tqdm
from sklearn.datasets import load_files

In [2]:
PATH_TO_DATA = 'imdb_reviews'

In [3]:
!du -hs $PATH_TO_DATA

246M	imdb_reviews


In [4]:
!du -hs $PATH_TO_DATA/train
!du -hs $PATH_TO_DATA/test

123M	imdb_reviews/train
122M	imdb_reviews/test


In [5]:
%%time
train_reviews = load_files(os.path.join(PATH_TO_DATA, 'train'))

CPU times: user 1.9 s, sys: 2.03 s, total: 3.93 s
Wall time: 5min 12s


In [6]:
%%time
test_reviews = load_files(os.path.join(PATH_TO_DATA, 'test'))

CPU times: user 1.12 s, sys: 930 ms, total: 2.05 s
Wall time: 1min 47s


In [7]:
type(train_reviews), len(train_reviews.data)

(sklearn.utils.Bunch, 25000)

In [8]:
train_reviews.data[0]

b"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."

In [9]:
train_reviews.target[0]

1

In [10]:
train_reviews.data[1]

b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive Decision" should without a doubt be you\'re choice over this one, even the "Turbulenc

In [11]:
train_reviews.target[1]

0

BOW - Bag of words

In [12]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [13]:
a = np.zeros([5, 5])
a[0, 3] = 1
a[4, 4] = 6
a[2, 2] = 5
a[3, 1] = 4
a[3, 2] = 2
a[1, 1] = 7
a

array([[0., 0., 0., 1., 0.],
       [0., 7., 0., 0., 0.],
       [0., 0., 5., 0., 0.],
       [0., 4., 2., 0., 0.],
       [0., 0., 0., 0., 6.]])

In [14]:
pd.DataFrame(a, columns=['apple', 'wax', 'sadness', 'luck', 'girl'])

Unnamed: 0,apple,wax,sadness,luck,girl
0,0.0,0.0,0.0,1.0,0.0
1,0.0,7.0,0.0,0.0,0.0
2,0.0,0.0,5.0,0.0,0.0
3,0.0,4.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,6.0


In [15]:
b = csr_matrix(a)
b

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [16]:
b.todense()

matrix([[0., 0., 0., 1., 0.],
        [0., 7., 0., 0., 0.],
        [0., 0., 5., 0., 0.],
        [0., 4., 2., 0., 0.],
        [0., 0., 0., 0., 6.]])

In [17]:
b.nonzero()

(array([0, 1, 2, 3, 3, 4], dtype=int32),
 array([3, 1, 2, 1, 2, 4], dtype=int32))

In [18]:
b.data

array([1., 7., 5., 4., 2., 6.])

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
%%time
cv = CountVectorizer(ngram_range=(1, 2))
X_train_sparse = cv.fit_transform(train_reviews.data)

CPU times: user 33 s, sys: 1.22 s, total: 34.3 s
Wall time: 37.7 s


In [46]:
len(cv.vocabulary_)

1513832

In [47]:
%%time
X_test_sparse = cv.transform(test_reviews.data)

CPU times: user 17.7 s, sys: 87.4 ms, total: 17.8 s
Wall time: 18.2 s


In [48]:
X_train_sparse.shape, X_test_sparse.shape

((25000, 1513832), (25000, 1513832))

In [24]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score

In [25]:
y_train, y_test = train_reviews.target, test_reviews.target

In [26]:
np.bincount(y_train), np.bincount(y_test)

(array([12500, 12500]), array([12500, 12500]))

In [29]:
10 ** 6 / X_train_sparse.shape[0]

40.0

In [40]:
logit = LogisticRegression(random_state=17, n_jobs=2, solver='lbfgs')
sgd_logit = SGDClassifier(max_iter=40, random_state=17, n_jobs=2, tol=1e-3)

In [51]:
%%time
logit.fit(X_train_sparse, y_train)

CPU times: user 158 ms, sys: 310 ms, total: 468 ms
Wall time: 2min 7s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=2,
          penalty='l2', random_state=17, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [49]:
%%time
sgd_logit.fit(X_train_sparse, y_train)

CPU times: user 4.07 s, sys: 47.1 ms, total: 4.11 s
Wall time: 4.21 s


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=40,
       n_iter=None, n_iter_no_change=5, n_jobs=2, penalty='l2',
       power_t=0.5, random_state=17, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [52]:
accuracy_score(y_test, logit.predict(X_test_sparse))

0.8964

In [50]:
accuracy_score(y_test, sgd_logit.predict(X_test_sparse))

0.88228