## Movie Review Classifier
### (Natural Languate Processing and Machine Learning)
#### Large Movie Review Dataset v1.0

Collected by Andrew Maas from Stanford.<br>
http://ai.stanford.edu/~amaas/data/sentiment/index.html

50,000 IMDB movie review texts<br>
Binary classification problem: positve, negative<br>
25,000 highly polar movie reviews for training<br>
25,000 for testing.<br>

In [15]:
!tree -L 2 -C HowToBuild/aclImdb/

[01;34mHowToBuild/aclImdb/[00m
├── README
├── imdb.vocab
├── imdbEr.txt
├── [01;34mtest[00m
│   ├── labeledBow.feat
│   ├── [01;34mneg[00m
│   ├── [01;34mpos[00m
│   ├── urls_neg.txt
│   └── urls_pos.txt
└── [01;34mtrain[00m
    ├── labeledBow.feat
    ├── [01;34mneg[00m
    ├── [01;34mpos[00m
    ├── [01;34munsup[00m
    ├── unsupBow.feat
    ├── urls_neg.txt
    ├── urls_pos.txt
    └── urls_unsup.txt

7 directories, 11 files


### Load files into a training data set

In [32]:
from sklearn.datasets import load_files
review_train = load_files('HowToBuild/aclImdb/train/', 
                          categories=['neg','pos'], shuffle=False)
type(review_train)

sklearn.utils.Bunch

In [33]:
review_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [34]:
print(len(review_train.data))
print(review_train.target.shape)

25000
(25000,)


In [35]:
print(review_train.filenames[:10])

['HowToBuild/aclImdb/train/neg/0_3.txt'
 'HowToBuild/aclImdb/train/neg/10000_4.txt'
 'HowToBuild/aclImdb/train/neg/10001_4.txt'
 'HowToBuild/aclImdb/train/neg/10002_1.txt'
 'HowToBuild/aclImdb/train/neg/10003_1.txt'
 'HowToBuild/aclImdb/train/neg/10004_3.txt'
 'HowToBuild/aclImdb/train/neg/10005_3.txt'
 'HowToBuild/aclImdb/train/neg/10006_4.txt'
 'HowToBuild/aclImdb/train/neg/10007_1.txt'
 'HowToBuild/aclImdb/train/neg/10008_2.txt']


In [41]:
X, y = review_train.data, review_train.target
#print(len(X), y.shape)
X = [txt.replace(b'<br />',b'') for txt in X]
#print(X[10])
print(y[-10:])

[1 1 1 1 1 1 1 1 1 1]


In [44]:
type(X), type(y)

(list, numpy.ndarray)

### Convert Texts into numerical arrays (Bag of Words)

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
vocab = CountVectorizer()
vocab.fit(X)
x = vocab.transform(X)
type(x)

scipy.sparse.csr.csr_matrix

In [47]:
x.shape

(25000, 75911)

In [89]:
x[1,3]

0

In [50]:
print(vocab.get_feature_names()[:10])
print(vocab.get_feature_names()[-10:])
print(vocab.get_feature_names()[35000:35010])

['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007']
['était', 'état', 'étc', 'évery', 'êxtase', 'ís', 'ísnt', 'østbye', 'über', 'üvegtigris']
['ireland', 'irena', 'irene', 'iridescent', 'irina', 'iris', 'irises', 'irish', 'irishman', 'irishmen']


In [52]:
import numpy as np 
word_total_counts = np.sum(x, axis=0)
type(word_total_counts)

numpy.matrixlib.defmatrix.matrix

In [57]:
word_total_counts

matrix([[ 93, 300,   1, ...,   1,   4,   1]], dtype=int64)

In [59]:
print(word_total_counts[0,1])

300


In [81]:
idx_sorted = np.argsort(word_total_counts)
print(idx_sorted.shape)
print(idx_sorted[0, -10:])
print(idx_sorted[0, :10])
words_list = vocab.get_feature_names()

most_freq_words  = [words_list[idx_sorted[0,i]] for i in range(-10,0)]
least_freq_words = [words_list[idx_sorted[0,i]] for i in range(10)]
print(most_freq_words)
print(least_freq_words)

(1, 75911)
[[73209 67222 67505 33505 35211 35099 68091 47352  3375 67244]]
[[37955 39352 39354 39359 39360 39361 39366 39376 39346 39383]]
['was', 'that', 'this', 'in', 'it', 'is', 'to', 'of', 'and', 'the']
['labourers', 'lilienthal', 'lilililililii', 'lilleheie', 'lillete', 'lilley', 'lillihamer', 'liman', 'likings', 'limbless']


In [84]:
words_list[39383]

'limbless'

### Choose Machine Learning Algorithm and Start Training

In [None]:
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(x)

#load test data
review_test = load_files('HowToBuild/aclImdb/test/', 
                         categories=['neg','pos'], shuffle=False)
X_test, y = review_test.data, review_test.target
x_test = Coun

### Test Model Accuracy and Improve the Model