In [43]:
from __future__ import absolute_import, print_function, unicode_literals, division
import os, sys

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds

In [2]:
# Sanity check: list the top-level entries of the dataset directory
# (resolved relative to the current working directory).
os.listdir('aclImdb')

['.DS_Store', 'imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

## Load the IMDB Data

In [3]:
# Start from two independent, empty frames that share the same schema:
# the raw review text and its sentiment label.
data_train, data_test = (
    pd.DataFrame(columns=['text', 'target']) for _split in ('train', 'test')
)

In [4]:
"""
Train data
"""
def load_reviews(split, label, target, limit=500):
    """Read up to `limit` review files from ``aclImdb/<split>/<label>``.

    Args:
        split: sub-directory of ``aclImdb`` to read from ('train' or 'test').
        label: sub-directory of the split ('pos' or 'neg').
        target: value stored in the 'target' column for every review read.
        limit: maximum number of files to read from the directory.

    Returns:
        A list of ``{'text': <file contents>, 'target': target}`` dicts,
        one per file read.
    """
    basic_path = os.path.join('aclImdb', split, label)
    # os.listdir() returns names in arbitrary order, so sort them to pick the
    # same files on every run; skip hidden entries such as macOS '.DS_Store'.
    names = sorted(n for n in os.listdir(basic_path) if not n.startswith('.'))
    records = []
    for name in names[:limit]:
        # Explicit encoding so the result does not depend on the platform
        # default (review files are expected to be UTF-8 -- TODO confirm).
        with open(os.path.join(basic_path, name), 'r', encoding='utf-8') as f:
            records.append({'text': f.read(), 'target': target})
    print('[info] {} \'{}\' data loaded'.format(split, label))
    return records


# Train data
# Build each frame once from a list of records: growing a frame with
# DataFrame.append copies it on every call (quadratic) and the method was
# removed in pandas 2.0. Rebuilding from scratch also keeps this cell
# idempotent, so re-running it does not duplicate rows.
data_train = pd.DataFrame(
    load_reviews('train', 'pos', 1) + load_reviews('train', 'neg', 0),
    columns=['text', 'target'],
)

# Test data
data_test = pd.DataFrame(
    load_reviews('test', 'pos', 1) + load_reviews('test', 'neg', 0),
    columns=['text', 'target'],
)

[info] train 'pos' data loaded
[info] train 'neg' data loaded
[info] test 'pos' data loaded
[info] test 'neg' data loaded


In [12]:
# Shuffle both splits so positive and negative reviews are interleaved, then
# persist them as CSV. A fixed seed keeps the row order (and therefore the
# written files) reproducible across kernel restarts.
SEED = 42
data_train = shuffle(data_train, random_state=SEED).reset_index(drop=True)
data_test = shuffle(data_test, random_state=SEED).reset_index(drop=True)

data_train.to_csv('aclImdb/train.csv', index=False)
# The test file must receive the test split, not a second copy of data_train.
data_test.to_csv('aclImdb/test.csv', index=False)

In [15]:
# Display the training frame ('text' and 'target' columns) as the cell output.
data_train

Unnamed: 0,text,target
0,"Another Aussie masterpiece, this delves into t...",1
1,I borrowed this movie from library think it mi...,0
2,Cuban Blood is one of those sleeper films that...,1
3,`Castle of Blood' (aka `Castle of Terror') is ...,1
4,"This movie is so bad, I knew how it ends right...",0
...,...,...
995,I can hardly believe I watched this again last...,0
996,for whoever play games video games here did an...,1
997,A thematic staple of cinema since its inceptio...,0
998,"As this movie is completely in Swiss dialect, ...",1


## Working with Word Embeddings

In [45]:
# Bag-of-words vectorizer from scikit-learn, created with default settings.
vectorizer = CountVectorizer()

In [49]:
# Fit the vectorizer on a single toy sentence; the cell output is the fitted
# estimator's repr.
vectorizer.fit(['Hello, world!'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [40]:
# The TFDS text utilities live under `tfds.deprecated.text` in newer releases
# and under `tfds.features.text` in older ones; try the new location first so
# this cell works with either.
try:
    _tfds_text = tfds.deprecated.text
except AttributeError:
    _tfds_text = tfds.features.text
encoder = _tfds_text.Tokenizer()

In [42]:
# Split the toy sentence into tokens; the cell output is the resulting list.
encoder.tokenize('Hello, world!')

['Hello', 'world']