# IMDB Dataset Experiments continued 

Import all modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import os
import time
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_csv('imdb_d-set_train.csv')
df_test = pd.read_csv('imdb_d-set_test.csv')

Data Cleaning and Processing.

In [3]:
import nltk
import re

dataSet_train = df_train ['data']
dataSet_test = df_test ['data']

#Lowercase all data
dataSet_train= dataSet_train.map(lambda x: x.lower())
dataSet_test= dataSet_test.map(lambda x: x.lower())

#Remove-Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWord = stopwords.words('english')

dataSet_train.apply(lambda x: [item for item in x if item not in stopWord])
dataSet_test.apply(lambda x: [item for item in x if item not in stopWord])

#Cleanup
def cleanup(text):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return re.sub('\[[^]]*\]', '', text)


dataSet_train = dataSet_train.apply(cleanup)
dataSet_test = dataSet_test.apply(cleanup)

#Lemma
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()
dataSet_train = [wordnet_lemmatizer.lemmatize(word) for word in dataSet_train]
dataSet_test = [wordnet_lemmatizer.lemmatize(word) for word in dataSet_test]

#Stem
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
dataSet_train = [stemmer.stem(word) for word in dataSet_train]
dataSet_test = [stemmer.stem(word) for word in dataSet_test]

#Fit-data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

train_label = df_train['label']
Y_train = train_label.astype('int')
vector = CountVectorizer(binary=True, ngram_range=(1,2))
X_train = vector.fit_transform(dataSet_train)
#tfidf_transformer = TfidfTransformer()
#X_train = tfidf_transformer.fit_transform(train_data)

test_label = df_test['label']
Y_test = train_label.astype('int')
X_test = vector.transform(dataSet_test)
#X_test = tfidf_transformer.transform(test_data)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from keras import Sequential
from keras import layers

Using TensorFlow backend.


In [5]:
from sklearn.utils import shuffle

X_train,Y_train= shuffle (X_train,Y_train, random_state = 10)

Add Layers and compile

In [12]:
input_dim = X_train.shape[1]

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
               optimizer='AdaGrad', 
               metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 10)                16464840  
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 16,464,851
Trainable params: 16,464,851
Non-trainable params: 0
_________________________________________________________________


Train the model

In [13]:
history = model.fit(X_train, Y_train,
                     epochs=1,
                     verbose=1,
                     validation_split=1,
                     batch_size=200)

Epoch 1/1


In [14]:
loss, accuracy = model.evaluate(X_test, Y_test, verbose=True)
print("Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.9083
