In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import random
import pickle

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seanmhiggins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
X = list(df['review'])
y = []
for i in df['sentiment']:
    if i == 'positive':
        y.append(1)
    else:
        y.append(0)

In [4]:
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9()!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

for i in range(len(X)):
    review = X[i]
    X[i] = clean_str(review)

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


NUM_WORDS = 20000

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 111843 unique tokens.


In [6]:
X = pad_sequences(sequences, maxlen=30000)

In [7]:
X = pd.DataFrame(X)
y = pd.DataFrame(y)

print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (50000, 30000)
Shape of label tensor: (50000, 1)


In [8]:
from keras.utils import np_utils

y = np_utils.to_categorical(y)

In [9]:
from sklearn.model_selection import train_test_split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.25)

print (len(X_train))
print (len(X_val))
print (len(X_test))
# 60, 20, 20 split

30000
10000
10000


In [10]:
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

x_train shape: (30000, 30000)
x_test shape: (10000, 30000)


In [11]:
pickle_in = open("../../HW 3/pretrained_word2vec_model","rb")
pretrained_model = pickle.load(pickle_in)

In [12]:
EMBEDDING_DIM = 300
vocabulary_size = min(len(word_index)+1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
        
    try:
        embedding_vector = pretrained_model[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(pretrained_model)

In [13]:
preprocessed_data = [X_train, y_train, X_val, y_val, X_test, y_test, embedding_matrix]

pickle_out = open("preprocessed_CNN_w2v.p", "wb")
pickle.dump(preprocessed_data, pickle_out)
pickle_out.close()
print('done')

done


In [14]:
import pickle
pickle_in = open("preprocessed_CNN_w2v.p","rb")
preprocessed_data = pickle.load(pickle_in)
X_train, y_train, X_val, y_val, X_test, y_test, embedding_matrix = preprocessed_data

In [15]:
from keras.layers import Embedding
EMBEDDING_DIM=300
vocabulary_size=min(len(word_index)+1,NUM_WORDS)

In [16]:
# from keras.layers import Dense, Input, GlobalMaxPooling1D
# from keras.layers import Conv1D, MaxPooling1D, Embedding
# from keras.models import Model
# from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
# from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers


from keras.models import Sequential, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import GlobalMaxPooling1D

sequence_length = X_train.shape[1]
filterSize = 3
num_filters = 100
dropout = 0.5
units = 32

In [17]:
model = Sequential()

model.add(Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True))
          
model.add(Conv1D(num_filters, filterSize, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout))
model.add(Dense(units, activation='relu'))
model.add(Dense(2, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)))


model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         6000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 100)         90100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                3232      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 6,093,398
Trainable params: 6,093,398
Non-trainable params: 0
____________________________________________

In [18]:
adam = Adam(lr=1e-3)

model.compile(loss='categorical_crossentropy', # sparse_
              optimizer=adam,
              metrics=['acc'])
callbacks = [EarlyStopping(monitor='val_loss')]



In [19]:
model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_data=(X_val, y_val),
         callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 30000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.callbacks.History at 0x1a1ce959d0>

In [20]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 89.17%
