In [3]:
# Mount Google Drive at '/content/drive' so the files referenced by later cells are reachable.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import tensorflow as tf
import tarfile
from tensorflow.compat.v1.keras import backend as K
from keras.preprocessing import sequence # helper module to handle padding
from keras.models import Sequential # the model
from keras.layers import Dense, Dropout, Activation # layer objects
from keras.layers import Conv1D, GlobalMaxPooling1D # convolution layer and pooling
import numpy as np
import os
import glob
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath

# Load the pre-trained Google News word2vec vectors (binary word2vec format).
# The path is handed to gensim directly: the `datapath` helper from gensim.test.utils
# resolves names inside gensim's own bundled test-data folder and is not meant for user files.
# `limit=200000` reads only the first 200,000 vectors of the file.
word2vec_path = '/content/drive/My Drive/machine_learning/train/GoogleNews-vectors-negative300.bin.gz'
word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=200000)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# directory that holds the review data (only the path is defined here; nothing is read yet)
data = '/content/drive/My Drive/machine_learning/train'

In [9]:

# preprocess movie reviews
def preprocess_data(filepath):
    """Read the labelled review files below `filepath` into one shuffled list.

    Positive reviews are read from ``<filepath>/pos/pos/*.txt`` and negative
    reviews from ``<filepath>/neg/neg/*.txt``.

    Args:
        filepath: Directory containing the ``pos/pos`` and ``neg/neg`` folders.

    Returns:
        A list of ``(label, text)`` tuples in random order, where ``label`` is
        1 for a positive review and 0 for a negative one. The list is empty
        when no ``.txt`` files are found.
    """
    # (label, folder) pairs; positive reviews are collected first, then negative ones
    labelled_dirs = (
        (1, os.path.join(filepath, 'pos/pos')),
        (0, os.path.join(filepath, 'neg/neg')),
    )

    dataset = []
    for label, folder in labelled_dirs:
        # sorted() makes the pre-shuffle order independent of the file system's listing order
        for filename in sorted(glob.glob(os.path.join(folder, '*.txt'))):
            # decode explicitly so the text does not depend on the platform's default encoding
            # NOTE(review): assumes the review files are UTF-8 — confirm against the dataset
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    # mix positive and negative reviews so a later positional split sees both classes
    shuffle(dataset)
    return dataset


In [10]:
# tokenize + transform to word vectors    
def tokenize_and_vectorize(dataset):
    """Turn every review into the list of word vectors of its known tokens.

    The text of each sample (element 1 of the sample) is split with NLTK's
    TreebankWordTokenizer and every token is looked up in the module-level
    `word_vectors`. Tokens whose lookup raises KeyError are skipped.

    Returns:
        A list with one entry per sample, in input order; each entry is the
        list of vectors found for that sample (possibly empty).
    """
    splitter = TreebankWordTokenizer()

    def embed(text):
        # gather the vectors of all tokens the embedding lookup knows
        found = []
        for word in splitter.tokenize(text):
            try:
                vector = word_vectors[word]
            except KeyError:
                # token has no entry in word_vectors; leave it out
                continue
            found.append(vector)
        return found

    return [embed(sample[1]) for sample in dataset]
            
    
   

In [11]:
# collect target values in the same order as reviews
def collect_label(dataset):
    """Return the label (element 0) of every sample, in dataset order."""
    return [sample[0] for sample in dataset]

In [12]:
# run the preprocessing functions on the dataset

# shuffled list of (label, review text) tuples; label is 1 (positive) or 0 (negative)
preprocessed_data = preprocess_data(data)

# one list of word vectors per review, in the same order as preprocessed_data
vectorized_data = tokenize_and_vectorize(preprocessed_data)

# labels in the same order as the reviews
expected_label = collect_label(preprocessed_data)

In [13]:
# train/test split: the first 80% of the samples train the model, the rest is held out

# index of the first held-out sample
split_point = int(len(vectorized_data) * .8)

# features, cut at the split point
x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]

# labels, cut at the same index so they stay aligned with the features
y_train, y_test = expected_label[:split_point], expected_label[split_point:]

In [20]:
# set CNN parameters
maxlen = 400  # length of reviews: 400 tokens (truncate longer, pad shorter)
batch_size = 30 # how many samples to process before backprop
embedding_dims = 300 # length of the token vectors to pass to convnet
filters = 250 # num of filters to train
kernel_size = 3 # width of the filter (three tokens)
hidden_dims = 250 # num of neurons in the feedforward net at the end
epochs = 30 # num of times to pass the entire train set to network

In [15]:
# pad and truncate token sequence (seqs of vectors)
# note: pad_sequences by keras can only be used for seqs of scalars

def pad_or_truncate(data, maxlen, vector_size=None):
    """Bring every sample to exactly `maxlen` vectors.

    Samples longer than `maxlen` are cut after their first `maxlen` vectors;
    shorter samples are filled up at the end with all-zero vectors.

    Args:
        data: Sequence of samples; each sample is a sequence of equally
            long vectors.
        maxlen: Number of vectors every returned sample will have.
        vector_size: Optional length of a single vector. When omitted it is
            taken from the first non-empty sample.

    Returns:
        A new list holding one new list per sample. The input samples are
        never modified. All padding positions refer to one shared zero-vector
        object, so treat the padding as read-only.

    Raises:
        ValueError: If padding is needed but the vector length is unknown
            (no `vector_size` given and every sample is empty).
    """
    if vector_size is None:
        # do not rely on data[0]: the first sample may contain no vectors at all
        vector_size = next(
            (len(sample[0]) for sample in data if len(sample) > 0), None)

    zero_vector = None if vector_size is None else [0.0] * vector_size

    new_data = []
    for sample in data:
        # slice + list() always copies, so padding is never appended to the caller's list
        resized = list(sample[:maxlen])
        missing = maxlen - len(resized)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    'cannot pad: vector length unknown because every sample is '
                    'empty; pass vector_size explicitly')
            resized.extend([zero_vector] * missing)
        new_data.append(resized)

    return new_data


In [16]:
# bring every review to exactly maxlen vectors, then stack the result into arrays
target_shape = (maxlen, embedding_dims)

# training set: features of shape (samples, maxlen, embedding_dims) plus label array
x_train = pad_or_truncate(x_train, maxlen)
x_train = np.asarray(x_train).reshape((len(x_train),) + target_shape)
y_train = np.array(y_train)

# test set: same treatment
x_test = pad_or_truncate(x_test, maxlen)
x_test = np.asarray(x_test).reshape((len(x_test),) + target_shape)
y_test = np.array(y_test)


In [24]:
# build model: convolution -> global max pooling -> dense hidden layer -> sigmoid output
model = Sequential([
    # 1D convolution sliding over windows of kernel_size consecutive token vectors
    Conv1D(filters,
           kernel_size,
           padding='valid',  # no padding at the sequence borders
           activation='relu',
           strides=1,  # move the window one token at a time
           input_shape=(maxlen, embedding_dims)),

    # keep only the largest activation of each filter over the whole sequence
    GlobalMaxPooling1D(),

    # fully-connected hidden layer with dropout
    Dense(hidden_dims),
    Dropout(0.1),
    Activation('relu'),

    # single-unit output squashed to (0, 1)
    Dense(1),
    Activation('sigmoid'),
])



In [25]:
# configure the model for training: loss function, optimizer and reported metric
model.compile(loss='binary_crossentropy', # binary: labels are 0/1
              optimizer='adam',
              metrics=['accuracy'])

In [26]:
# fit trains the model; x_test/y_test are evaluated after every epoch as validation data
# NOTE(review): in the recorded run the training loss keeps shrinking towards 0 while
# val_loss is lowest at epoch 3 (0.2950) and rises afterwards — the later epochs look
# like overfitting; consider fewer epochs or early stopping.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=2,
          validation_data=(x_test, y_test))

Epoch 1/30
118/118 - 38s - loss: 0.5531 - accuracy: 0.6991 - val_loss: 0.4134 - val_accuracy: 0.8000
Epoch 2/30
118/118 - 37s - loss: 0.2658 - accuracy: 0.8919 - val_loss: 0.3040 - val_accuracy: 0.8864
Epoch 3/30
118/118 - 37s - loss: 0.0940 - accuracy: 0.9764 - val_loss: 0.2950 - val_accuracy: 0.8807
Epoch 4/30
118/118 - 37s - loss: 0.0190 - accuracy: 0.9991 - val_loss: 0.3187 - val_accuracy: 0.8818
Epoch 5/30
118/118 - 36s - loss: 0.0052 - accuracy: 1.0000 - val_loss: 0.3306 - val_accuracy: 0.8841
Epoch 6/30
118/118 - 37s - loss: 0.0025 - accuracy: 1.0000 - val_loss: 0.3450 - val_accuracy: 0.8864
Epoch 7/30
118/118 - 36s - loss: 0.0015 - accuracy: 1.0000 - val_loss: 0.3560 - val_accuracy: 0.8830
Epoch 8/30
118/118 - 37s - loss: 0.0010 - accuracy: 1.0000 - val_loss: 0.3665 - val_accuracy: 0.8830
Epoch 9/30
118/118 - 37s - loss: 7.6675e-04 - accuracy: 1.0000 - val_loss: 0.3751 - val_accuracy: 0.8841
Epoch 10/30
118/118 - 37s - loss: 5.8240e-04 - accuracy: 1.0000 - val_loss: 0.3802 - va

<tensorflow.python.keras.callbacks.History at 0x7f13f39045c0>