In [0]:
# Downloading the test and training the data
!wget -q https://l1nna.com/372/Assignment/A2-3/train.csv
!wget -q https://l1nna.com/372/Assignment/A2-3/test.csv

In [0]:
import pandas as pd
import csv

# Reading the csv files into the variables using pandas so they can be used in the notebook
#   index_col='id': index column based on id as it is already unique
xy_train_df = pd.read_csv('train.csv')
x_test_df  = pd.read_csv('test.csv' , index_col='id')

# Add a new column 'length' for the training dataset containing the length of the review for each observation
xy_train_df['length'] = xy_train_df.apply(lambda x: len(x.review), axis=1)

# Sort the observations based on the length 
xy_train_df = xy_train_df.sort_values('length')
xy_train_df

Unnamed: 0,id,rating,review,length
6037,2596,1,Five Stars_GOOD,15
5353,4643,1,Love it_Love it,15
2545,8791,1,Five Stars_Good,15
3902,6098,1,Five Stars_love!,16
2850,4609,1,love these_so cute!,19
...,...,...,...,...
5651,518,1,"So far, it's awesome_Ok, so I'll say up front ...",5765
1615,124,1,It Works (Read Tips For Potential Effectivenes...,6740
5046,7257,1,An exquisitely effective product with an astou...,8082
4859,7555,1,Gorgeous professional looking manicure at home...,8134


In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

vocab_size = 10000
# Maximum number of word in text
max_len = 256

# Split the original training data (xy_train_df) into 20% of xy_validation (validation set) and 80% of xy_train (training set)
xy_train, xy_validation = train_test_split(xy_train_df, test_size=0.2)

# build vocabulary from training set
#   Tokenizer: based on the data, removes any of the following characters “!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"
#   fit_on_texts: Updates internal vocabulary based on a list of texts.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(xy_train.review)

# padding is done inside: 
#   Convert a list of texts (review) to a Numpy matrix with 256 columns with binary values
#     mode='binary': Whether the word exists (1) or not (0) in the text. Helps in teaching the model.
x_train = tokenizer.texts_to_matrix(xy_train.review, mode='binary')[:, :max_len]
y_train = xy_train.rating

x_valid = tokenizer.texts_to_matrix(xy_validation.review, mode='binary')[:, :max_len]
y_valid = xy_validation.rating

x_test = tokenizer.texts_to_matrix(x_test_df.review, mode='binary')[:, :max_len]

print(x_train.shape)
print(x_valid.shape)
print(x_test.shape)

[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 1. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
(4978, 256)
(1245, 256)
(2667, 256)


In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras


import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# model: Sequential model is a linear stack of layers
model = keras.Sequential()
# Embedding
#   Given the number of unqiue words (vocab_size) and size of embedding vector
#   Create a table of vector size (10000, 2) with 20 weights for the embedding vector for each unique word
model.add(keras.layers.Embedding(vocab_size, 20))


### We are adding layers in the model

#---- CuDNNGRU works only for tf 1.x ----#
#   Parameter=100: 100 dimensionality of the output space
#   CuDNNLSTM: Fast LSTM implementation backed by cuDNN
#       Shape is (8 * 100) based on the source code on GitHub
model.add(tf.compat.v1.keras.layers.CuDNNLSTM(100))

# Fully connected layers are defined using Dense
# Output layer has one node and uses the sigmoid activation function
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

## For the model
# Using loss function of binary_crossentropy to evaluate a set of weights
# Optimizer Adam is used to search through different weights for the network
# clipnorm=4: All parameter gradients will be clipped to a maximum norm of 4. 
#             Helps in preventing messing up of parameters due to vanishing/exploding gradients
model.compile(
    optimizer=Adam(clipnorm=4.),
    loss='binary_crossentropy',
    metrics=['accuracy'])


# Train on x_train as input and y_train as output
# Use validation set to tune the parameter
# batch_size is the number of samples per gradient update
# 15 epochs to train the model. An epoch is an iteration over the entire x and y data provided.
# verbose as 1 to see progress bar
history = model.fit(x_train,
                    y_train,
                    epochs=15,
                    batch_size=64,
                    validation_data=(x_valid, y_valid),
                    verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [0]:
# Evaluate the model based on validation set.
model.evaluate(x_valid, y_valid)



[0.3693195879459381, 0.8811244964599609]

In [0]:
y_predict = np.squeeze(model.predict_classes(x_valid),  axis=-1)

from sklearn.metrics import  f1_score
from sklearn.metrics import confusion_matrix

print(f1_score(y_valid, y_predict, average='micro'))

0.8811244979919679


In [0]:
# run on testing set:
y_predict = np.squeeze(model.predict_classes(x_test), axis=-1)

# Convert output by adding the predicted id and rating for the test and add to file for sample_submission.csv
pd.DataFrame(
    {'id': x_test_df.index,
     'rating':y_predict}).to_csv('sample_submission.csv', index=False)