In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from keras import utils

In [2]:
# for local
# processed_directory = '../processed_data/'

# for google drive
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/cs4248-project/'
    print(path_to_file)
    # move to Google Drive directory
    os.chdir(path_to_file)
    !pwd

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/cs4248-project/
/content/gdrive/My Drive/cs4248-project


In [3]:
data_path = '/content/gdrive/My Drive/cs4248-project/processed_data'
train_df = pd.read_json(data_path+'/train.json')
test_df = pd.read_json(data_path+'/test.json')

In [4]:
train_df.head()

Unnamed: 0,id,text,rating,label
0,0,Bromwell High is a cartoon comedy. It ran at t...,9,+
1,10000,Homelessness (or Houselessness as George Carli...,8,+
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10,+
3,10002,This is easily the most underrated film inn th...,7,+
4,10003,This is not the typical Mel Brooks film. It wa...,8,+


In [5]:
test_df.head()

Unnamed: 0,id,text,rating,label
0,0,I went and saw this movie last night after bei...,10,+
1,10000,Actor turned director Bill Paxton follows up h...,7,+
2,10001,As a recreational golfer with some knowledge o...,9,+
3,10002,"I saw this film in a sneak preview, and it is ...",8,+
4,10003,Bill Paxton has taken the true story of the 19...,8,+


In [6]:
mapping = {'+': 0, '-': -1}

train_x = train_df["text"]
train_y = train_df.replace({'label': mapping})
train_y = train_y["label"]

test_x = test_df["text"]
test_y = test_df.replace({'label': mapping})
test_y = test_y["label"]

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=0)

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [9]:
train_y

0        0
1        0
2        0
3        0
4        0
        ..
24995   -1
24996   -1
24997   -1
24998   -1
24999   -1
Name: label, Length: 25000, dtype: int64

In [10]:
sequences = tokenizer.texts_to_sequences(x_train)

In [11]:
# pad sequence to longest sequence
train_x_arr = [sentence.split() for sentence in x_train]
longest_seq = max(len(arr) for arr in train_x_arr)

print(longest_seq)

2470


In [20]:
num_classes = 2

X_train = pad_sequences(sequences, maxlen=longest_seq, padding='post')
Y_train = tf.keras.utils.to_categorical(y_train, num_classes)

In [13]:
X_train

array([[  51,   10,  384, ...,    0,    0,    0],
       [  49, 1555,    2, ...,    0,    0,    0],
       [   3,  865,   19, ...,    0,    0,    0],
       ...,
       [  10,   25,  630, ...,    0,    0,    0],
       [   3,  752,    4, ...,    0,    0,    0],
       [1938,   10,   64, ...,    0,    0,    0]], dtype=int32)

In [14]:
Y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [15]:
sequences_test_subset = tokenizer.texts_to_sequences(x_test)

X_test_subset = pad_sequences(sequences_test_subset, maxlen=longest_seq, padding='post')
Y_test_subset = tf.keras.utils.to_categorical(y_test, num_classes)

In [16]:
sequences_test = tokenizer.texts_to_sequences(test_x)

X_test = pad_sequences(sequences_test, maxlen=longest_seq, padding='post')
Y_test = tf.keras.utils.to_categorical(test_y, num_classes)

In [17]:
vocab_size = len(tokenizer.word_index) + 1

In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=longest_seq))
model.add(Conv1D(filters=16, kernel_size=4, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation="relu"))
model.add(Dense(2, activation="softmax"))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 2470, 100)         8007600   
                                                                 
 conv1d_1 (Conv1D)           (None, 2467, 16)          6416      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 1233, 16)         0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 19728)             0         
                                                                 
 dense_2 (Dense)             (None, 10)                197290    
                                                                 
 dense_3 (Dense)             (None, 2)                 22        
                                                      

In [22]:
history = model.fit(X_train, Y_train, epochs=9, batch_size=20)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [23]:
loss, accuracy = model.evaluate(X_test_subset, Y_test_subset)

print('Test Accuracy: %f' % (accuracy*100))

Test Accuracy: 88.520002


In [24]:
loss, accuracy = model.evaluate(X_test, Y_test)

print('Test Accuracy: %f' % (accuracy*100))

Test Accuracy: 86.383998


In [25]:
def get_verdict(x):
    if x == 'A':
        return 0
    else:
        return -1

y_pred = model.predict(X_test_subset)
df = pd.DataFrame(y_pred, columns = ['A', 'B'])

y_pred = pd.DataFrame.idxmax(df, axis=1)
y_pred = y_pred.apply(lambda x: get_verdict(x)).to_frame()

y_pred = y_pred.to_numpy().flatten()

y_pred

score = f1_score(y_test, y_pred, average='macro')
print('f1 score = {}'.format(score))

f1 score = 0.8851720554714195


In [26]:
y_pred = model.predict(X_test)
df = pd.DataFrame(y_pred, columns = ['A', 'B'])

y_pred = pd.DataFrame.idxmax(df, axis=1)
y_pred = y_pred.apply(lambda x: get_verdict(x)).to_frame()

y_pred = y_pred.to_numpy().flatten()

y_pred

score = f1_score(test_y, y_pred, average='macro')
print('f1 score = {}'.format(score))

f1 score = 0.8638364305537252
