In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.metrics import f1_score
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from keras import utils

In [None]:
# for local
# processed_directory = '../processed_data/'

# for google drive
import sys, os

# Only attempt the Drive mount when the notebook is actually running on Colab;
# anywhere else the whole branch is skipped and the working directory is untouched.
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/cs4248-project/'
    print(path_to_file)
    # move to Google Drive directory
    os.chdir(path_to_file)
    # Confirm the new working directory from Python itself rather than through
    # the IPython-only `!pwd` shell escape, so the cell is also valid plain Python.
    print(os.getcwd())

Mounted at /content/gdrive
/content/gdrive/My Drive/cs4248-project/
/content/gdrive/My Drive/cs4248-project


In [None]:
# Folder on the mounted Drive that holds the cleaned CSV splits.
data_path = '/content/gdrive/My Drive/cs4248-project/Ailanthus'

# Read the train and test splits in one pass; both live under `data_path`.
train_df, test_df = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('/train_cleaned_common.csv', '/test_cleaned_common.csv')
)

In [None]:
# Peek at the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,id,text,rating,label
0,0,bromwell high cartoon comedy ran time programs...,9,1
1,10000,homelessness george carlin stated issue years ...,8,1
2,10001,brilliant overacting lesley ann warren best dr...,10,1
3,10002,easily underrated film inn brooks cannon sure ...,7,1
4,10003,typical mel brooks film much less slapstick mo...,8,1


In [None]:
# Peek at the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0,id,text,rating,label
0,0,went saw movie last night coaxed friends mine ...,10,1
1,10000,actor turned director bill paxton follows prom...,7,1
2,10001,recreational knowledge sport history pleased d...,9,1
3,10002,saw film sneak preview delightful cinematograp...,8,1
4,10003,bill paxton taken true story 1913 golf open ma...,8,1


In [None]:
# Re-encode the raw labels: 1 becomes 0, while -1 is deliberately left as -1.
mapping = {1: 0, -1: -1}

# Features are the cleaned review strings.
train_x, test_x = train_df["text"], test_df["text"]

# Targets are the remapped `label` column; only that column is touched, so the
# rest of each frame is not copied.
train_y = train_df["label"].replace(mapping)
test_y = test_df["label"].replace(mapping)

# Build the vocabulary from the training reviews only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x)

In [None]:
# Inspect the remapped training labels (the values are 0 and -1 after the mapping).
train_y

0        0
1        0
2        0
3        0
4        0
        ..
24995   -1
24996   -1
24997   -1
24998   -1
24999   -1
Name: label, Length: 25000, dtype: int64

In [None]:
# Turn every training review into a list of integer word indices, using the
# tokenizer that was fitted on the training texts.
sequences = tokenizer.texts_to_sequences(texts=train_x)

In [None]:
# pad sequence to longest sequence
# Measure the tokenizer's own output instead of re-splitting the raw text on
# whitespace: the tokenizer applies its own filtering/splitting rules, so the
# two counts can disagree, and an underestimate would make pad_sequences
# truncate the longest reviews. This also avoids materialising a second,
# throwaway list of token lists for the whole training set.
longest_seq = max(len(seq) for seq in sequences)

print(longest_seq)

1342


In [None]:
# Binary sentiment task: two output classes.
num_classes = 2

# Right-pad every training sequence with zeros up to the common length.
X_train = pad_sequences(sequences, padding='post', maxlen=longest_seq)

# One-hot encode the targets. In the recorded outputs label 0 becomes [1, 0]
# and label -1 becomes [0, 1].
# NOTE(review): the -1 case presumably relies on negative indexing inside
# to_categorical selecting the last column - confirm for the Keras version in use.
Y_train = tf.keras.utils.to_categorical(train_y, num_classes=num_classes)

In [None]:
# Inspect the padded training matrix: one row of word indices per review, zero-filled on the right.
X_train

array([[21153,   204,   905, ...,     0,     0,     0],
       [22584,   557, 17099, ...,     0,     0,     0],
       [  382,  3500, 18910, ...,     0,     0,     0],
       ...,
       [  101,  4563,   120, ...,     0,     0,     0],
       [   29,  1112,  6723, ...,     0,     0,     0],
       [    3,  6841,    29, ...,     0,     0,     0]], dtype=int32)

In [None]:
# Inspect the one-hot encoded training targets (two columns, one per class).
Y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [None]:
# Encode the test reviews with the tokenizer fitted on the training texts.
sequences_test = tokenizer.texts_to_sequences(texts=test_x)

# Pad to the same length as the training matrix so both share one input shape,
# and one-hot encode the test targets the same way as the training targets.
X_test = pad_sequences(sequences_test, padding='post', maxlen=longest_seq)
Y_test = tf.keras.utils.to_categorical(test_y, num_classes=num_classes)

In [None]:
# Size of the embedding table: one slot per known word plus one extra slot.
# NOTE(review): the extra slot is presumably for index 0, which the padded
# arrays use as filler - confirm that Tokenizer word indices start at 1.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# 1-D convolutional text classifier over the padded word-index sequences.
model = Sequential()
# Learn a 50-dimensional vector for every vocabulary entry.
model.add(Embedding(vocab_size, 50, input_length=longest_seq))
# Slide 32 filters over windows of 8 consecutive word vectors.
model.add(Conv1D(filters=32, kernel_size=8, activation="relu"))
# Halve the temporal resolution before flattening.
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation="relu"))
# Softmax (not sigmoid) on the output: the targets are one-hot vectors and the
# loss is categorical_crossentropy, which expects the outputs to form a single
# probability distribution over the classes. The width follows num_classes so
# it stays in step with the one-hot encoding of the targets.
model.add(Dense(num_classes, activation="softmax"))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1342, 50)          1957150   
                                                                 
 conv1d_2 (Conv1D)           (None, 1335, 32)          12832     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 667, 32)          0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 21344)             0         
                                                                 
 dense_4 (Dense)             (None, 10)                213450    
                                                                 
 dense_5 (Dense)             (None, 2)                 22        
                                                      

In [None]:
# Train for 10 epochs on the full training set (no validation data is passed)
# and keep the object returned by fit for later inspection.
history = model.fit(x=X_train, y=Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the trained model on the held-out test arrays; evaluate returns the
# loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test, Y_test)

# Report the accuracy as a percentage.
accuracy_pct = accuracy * 100
print('Test Accuracy: %f' % accuracy_pct)

Test Accuracy: 85.211998


In [None]:
def get_verdict(x):
    """Translate a prediction column name into the label encoding.

    Args:
        x: Column name of the winning model output.

    Returns:
        0 when ``x`` equals ``'A'``; -1 for any other value.
    """
    return 0 if x == 'A' else -1

# Raw model outputs for every test review, one column per output unit:
# 'A' is unit 0 and 'B' is unit 1.
pred_scores = model.predict(X_test)
df = pd.DataFrame(pred_scores, columns=['A', 'B'])

# For each row pick the column with the larger score, then translate the
# column name into the same label encoding that test_y uses.
winning_column = df.idxmax(axis=1)
y_pred = winning_column.map(get_verdict).to_numpy()

# Macro-averaged F1 between the true and the predicted labels.
score = f1_score(test_y, y_pred, average='macro')
print('f1 score = {}'.format(score))

f1 score = 0.8521813446115024
