## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score

import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import models
from keras import layers
from keras import losses
from keras import metrics
from keras import optimizers

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load Dataset

In [15]:
data = pd.read_csv('/content/data_cleaned.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,label,tweet
0,0,0,when a father is dysfunctional and is so sel...
1,1,0,thanks for lyft credit i cant use cause they...
2,2,0,bihday your majesty
3,3,0,model i love u take with u all the time in u...
4,4,0,factsguide society now motivation


## EDA : show samples of data texts to find out required preprocessing steps

In [16]:
print(data['tweet'][0], '\n')
print(data['tweet'][1], '\n')
print(data['tweet'][2], '\n')
print(data['tweet'][3], '\n')
print(data['tweet'][4], '\n')
print(data['tweet'][50], '\n')

  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction   run 

  thanks for lyft credit i cant use cause they dont offer wheelchair vans in pdx    disapointed getthanked 

  bihday your majesty 

model   i love u take with u all the time in ur             

 factsguide society now    motivation 

for her bihday we got her a nose job            bihday petunia we love you    



## split datasets

In [17]:
# get features and labels
X = data['tweet']
Y = data['label']

In [49]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.2, stratify=Y, random_state=42)

print("Size of x_train:", (x_train.shape))
print("Size of y_train:", (y_train.shape))
print("Size of x_test: ", (x_test.shape))
print("Size of y_test: ", (y_test.shape))

Size of x_train: (23624,)
Size of y_train: (23624,)
Size of x_test:  (5906,)
Size of y_test:  (5906,)


## tokinize text 

In [50]:
def text2words(text):
  return word_tokenize(text)

In [51]:
train_text = [text2words(t) for t in x_train]
test_text = [text2words(t) for t in x_test]

In [52]:
train_text[0]

['its',
 'actually',
 'stopped',
 'rainingfor',
 'nowso',
 'bike',
 'ride',
 'it',
 'is',
 'then',
 'cycling',
 'mtb']

## prepare data for RNN

In [53]:
vocab_sz = 1000
tok = Tokenizer(num_words=vocab_sz, oov_token='UNK')
texts = train_text + test_text
tok.fit_on_texts(texts)

In [54]:
x_train_prep = tok.texts_to_sequences(train_text)
x_test_prep = tok.texts_to_sequences(test_text)
y_train = np.asarray(y_train).astype('float32')
y_test = np.asarray(y_test).astype('float32')

## pad x_train to make sure they are all of the same length

In [55]:
def pad_seq(seq, maxlen):  
  return np.array(pad_sequences(seq, maxlen=maxlen, padding='post', truncating='post'))

In [56]:
max_sequence_len = 0

for sentence in x_train:
    max_sequence_len = max(len(sentence), max_sequence_len)
  
print(max_sequence_len)

138


In [57]:
#maxlen = 138

x_train = pad_seq(x_train_prep,max_sequence_len)
x_test = pad_seq(x_test_prep, max_sequence_len)

In [59]:
print(x_train.shape)
print(x_test.shape)

(23624, 138)
(5906, 138)


In [61]:
x_train[0]

array([ 33, 507,   1,   1,   1,   1, 812,  17,  11, 170,   1,   1,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

## simple RNN Model

In [62]:
model = tf.keras.models.Sequential([    
    tf.keras.layers.Embedding(vocab_sz, 512),
    tf.keras.layers.SimpleRNN(512),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [63]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 512)         512000    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 512)               524800    
                                                                 
 dense_2 (Dense)             (None, 512)               262656    
                                                                 
 dense_3 (Dense)             (None, 1)                 513       
                                                                 
Total params: 1,299,969
Trainable params: 1,299,969
Non-trainable params: 0
_________________________________________________________________


In [64]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [65]:
history = model.fit(x_train, y_train, epochs=5, batch_size=128,
                    validation_data=(x_test, y_test), 
                    validation_steps=30)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [66]:
test_loss, test_acc = model.evaluate(x_test, y_test)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 0.25278154015541077
Test Accuracy: 0.9317643046379089
