In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Text preprocessing
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
import re

# Building a model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt


# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Step 1: Read the full dataset CSV (1.6 million tweets) into a DataFrame
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset_filepath = "../input/sentiment140/training.1600000.processed.noemoticon.csv"

# header=None: the first row is data, so column names are supplied explicitly.
twitter_df = pd.read_csv(
    dataset_filepath,
    header=None,
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

# "target" is the first column and "text" is the last one.
labels = twitter_df["target"]
sentences = twitter_df["text"]
twitter_df.head()

In [3]:
# Step 2: Preprocessing of sentences to handle
# 1. Removing URLs
# 2. Removing Emails
# 3. Remove new lines characters
# 4. Remove distracting single quotes
# 5. Remove capitalization of words
# 6. Remove all twitter handles
# 7. Removal of stop words (currently disabled: the filtering line in depure_data is commented out)
# 8. Remove punctuation


tweet_tokenizer = TweetTokenizer(strip_handles=True)
punct_tokenizer = RegexpTokenizer(r'\w+')
detokenizer = TreebankWordDetokenizer()
stop_words = set(stopwords.words('english'))

# Compiled once instead of on every call. Raw strings keep the regex escapes
# (\S, \s) from being parsed as invalid Python string escapes, which triggers
# a warning on recent Python versions.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_PATTERN = re.compile(r'\S*@\S*\s?')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def depure_data(data):
    """Clean a single tweet so it is ready for word-level tokenization.

    The steps run in this order:
      1. strip URLs (``http(s)://...`` and ``www....``),
      2. strip every whitespace-delimited token containing ``@``
         (e-mail addresses, and therefore ``@handles`` as well),
      3. collapse every run of whitespace, including newlines, to one space,
      4. drop single quotes,
      5. lower-case the text,
      6. tokenize with ``tweet_tokenizer`` (configured with strip_handles=True),
      7. keep only ``\\w+`` runs via ``punct_tokenizer``, which discards
         punctuation.

    Stop-word removal is NOT applied: ``stop_words`` is built at module level
    but is not used by this function.

    Parameters
    ----------
    data : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet, re-joined by ``detokenizer``.
    """
    # The module-level tokenizers are only read, never rebound, so no
    # ``global`` declaration is needed.
    data = _URL_PATTERN.sub('', data)
    data = _EMAIL_PATTERN.sub('', data)
    data = _WHITESPACE_PATTERN.sub(' ', data)
    data = data.replace("'", "")
    data = data.lower()

    # Tweet-aware tokenization, then re-join into a single string.
    word_list = tweet_tokenizer.tokenize(data)
    data = detokenizer.detokenize(word_list)

    # Re-tokenize keeping only word characters, which removes punctuation.
    filtered_punct_words = punct_tokenizer.tokenize(data)
    data = detokenizer.detokenize(filtered_punct_words)

    return data


In [4]:
# Run the cleaning routine over every tweet, then preview the first few results
sentences = sentences.apply(depure_data)
sentences.head()

In [5]:
# Convert both columns out of pandas: labels to a NumPy array, tweets to a list
labels = np.array(list(labels))
sentences = list(sentences)

# Relabel every 4 as 1 to make training easier
# (boolean-mask assignment; same effect as looking the positions up with np.where)
labels[labels == 4] = 1

In [6]:
# Step 2: Load the pre-trained GloVe vectors into a {word: vector} dictionary
glove_vector_file_path = "../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt"

words = []  # not used below; kept so this cell still defines the same names
embeddings_index = {}

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so the file parses the same way on every machine.
with open(glove_vector_file_path, "r", encoding="utf-8") as GVF:
    for row in GVF:
        line = row.split()
        if not line:
            # A blank line has no token to index; skip it rather than fail on line[0].
            continue

        # Each row is "<token> <v1> <v2> ...": first field is the word,
        # the remaining fields are its embedding coefficients.
        word = line[0]
        coefs = np.asarray(line[1:], dtype="float32")

        embeddings_index[word] = coefs
        

In [7]:
# Intermediate Step 1: Hyper-parameters shared by the cells below

# Sequence preparation
max_len = 50
embedding_dim = 100
oov_tok = "<OOV>"
pad_type = "post"
trunc_type = "post"

# Training schedule
no_of_epochs = 50
train_batch_size = 1024  # 2**10 samples per batch (1,280,000 total train samples)
val_batch_size = 128  # 2**7 samples per batch (160,000 total validation samples)

In [10]:
# Step 3: Fit the word tokenizer on the tweets, then encode and pad every tweet
word_tokenizer = Tokenizer(oov_token=oov_tok)
word_tokenizer.fit_on_texts(sentences)

# Number of distinct entries in the fitted word index
print(len(word_tokenizer.word_index))

# Integer-encode each tweet, then pad/truncate all of them to max_len
total_sequences = word_tokenizer.texts_to_sequences(sentences)
data = pad_sequences(
    total_sequences,
    maxlen=max_len,
    truncating=trunc_type,
    padding=pad_type,
)

# Number of encoded tweets
print(len(data))


In [11]:
# Step 4: Shuffle the data, then split it into train / validation / test sets
train_portion = 0.8
split_seed = 42  # fixed seed so a fresh run always produces the same split

# One permutation is applied to both arrays so every tweet keeps its label.
indices = np.random.default_rng(split_seed).permutation(len(data))
data = data[indices]
labels = labels[indices]

# train_portion drives the split (it was previously defined but the 0.8 was
# hard-coded again here); the remainder is halved into validation and test.
train_size = int(train_portion * len(data))
performance_checking_size = len(data) - train_size
val_and_test_size = int(0.5 * performance_checking_size)

validation_end = train_size + val_and_test_size
test_end = validation_end + val_and_test_size

train_sentences = data[:train_size]
train_labels = labels[:train_size]

validation_sentences = data[train_size:validation_end]
validation_labels = labels[train_size:validation_end]

test_sentences = data[validation_end:test_end]
test_labels = labels[validation_end:test_end]

print(len(train_sentences))
print(len(validation_sentences))
print(len(test_sentences))

In [12]:
# Step 5: Build the weight matrix that initialises the embedding layer
# One row per tokenizer index, plus one extra row for index 0.
embedding_vocab_size = 1 + len(word_tokenizer.word_index)
embedding_matrix = np.zeros((embedding_vocab_size, embedding_dim))

for token, row_index in word_tokenizer.word_index.items():
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row_index] = pretrained_vector

In [23]:
# Step 6: Create a callback that stops training early once a desired validation accuracy is reached
class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as validation accuracy exceeds 0.82."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's validation accuracy and request a stop if it is high enough.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished (unused).
        logs : dict or None
            Metrics reported for the epoch. Nothing happens when it is missing
            or has no "val_accuracy" entry (e.g. training without validation data).
        """
        val_accuracy = (logs or {}).get("val_accuracy")
        if val_accuracy is not None and val_accuracy > 0.82:
            # Use the model Keras attaches to the callback rather than a global
            # named `model`, so this works with whichever model is being fitted
            # and regardless of the order in which the cells were run.
            self.model.stop_training = True
            print(f"\n Stopping training...desired validation accuracy of {val_accuracy * 100}% reached!")


callback = myCallback()

In [18]:
# Step 7: Define the model and compile it
#
# Architecture: frozen pre-trained embeddings -> bidirectional LSTM ->
# dense hidden layer -> 2-way class output.
model = tf.keras.Sequential([
    # Rows come from embedding_matrix and are not updated during training.
    tf.keras.layers.Embedding(
        embedding_vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
        input_length=max_len,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(40)),
    tf.keras.layers.Dense(128, activation="relu"),
    # softmax (not sigmoid): sparse_categorical_crossentropy treats the two
    # outputs as one probability distribution over mutually exclusive classes,
    # so they must sum to 1. The output shape (batch, 2) is unchanged.
    tf.keras.layers.Dense(2, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(), metrics=["accuracy"])

model.summary()

In [24]:
# Step 8: Fit the model for up to no_of_epochs epochs, validating after each one
validation_set = (validation_sentences, validation_labels)

history = model.fit(
    train_sentences,
    train_labels,
    epochs=no_of_epochs,
    batch_size=train_batch_size,
    validation_data=validation_set,
    validation_batch_size=val_batch_size,
    callbacks=[callback],
)


In [25]:
# Step 9: Measure loss and accuracy on the test split
# results[0] is the loss, results[1] the accuracy metric set at compile time.
results = model.evaluate(x=test_sentences, y=test_labels, batch_size=128)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

In [30]:
# Bonus step: Testing the model on your own data
temp_sentence = ["I hate this world so much that I want to tear it all apart!",  # expected class 0
                 "What a marvelous exhibit this is!",  # expected class 1
                 "You broke my trust!",  # expected class 0
                 "you are looking interestingly bad"]  # expected class 0

# Apply the same cleaning used on the training tweets; otherwise the tokenizer
# sees text in a different form (e.g. with apostrophes) than it was fitted on.
temp_cleaned = [depure_data(sentence) for sentence in temp_sentence]

temp_sequence = word_tokenizer.texts_to_sequences(temp_cleaned)
temp_data = pad_sequences(temp_sequence, padding=pad_type, truncating=trunc_type, maxlen=max_len)

# One row per sentence, one column per class.
model.predict(temp_data)

In [28]:
# Step 10: Plot the loss and accuracy curves to visualize how well the model performs
val_loss = history.history["val_loss"]
loss = history.history["loss"]
val_accuracy = history.history["val_accuracy"]
accuracy = history.history["accuracy"]

# One point per completed epoch.
x = np.arange(len(val_loss))

fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Every line is labelled so legend() has entries to show; an unlabelled plot
# yields an empty legend and a warning.
loss_ax.plot(x, loss, label="Training loss")
loss_ax.plot(x, val_loss, label="Validation loss")
loss_ax.set_title("Loss per epoch")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()

accuracy_ax.plot(x, accuracy, label="Training accuracy")
accuracy_ax.plot(x, val_accuracy, label="Validation accuracy")
accuracy_ax.set_title("Accuracy per epoch")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Accuracy")
accuracy_ax.legend()

plt.show()

In [29]:
# Final step: Write the trained model to an HDF5 file in the working directory
saved_model_path = "./bidirectional_LSTM_layer_model_finalized.h5"
model.save(saved_model_path)