# TODO
- [ ] Try other tokenizers [The TensorFlow Text library also implements various tokenization strategies, including WordPiece8 (a variant of BPE) https://medium.com/tensorflow/introducing-tf-text-438c8552bd5e]
- [ ] Handle Emoticons
- [ ] Handle Emojis
'🙂'.encode()
b'\xf0\x9f\x98\x82\xf0\x9f\x98\x82\xf0\x9f\x98\x82\xf0\x9f\x98\x82'.decode()

In [1]:
import tensorflow as tf
import numpy as np 
import random 
random.seed(0)
tf.random.set_seed(0)
np.random.seed(0)
tf.keras.utils.set_random_seed(0)

In [36]:
with open('./Data/neg_u_13k.csv') as f:
    neg_lines = f.readlines()
with open('./Data/neu_u_13k.csv') as f:
    neu_lines = f.readlines()
with open('./Data/pos_u_13k.csv') as f:
    pos_lines = f.readlines()

neg_tweets = [ i[2:-2] for i in neg_lines ]
neu_tweets = [ i[2:-2] for i in neu_lines ]
pos_tweets = [ i[2:-2] for i in pos_lines ]

tweets = neg_tweets + neu_tweets + pos_tweets

emoji_list1 = [':)', ':-)', ':p', ':P', ':-p', ':-P', ':b', ':D', ":')", ":'-)", ':3', '<3', ':]', '=)', 'xD', 'XD', '8D', '=D', ':*', ':-*', ':x', ';)', ';-)', '^_^']
emoji_list2 = [':(', ':-(', ":'(",':O', ':o', ':c', ':C', "D-':",'D:<','D:','D8','D;','D=','DX', ':\\', '=/', '=\\', ':X', ':$']

# cleanup and transformation
import emoji 

import nltk
from nltk.stem import PorterStemmer
nltk.download('wordnet')
stemmer = PorterStemmer()

def handle_emoticon_and_emoji(tweet):

    # converting emoticons to happy/sad
    for i in emoji_list1:
      tweet = tweet.replace(i, ' happy ')
    for i in emoji_list2:
      tweet = tweet.replace(i, ' sad ')

    # converting emojis to text and sepatating them
    tokens = tweet.split()
    emoji_tokens = []
    for index, token in enumerate(tokens):
      if token.startswith('\\'):
        try:
          transformed_emoji = emoji.demojize(token.encode().decode('unicode_escape').encode('latin').decode()).replace('::', ': :')
          tokens[index] = transformed_emoji
        except:
          pass

    tokens.extend(emoji_tokens)
    return tokens


for index, tweet in enumerate(tweets):
    
    tokens = tweet.split()

    stemmed_tweet = " ".join([stemmer.stem(token) for token in tokens])

    emoji_transformed_tweet_tokens = handle_emoticon_and_emoji(stemmed_tweet)

    tweets[index] = ' '.join([ i for i in emoji_transformed_tweet_tokens if len(i) > 1 and not i.startswith('http') and not i.startswith('@') ])
    

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vahid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
data = tf.convert_to_tensor(tweets)

labels = tf.convert_to_tensor(np.concatenate([np.zeros(len(neg_tweets)) - 1,
                                              np.zeros(len(neu_tweets)) ,
                                              np.zeros(len(pos_tweets)) + 1,
                                             ]) )
labels = tf.keras.utils.to_categorical(labels, 3)  # one-hot encoding the labels

dataset = tf.data.Dataset.from_tensor_slices((data, labels))
dataset = dataset.shuffle(len(dataset), seed=0).batch(32)

# Define the split ratios
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15 

# Split the dataset
train_size = int(train_ratio * len(dataset))
val_size = int(val_ratio * len(dataset))
test_size = int(test_ratio * len(dataset))

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size).take(val_size)
test_dataset = dataset.skip(train_size + val_size).take(test_size)

### Using RNN -> 93% Validation Accuracy

In [42]:
# vocab_size: 1000 -> in 40 epochs would reach 91% val accuracy
# vocab_size: 2000 -> in 40 epochs would reach 92% val accuracy
# vocab_size: 3000 -> in 40 epochs would reach 93% val accuracy

vocab_size = 3000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_dataset.map(lambda tweets, labels: tweets))
# text_vec_layer.get_vocabulary()


embed_size = 128
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(3, activation="softmax")
])
model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

# Custom callback to display accuracy during training
class AccuracyCallback(tf.keras.callbacks.Callback):
    def on_batch_end(self, batch, logs=None):
        print(f"batch: {batch}, Accuracy: {logs['accuracy']*100}", end='\r')
    def on_epoch_end(self, epoch, logs=None):
        print(f"epoch: {epoch + 1}, Training Accuracy (last-batch): {round(logs['accuracy']*100, 2)}, Validation Accuracy (epoch): {round(logs['val_accuracy']*100, 2)}")

history = model.fit(train_dataset, validation_data=val_dataset, epochs=35, verbose=0, callbacks=[AccuracyCallback()])  

epoch: 1, Training Accuracy (last-batch): 57.74, Validation Accuracy (epoch): 65.32
epoch: 2, Training Accuracy (last-batch): 64.14, Validation Accuracy (epoch): 67.99
epoch: 3, Training Accuracy (last-batch): 66.11, Validation Accuracy (epoch): 69.62
epoch: 4, Training Accuracy (last-batch): 67.87, Validation Accuracy (epoch): 71.14
epoch: 5, Training Accuracy (last-batch): 68.87, Validation Accuracy (epoch): 72.27
epoch: 6, Training Accuracy (last-batch): 70.35, Validation Accuracy (epoch): 73.79
epoch: 7, Training Accuracy (last-batch): 71.37, Validation Accuracy (epoch): 76.02
epoch: 8, Training Accuracy (last-batch): 72.7, Validation Accuracy (epoch): 76.6
epoch: 9, Training Accuracy (last-batch): 74.41, Validation Accuracy (epoch): 79.13
epoch: 10, Training Accuracy (last-batch): 75.59, Validation Accuracy (epoch): 79.25
epoch: 11, Training Accuracy (last-batch): 76.98, Validation Accuracy (epoch): 79.74
epoch: 12, Training Accuracy (last-batch): 78.2, Validation Accuracy (epoch)

In [43]:
history = model.fit(train_dataset, validation_data=val_dataset, epochs=15, verbose=0, callbacks=[AccuracyCallback()])  

epoch: 1, Training Accuracy (last-batch): 91.74, Validation Accuracy (epoch): 93.41
epoch: 2, Training Accuracy (last-batch): 91.84, Validation Accuracy (epoch): 93.5
epoch: 3, Training Accuracy (last-batch): 91.91, Validation Accuracy (epoch): 93.7
epoch: 4, Training Accuracy (last-batch): 92.01, Validation Accuracy (epoch): 93.65
epoch: 5, Training Accuracy (last-batch): 92.17, Validation Accuracy (epoch): 93.72
epoch: 6, Training Accuracy (last-batch): 92.32, Validation Accuracy (epoch): 93.67
epoch: 7, Training Accuracy (last-batch): 92.39, Validation Accuracy (epoch): 93.7
epoch: 8, Training Accuracy (last-batch): 92.49, Validation Accuracy (epoch): 94.12
epoch: 9, Training Accuracy (last-batch): 92.48, Validation Accuracy (epoch): 93.58
epoch: 10, Training Accuracy (last-batch): 92.44, Validation Accuracy (epoch): 93.97
epoch: 11, Training Accuracy (last-batch): 92.56, Validation Accuracy (epoch): 93.78
epoch: 12, Training Accuracy (last-batch): 92.56, Validation Accuracy (epoch)

### Using a pretrained language model  -> 96% Validation Accuracy

In [8]:
import os
import tensorflow_hub as hub
os.environ["TFHUB_CACHE_DIR"] = "./tfhub_cache"
model = tf.keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
        trainable=True, dtype=tf.string, input_shape=[]),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(3, activation="softmax")
])
model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
history = model.fit(train_dataset, validation_data=val_dataset, epochs=5, verbose=0, callbacks=[AccuracyCallback()])  

batch: 55, Accuracy: 53.069198131561286

KeyboardInterrupt: 