In [1]:
import tensorflow as tf
import numpy as np 
import random 
random.seed(0)
tf.random.set_seed(0)
np.random.seed(0)
tf.keras.utils.set_random_seed(0)

In [51]:
with open('./Data/neg_u_13k.csv') as f:
    neg_lines = f.readlines()
with open('./Data/neu_u_13k.csv') as f:
    neu_lines = f.readlines()
with open('./Data/pos_u_13k.csv') as f:
    pos_lines = f.readlines()

neg_tweets = [ i[2:-2] for i in neg_lines ]
neu_tweets = [ i[2:-2] for i in neu_lines ]
pos_tweets = [ i[2:-2] for i in pos_lines ]

tweets = neg_tweets + neu_tweets + pos_tweets

# stemming
import nltk
from nltk.stem import PorterStemmer
nltk.download('wordnet')
stemmer = PorterStemmer()

for index, tweet in enumerate(tweets):
    
    tokens = tweet.split()

    stemmed_tweet_tokens = [stemmer.stem(token) for token in tokens]

    tweets[index] = ' '.join([ i for i in stemmed_tweet_tokens ]) #if len(i) > 1 and not i.startswith('http') and not i.startswith('@') ])
    

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vahid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
data = tf.convert_to_tensor(tweets)

labels = tf.convert_to_tensor(np.concatenate([np.zeros(len(neg_tweets)) - 1,
                                              np.zeros(len(neu_tweets)) ,
                                              np.zeros(len(pos_tweets)) + 1,
                                             ]) )
labels = tf.keras.utils.to_categorical(labels, 3)  # one-hot encoding the labels

dataset = tf.data.Dataset.from_tensor_slices((data, labels))
dataset = dataset.shuffle(len(dataset), seed=0).batch(32)

# Define the split ratios
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15 

# Split the dataset
train_size = int(train_ratio * len(dataset))
val_size = int(val_ratio * len(dataset))
test_size = int(test_ratio * len(dataset))

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size).take(val_size)
test_dataset = dataset.skip(train_size + val_size).take(test_size)

### Using RNN -> 93% Validation Accuracy

In [50]:
vocab_size = 3000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_dataset.map(lambda tweets, labels: tweets))
# text_vec_layer.get_vocabulary()

embed_size = 128
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(3, activation="softmax")
])
model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

# Custom callback to display accuracy during training
class AccuracyCallback(tf.keras.callbacks.Callback):
    def on_batch_end(self, batch, logs=None):
        print(f"batch: {batch}, Accuracy: {logs['accuracy']*100}", end='\r')
    def on_epoch_end(self, epoch, logs=None):
        print(f"epoch: {epoch + 1} \t Training Accuracy (last-batch): {round(logs['accuracy']*100, 2)} \t Validation Accuracy (epoch): {round(logs['val_accuracy']*100, 2)}")

history = model.fit(train_dataset, validation_data=val_dataset, epochs=35, verbose=0, callbacks=[AccuracyCallback()])  

epoch: 1 	 Training Accuracy (last-batch): 91.83 	 Validation Accuracy (epoch): 93.6
epoch: 2 	 Training Accuracy (last-batch): 92.19 	 Validation Accuracy (epoch): 93.7
epoch: 3 	 Training Accuracy (last-batch): 92.03 	 Validation Accuracy (epoch): 93.36
epoch: 4 	 Training Accuracy (last-batch): 91.94 	 Validation Accuracy (epoch): 93.99
epoch: 5 	 Training Accuracy (last-batch): 92.26 	 Validation Accuracy (epoch): 93.41


### Using a pretrained language model  -> 95% Validation Accuracy

In [None]:
import os
import tensorflow_hub as hub
os.environ["TFHUB_CACHE_DIR"] = "./tfhub_cache"
model = tf.keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
        trainable=True, dtype=tf.string, input_shape=[]),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(3, activation="softmax")
])
model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
history = model.fit(train_dataset, validation_data=val_dataset, epochs=5, verbose=0, callbacks=[AccuracyCallback()])  