In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [57]:
import torch
from tqdm.notebook import tqdm

In [58]:
# import os

# os.environ['KAGGLE_USERNAME'] = ""
# os.environ['KAGGLE_KEY'] = ""

In [59]:
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

twitter-entity-sentiment-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)


In [60]:
!unzip /content/twitter-entity-sentiment-analysis.zip -d /content/

Archive:  /content/twitter-entity-sentiment-analysis.zip
replace /content/twitter_training.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [61]:
from google.colab import drive
drive.mount('content/')

Drive already mounted at content/; to attempt to forcibly remount, call drive.mount("content/", force_remount=True).


# Dataset

In [62]:
!pip install transformers



In [63]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf


In [64]:
TRAIN_PATH = '/content/twitter_training.csv'
TEST_PATH = '/content/twitter_validation.csv'

In [102]:
col_names = ['sn', 'org', 'sentiment', 'text']
train_df = pd.read_csv(TRAIN_PATH, header=None, names=col_names)
test_df = pd.read_csv(TEST_PATH, header=None, names=col_names)

In [66]:
train_df.tail()

Unnamed: 0,sn,org,sentiment,text
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [67]:
test_df.head()

Unnamed: 0,sn,org,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [68]:
print(train_df.shape)
print(test_df.shape)

(74682, 4)
(1000, 4)


In [69]:
# train_df['sentiment'].value_counts()

## label counts
-   -1    22542
-   1    20832
-   0    18318

In [103]:
label_mapping = {'Positive': 2, 'Neutral': 1, 'Negative': 0}

def rm_cols(df):
  df = df.dropna(subset=['text'])
  df = df.drop(['sn', 'org'], axis=1)
  df = df[~df['sentiment'].isin(['Irrelevant'])]
  df['sentiment']= df['sentiment'].map(label_mapping)
  df['text'] = df['text'].astype(str)
  return df

In [104]:
train_df = rm_cols(train_df)

In [72]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61121 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  61121 non-null  int64 
 1   text       61121 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


In [73]:
test_df = rm_cols(test_df)

In [74]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 828 entries, 1 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  828 non-null    int64 
 1   text       828 non-null    object
dtypes: int64(1), object(1)
memory usage: 19.4+ KB


In [75]:
# import transformers

# seq_len = 256
# batch_size = 16
# num_samples = len(train_df)
# model_name = 'cardiffnlp/twitter-roberta-base-sentiment'

# tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding=True, return_tensor='np')

# tokens_train = tokenizer(
#     train_df['text'].tolist(),
#     max_length=seq_len,
#     truncation=True,
#     padding='max_length',
#     add_special_tokens=True,
#     return_tensors='np'
# )



In [76]:
# num_samples_test = len(test_df)
# model_name = 'cardiffnlp/twitter-roberta-base-sentiment'

# test_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding=True, return_tensor='np')

# tokens_test = tokenizer(
#     test_df['text'].tolist(),
#     max_length=seq_len,
#     truncation=True,
#     padding='max_length',
#     add_special_tokens=True,
#     return_tensors='np'
# )

In [94]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Download the pre-trained GloVe embeddings
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the pre-trained GloVe embeddings

# get the golve file from here
# https://drive.google.com/file/d/1Y61lrN9_6WQ_8AkPAq3IHs3I4ITMxgMh/view?usp=sharing
glove_file = '/content/content/MyDrive/nlp/glove/glove.6B.100d.txt'

def load_glove_embeddings(glove_file):
    word_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_vectors[word] = vector
    return word_vectors

word_vectors = load_glove_embeddings(glove_file)
# Preprocess and tokenize the text
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [105]:
import pandas as pd

# Assuming you have a DataFrame train_df with 'text' column

def convert_to_word_embeddings(text):
    tokens = preprocess_text(text)
    embeddings = []
    for token in tokens:
        if token in word_vectors:
            embeddings.append(word_vectors[token])
    return np.array(embeddings)

# train_df['word_embeddings'] = train_df['text'].apply(convert_to_word_embeddings)

In [79]:
convert_to_word_embeddings(["hello world", "I am the king", "[]"]).shape

(3,)

In [106]:
y_train = train_df.sentiment.values
labels_train = np.zeros((y_train.size, 3))
labels_train[np.arange(y_train.size), y_train] = 1
labels_train

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [81]:
y_train

array([2, 2, 2, ..., 2, 2, 2])

In [82]:
# labels_train.shape

In [83]:
y_test = test_df.sentiment.values
labels_test = np.zeros((y_test.size, 3))
labels_test[np.arange(y_test.size), y_test] = 1
labels_test

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [107]:
train_df['text'] = train_df['text'].apply(convert_to_word_embeddings)

# Using pretrained model

In [85]:
# from transformers import pipeline

# sent_pipeline = pipeline("sentiment-analysis")

In [86]:
# sent_pipeline('I love you')

In [108]:
import gc

gc.collect()

50183

# Building a transformer

In [109]:
import tensorflow as tf
import numpy as np

NUM_CLASSES = 3
VOCAB_SIZE = 30000


class Transformer(tf.keras.Model):
    def __init__(self, num_layers, units, d_model, num_heads, dropout, name="transformer"):
        super(Transformer, self).__init__(name=name)
        self.encoder = Encoder(num_layers, units, d_model, num_heads, dropout)
        self.dense = tf.keras.layers.Dense(units)
        self.final_dense = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')

    def call(self, inputs, training=True, mask=None):
        x = self.encoder(inputs, training=training, mask=mask)
        x = tf.reduce_mean(x, axis=1)  # Average pooling over the time dimension
        x = self.dense(x)
        x = tf.nn.relu(x)
        x = self.final_dense(x)
        return x


class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, units, d_model, num_heads, dropout, name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.num_layers = num_layers
        self.units = units
        self.d_model = d_model

        self.embedding = PositionalEmbedding(VOCAB_SIZE, d_model)

        self.enc_layers = [
            EncoderLayer(units, d_model, num_heads, dropout)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, x, training=True, mask=None):
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)

        return x


class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, units, d_model, num_heads, dropout, name="encoder_layer"):
        super(EncoderLayer, self).__init__(name=name)
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads, d_model)
        self.ffn = self.point_wise_feed_forward_network(units, d_model)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, x, training=True, mask=None):
        attn_output = self.mha(x, x, x, mask, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

    def point_wise_feed_forward_network(self, units, d_model):
        return tf.keras.Sequential(
            [
                tf.keras.layers.Dense(units, activation="relu"),
                tf.keras.layers.Dense(d_model),
            ]
        )


class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super(PositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = self.positional_encoding()

    def positional_encoding(self):
        depth = self.d_model // 2
        length = 2048
        positions = np.arange(length)[:, np.newaxis]  # (seq, 1)
        depths = np.arange(depth)[np.newaxis, :] / depth  # (1, depth)

        angle_rates = 1 / (10000 ** depths)  # (1, depth)
        angle_rads = positions * angle_rates  # (pos, depth)

        pos_encoding = np.concatenate(
            [np.sin(angle_rads), np.cos(angle_rads)],
            axis=-1)

        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x


In [110]:
save_path = '/content/content/MyDrive'

num_layers = 6
units = 64
d_model = 100
num_heads = 5
dropout = 0.2
num_epochs = 2
batch_size = 32

model = Transformer(num_layers, units, d_model, num_heads, dropout)
input_seq = tf.keras.Input(shape=(None,))
# mask = tf.keras.Input(shape=(None, None))
# output = model(inputs=input_seq, mask=mask)
output = model(inputs=input_seq)
transformer_model = tf.keras.Model(inputs=input_seq, outputs=output)
# transformer_model = tf.keras.Model(inputs=input_seq, outputs=output)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = f"{save_path}/sentiment_checkpoint.h5",
    save_best_only = True,
    monitor = 'val_loss',
    mode = 'min',
    verbose = 1
)

transformer_model.compile(optimizer=tf.keras.optimizers.Adam(),
                          loss=tf.keras.losses.CategoricalCrossentropy(),
                          metrics=['accuracy'])

# transformer_model.fit(
#     [tokens_train['input_ids'], tokens_train['attention_mask']],
#     labels_train,
#     epochs=10,
#     batch_size=128,
#     validation_data=([tokens_test['input_ids'], tokens_test['attention_mask']], labels_test),
#     callbacks = [checkpoint_callback],
#     verbose = 1
#     )


# transformer_model.fit(
#     tokens_train['input_ids'],
#     labels_train,
#     epochs=10,
#     batch_size=128,
#     validation_data=(tokens_test['input_ids'], labels_test),
#     callbacks = [checkpoint_callback],
#     verbose = 1
#     )

# model.save(f"{save_path}/transformer_sentiment_analysis.h5")
# Convert the text column to a list of strings

# Convert the word embeddings to a NumPy array
# embeddings = np.array(train_df['word_embeddings'].tolist())

# Determine the maximum sequence length
# max_seq_length = max(len(seq) for seq in embeddings)

# # Pad the sequences to the maximum length
# padded_embeddings = tf.keras.preprocessing.sequence.pad_sequences(embeddings, maxlen=max_seq_length, padding='post')

# Fit the model using the padded embeddings and labels
transformer_model.fit(tf.convert_to_tensor(train_df['text']), tf.convert_to_tensor(labels_train), epochs=num_epochs, batch_size=batch_size)


model.save(f"{save_path}/transformer_sentiment_analysis.h5")

ValueError: ignored

## Preditions

In [None]:
# custom_objects = {'Transformer': Transformer}

# model = tf.keras.models.load_model("/content/content/MyDrive/sentiment_checkpoint.h5",
#                                    custom_objects=custom_objects)


In [None]:
# import transformers

# def prepare_input_data(text):

#   seq_len = 256
#   model_name = 'cardiffnlp/twitter-roberta-base-sentiment'

#   tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding=True, return_tensor='np')

#   tokens = tokenizer(
#       text,
#       max_length=seq_len,
#       truncation=True,
#       padding='max_length',
#       add_special_tokens=True,
#       return_tensors='np'
#   )
#   return [tokens['input_ids'], tokens['attention_mask']]
#   #TODO

In [None]:
# sentiment_mapping = ['Negaitve', 'Neutral', 'Positive']
# text_data = ["This is a positive review.", "I did not like this product.", "The movie was okay.", "I love you"]
# input_data = prepare_input_data(text_data)

# predictions = transformer_model.predict(input_data)

# predicted_labels = tf.argmax(predictions, axis=1)

# for i, text in enumerate(text_data):
#     print(predicted_labels)
#     print(f"Text: {text}")
#     print(f"Predicted Label: {predicted_labels[i]}")
#     print(f"Sentiment: {sentiment_mapping[predicted_labels[i]]}\n")


In [None]:
# import sklearn
# from sklearn.metrics import accuracy_score
# y_pred = model.predict([tokens_test['input_ids'], tokens_test['attention_mask']])
# y_pred = tf.argmax(y_pred)
# print(accuracy_score(y_pred, y_test))
