In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
#import emoji
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras


#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

#set seed for reproducibility
seed=42

#set style for plots
sns.set_style("whitegrid")
sns.despine()
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

  plt.style.use("seaborn-whitegrid")


<Figure size 640x480 with 0 Axes>

In [None]:
df = pd.read_csv('Twitter_Data.csv',encoding='ISO-8859-1')

In [None]:
df.head()

Unnamed: 0,tweets,sentiments
0,when modi promised âminimum government maxim...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tweets      134280 non-null  object
 1   sentiments  134282 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [None]:
df

Unnamed: 0,tweets,sentiments
0,when modi promised âminimum government maxim...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive
...,...,...
162975,why these 456 crores paid neerav modi not reco...,Negative
162976,dear rss terrorist payal gawar what about modi...,Negative
162977,did you cover her interaction forum where she ...,Neutral
162978,there big project came into india modi dream p...,Neutral


In [None]:
df = df[['tweets','sentiments']]

In [None]:
##CUSTOM DEFINED FUNCTIONS TO CLEAN THE TWEETS

#Clean emojis from text
def strip_emoji(text):
    return re.sub(emoji.get_emoji_regexp(), r"", text) #remove emoji

#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text):
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)

In [None]:
'''text_len = []
for text in df.tweets:
    tweet_len = len(text.split())
    text_len.append(tweet_len)'''
text_len = []
for text in df.tweets:
    if isinstance(text, str):
        tweet_len = len(text.split())
        text_len.append(tweet_len)
    else:
        text_len.append(0)  # Or any other appropriate value for missing text

In [None]:
df['text_len'] = text_len

In [None]:
print(f" DF SHAPE: {df.shape}")

 DF SHAPE: (162980, 3)


In [None]:
df

Unnamed: 0,tweets,sentiments,text_len
0,when modi promised âminimum government maxim...,Negative,33
1,talk all the nonsense and continue all the dra...,Neutral,13
2,what did just say vote for modi welcome bjp t...,Positive,22
3,asking his supporters prefix chowkidar their n...,Positive,34
4,answer who among these the most powerful world...,Positive,14
...,...,...,...
162975,why these 456 crores paid neerav modi not reco...,Negative,18
162976,dear rss terrorist payal gawar what about modi...,Negative,36
162977,did you cover her interaction forum where she ...,Neutral,9
162978,there big project came into india modi dream p...,Neutral,13


In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
df = df.iloc[12:]
df.head()

Unnamed: 0,tweets,sentiments,text_len
12,calm waters wheres the modi wave,Positive,6
13,one vote can make all the difference anil kapo...,Neutral,21
14,one vote can make all the difference anil kapo...,Neutral,19
15,vote such party and leadershipwho can take fas...,Negative,20
16,vote modi who has not created jobs,Neutral,7


In [None]:
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
df

Unnamed: 0,tweets,sentiments,text_len
0,income guarantee scheme will remonetise what d...,Neutral,7
1,this the unfortunate paradox modi rule their s...,Negative,37
2,,,0
3,and yes supply your proof âPositiveNeutralNe...,Negative,43
4,cancel bullet train lakh saved\nmodi global fa...,Negative,38
...,...,...,...
162963,only nehru scientists modi thank nehru that yo...,Positive,16
162964,live nirav modi threatened kill witness says p...,Positive,8
162965,mrroshan mrmodi will not stop with abhinandanh...,Neutral,14
162966,modi our not loudmouth and liar please sanitiz...,Neutral,10


In [None]:
df.drop(columns=['text_len'],inplace=True)

In [None]:
df

Unnamed: 0,tweets,sentiments
0,income guarantee scheme will remonetise what d...,Neutral
1,this the unfortunate paradox modi rule their s...,Negative
2,,
3,and yes supply your proof âPositiveNeutralNe...,Negative
4,cancel bullet train lakh saved\nmodi global fa...,Negative
...,...,...
162963,only nehru scientists modi thank nehru that yo...,Positive
162964,live nirav modi threatened kill witness says p...,Positive
162965,mrroshan mrmodi will not stop with abhinandanh...,Neutral
162966,modi our not loudmouth and liar please sanitiz...,Neutral


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162968 entries, 0 to 162967
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tweets      134268 non-null  object
 1   sentiments  134270 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [None]:
missing_values = df.isna().sum()
missing_values

tweets        28700
sentiments    28698
dtype: int64

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# removing duplicates

#df.drop_duplicates(inplace = True)

# Remove the duplicate rows
#df = df.drop_duplicates()

df['tweets'] = df['tweets'].astype(str)

from nltk.corpus import stopwords

stopw = set(stopwords.words("english"))

print(stopw)
df["tweets"] = df['tweets'].apply(lambda x: ' '.join(
    [word.lower() for word in x.split() if word.lower() not in stopw]))


def convert_list_to_str(l):
    st = ""
    for i in l:
        st = st+i+" "
    st = st[:-1]
    return st

from nltk.tokenize import RegexpTokenizer


tokenizer = RegexpTokenizer(r'\w+|\d+')
df["tweets_new"] = df["tweets"].apply(tokenizer.tokenize)
df["tweets_new"] = df["tweets_new"].apply(convert_list_to_str)


df

{'a', 'before', "you're", 'yours', 'they', 'll', "hadn't", 'if', 'because', 'were', 'as', 'no', 'i', 'its', 'the', 'those', "shouldn't", 'being', "haven't", 'under', 'yourselves', 'are', 'didn', 'off', 'aren', 'can', 'these', 'there', 'while', "mustn't", 'was', 'doing', 'isn', 'between', 'after', "didn't", 'just', 'any', 'itself', 'on', 'will', 'did', 'her', 'of', 'she', 'it', 'own', "aren't", 'when', 'in', 'once', "wasn't", 'is', 'ma', 'been', 'against', 'over', 'out', 'myself', 'some', 'his', 'himself', 'about', 'only', 'by', 'mightn', "weren't", 'that', "hasn't", 'wouldn', 'each', "isn't", 'them', "it's", 'too', 'should', 'again', 'same', 'haven', 'than', 'don', 'through', 'how', 'you', 'needn', 're', "won't", 'why', "should've", 'won', 'nor', 'both', 'so', 'what', 'not', 'whom', 'o', 'all', 'below', 'for', 'down', 'more', "she's", 'we', 'had', 'mustn', 's', 'where', 'who', 'here', 'to', 'my', 'up', 'few', 'him', 'themselves', 'your', 'am', 'shan', 'such', 've', 'ours', 'doesn', 'ai

Unnamed: 0,tweets,sentiments,tweets_new
0,income guarantee scheme remonetise demonetised,Neutral,income guarantee scheme remonetise demonetised
1,unfortunate paradox modi rule social media mac...,Negative,unfortunate paradox modi rule social media mac...
2,,,
3,yes supply proof âpositiveneutralneutrals ai...,Negative,yes supply proof â positiveneutralneutrals air...
4,cancel bullet train lakh saved modi global fan...,Negative,cancel bullet train lakh saved modi global fan...
...,...,...,...
162963,nehru scientists modi thank nehru born living ...,Positive,nehru scientists modi thank nehru born living ...
162964,live nirav modi threatened kill witness says p...,Positive,live nirav modi threatened kill witness says p...
162965,mrroshan mrmodi stop abhinandanhe brings back ...,Neutral,mrroshan mrmodi stop abhinandanhe brings back ...
162966,modi loudmouth liar please sanitize reportings,Neutral,modi loudmouth liar please sanitize reportings


In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if no mapping found


def lemmatize_sentence(sentence):
    # tokenizer = RegexpTokenizer(r'\w+')
    # words = word_tokenize(sentence)
    words = sentence.split()
    pos_tags = nltk.pos_tag(words)
    lemmatizer = WordNetLemmatizer()

    lemmatized_words = [lemmatizer.lemmatize(
        word, get_wordnet_pos(pos_tag)) for word, pos_tag in pos_tags]
    return ' '.join(lemmatized_words)


def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

df["tweets_new"] = df["tweets_new"].apply(lemmatize_sentence)
print(df)

df["tweets"] = df["tweets_new"]

# df["tweets"] = df["tweets_new"].apply(convert_list_to_str)
df = df.drop(columns=["tweets_new"])
# print(df)

#fill 0 in mum
df['tweets'] = df['tweets'].fillna(0)

# Save the dataframe
df.to_csv("tweet_clean.csv", index=False)
df


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                                   tweets sentiments  \
0          income guarantee scheme remonetise demonetised    Neutral   
1       unfortunate paradox modi rule social media mac...   Negative   
2                                                     nan        NaN   
3       yes supply proof âpositiveneutralneutrals ai...   Negative   
4       cancel bullet train lakh saved modi global fan...   Negative   
...                                                   ...        ...   
162963  nehru scientists modi thank nehru born living ...   Positive   
162964  live nirav modi threatened kill witness says p...   Positive   
162965  mrroshan mrmodi stop abhinandanhe brings back ...    Neutral   
162966     modi loudmouth liar please sanitize reportings    Neutral   
162967  âmodi usual likes take credit everything â...   Negative   

                                               tweets_new  
0           income guarantee scheme remonetise demonetise  
1       unfortu

Unnamed: 0,tweets,sentiments
0,income guarantee scheme remonetise demonetise,Neutral
1,unfortunate paradox modi rule social medium ma...,Negative
2,,
3,yes supply proof â positiveneutralneutrals air...,Negative
4,cancel bullet train lakh save modi global fanc...,Negative
...,...,...
162963,nehru scientist modi thank nehru bear living h...,Positive
162964,live nirav modi threaten kill witness say pros...,Positive
162965,mrroshan mrmodi stop abhinandanhe bring back e...,Neutral
162966,modi loudmouth liar please sanitize reporting,Neutral


In [None]:
df.duplicated().sum()

31836

In [None]:
!pip install transformers



In [None]:
!pip install tensorflow



In [None]:
import re
import numpy as np
# import emoji as emoji
import string
import pandas as pd
from transformers import BertTokenizer

In [None]:
def data_process(data, labels):
    input_ids = []
    attention_masks = []
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    for sentence in data:
        bert_inp = bert_tokenizer.__call__(sentence, max_length=36,
                                           padding='max_length', pad_to_max_length=True,
                                           truncation=True, return_token_type_ids=False)

        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])
    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)
    labels = np.array(labels)
    return input_ids, attention_masks, labels

In [None]:
df_copy = df.copy()
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

# Every single column with categorical values will be converted.
object_cols = ['sentiments']
df_copy[object_cols] = df_copy[object_cols].astype(str)

df_copy[object_cols] = ordinal_encoder.fit_transform(df_copy[object_cols])

df_copy.head()

ordinal_encoder = OrdinalEncoder()

# Every single column with categorical values will be converted.
object_cols = ['tweets', 'sentiments']
df[object_cols] = df[object_cols].astype(str)

df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])

df.head()

Unnamed: 0,tweets,sentiments
0,44304.0,1.0
1,119154.0,0.0
2,79087.0,3.0
3,128762.0,0.0
4,13896.0,0.0


In [None]:
df_copy.head()

Unnamed: 0,tweets,sentiments
0,income guarantee scheme remonetise demonetise,1.0
1,unfortunate paradox modi rule social medium ma...,0.0
2,,3.0
3,yes supply proof â positiveneutralneutrals air...,0.0
4,cancel bullet train lakh save modi global fanc...,0.0


In [None]:
df_copy["tweets"] = df_copy["tweets"].astype(str)
input_ids, attention_masks, labels = data_process(df_copy["tweets"], df_copy["sentiments"])


input_ids.shape

(162968, 36)

In [None]:
import gc

import torch
from torch import nn
from transformers import BertModel


class BERT_CNN(nn.Module):

    def __init__(self, num_classes):
        super(BERT_CNN, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.conv = nn.Conv2d(in_channels=1, out_channels=13, kernel_size=(3, 768), padding=(1, 0))
        self.bn = nn.BatchNorm2d(13)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=(3, 1), stride=1)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(442, num_classes)
        self.flat = nn.Flatten()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        cls_hs = self.bert(input_ids=sent_id, attention_mask=mask, return_dict=False, output_hidden_states=True)
        x = cls_hs[0].unsqueeze(1)
        x = self.conv(x)
        x = self.bn(x)
        del cls_hs
        gc.collect()
        torch.cuda.empty_cache()
        x = self.relu(x)
        x = self.pool(x)
        x = self.flat(x)
        x = self.fc(x)
        return self.softmax(x)

In [None]:
input_ids.shape

(162968, 36)

In [None]:
dataset = pd.DataFrame({'input_ids': list(input_ids), 'attention_masks': list(attention_masks)}, columns=['input_ids', 'attention_masks'])
dataset

Unnamed: 0,input_ids,attention_masks
0,"[101, 3318, 11302, 5679, 2128, 8202, 20624, 33...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
1,"[101, 15140, 20506, 16913, 2072, 3627, 2591, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[101, 16660, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[101, 2748, 4425, 6947, 1037, 3893, 2638, 4904...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[101, 17542, 7960, 3345, 2474, 10023, 3828, 16...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...
162963,"[101, 23556, 7155, 16913, 2072, 4067, 23556, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
162964,"[101, 2444, 9152, 2527, 2615, 16913, 2072, 156...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
162965,"[101, 2720, 7352, 4819, 2720, 5302, 4305, 2644...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
162966,"[101, 16913, 2072, 5189, 14359, 16374, 3531, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."


In [None]:
dataset.columns

Index(['input_ids', 'attention_masks'], dtype='object')

In [None]:
labels = labels.astype(np.int64)
labels

array([1, 0, 3, ..., 1, 1, 0])

In [None]:
(labels.astype(np.int64)).dtype

dtype('int64')

In [None]:
df

Unnamed: 0,tweets,sentiments
0,44304.0,1.0
1,119154.0,0.0
2,79087.0,3.0
3,128762.0,0.0
4,13896.0,0.0
...,...,...
162963,81651.0,2.0
162964,54325.0,2.0
162965,78042.0,1.0
162966,69392.0,1.0


In [None]:
import gc
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoModel
import pandas as pd

train_text, temp_text, train_labels, temp_labels = train_test_split(dataset, labels,
                             random_state=2018, test_size=0.2, stratify=labels)

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                         random_state=2018, test_size=0.5, stratify=temp_labels)

del temp_text
gc.collect()
torch.cuda.empty_cache()
train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)

# import BERT-base pre-trained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# for train set
train_seq = torch.tensor(train_text['input_ids'].tolist())
train_mask = torch.tensor(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_text['input_ids'].tolist())
val_mask = torch.tensor(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_text['input_ids'].tolist())
test_mask = torch.tensor(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())


from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


batch_size = 128

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# Random sampling
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# Validation sampler
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# freeze parameters in bert
for param in bert.parameters():
    param.requires_grad = False

# model defination
model = BERT_CNN(num_classes=4)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# optimizer
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)

#Loss definition
cross_entropy = nn.CrossEntropyLoss()

# set initial loss to infinite
best_valid_loss = float('inf')


# Accuracy Functions
def get_accuracy_per_batch(oglabels, predlabels):
  count = 0
  for i in range(len(oglabels)):
    if oglabels[i] == predlabels[i]:
      count+=1

  return count/len(oglabels)

def get_total_accuracy(acc_list):
  return sum(acc_list)/len(acc_list)


# function to train the model
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    total_preds = []
    accuracy = 0
    total = len(train_dataloader)
    acc_list = []
    for i, batch in enumerate(train_dataloader):
        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss/(total*batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>'  *(filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}, accuracy={accuracy}', end='')
        # push the batch to gpu
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()
        # clear previously calculated gradients
        model.zero_grad()
        # get model predictions for the current batch

        preds = model(sent_id.to(device).long(), mask)
        predicted_labels = torch.argmax(preds, dim=1)

        # print(f"Predictions are : {predicted_labels}")
        # print(f"Labels are : {labels}")
        accuracy = get_accuracy_per_batch(labels, predicted_labels)
        acc_list.append(accuracy)
        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)
        # print(f"Loss {loss}, loss item: {loss.item}")
        # add on to the total loss
        total_loss += float(loss.item())
        # backward pass to calculate the gradients
        loss.backward()
        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        # model predictions are stored on GPU. So, push it to CPU
        # append the model predictions
        total_preds.append(preds.detach().cpu().numpy())
        # break

    gc.collect()
    torch.cuda.empty_cache()

    # compute the training loss of the epoch
    avg_loss = total_loss / (len(train_dataloader)*batch_size)

    # avg accuracy
    total_accuracy = get_total_accuracy(acc_list)

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds, total_accuracy



# function for evaluating the model
def evaluate():
    print("\n\nEvaluating...")
    model.eval()
    total_loss, total_accuracy = 0, 0
    accuracy = 0
    total_preds = []
    acc_list = []
    # iterate over batches
    total = len(val_dataloader)
    for i, batch in enumerate(val_dataloader):
        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss/(total*batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}, accuracy={accuracy}', end='')
        # push the batch to gpu
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()
        # deactivate autograd
        with torch.no_grad():
            # model predictions
            preds = model(sent_id, mask)
            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss += float(loss.item())
            total_preds.append(preds.detach().cpu().numpy())
            predicted_labels = torch.argmax(preds, dim=1)


            accuracy = get_accuracy_per_batch(labels, predicted_labels)
            acc_list.append(accuracy)

    gc.collect()
    torch.cuda.empty_cache()
    # compute the validation loss of the epoch
    avg_loss = total_loss / (len(val_dataloader)*batch_size)
    # avg accuracy
    total_accuracy = get_total_accuracy(acc_list)
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds, total_accuracy


print(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  train_seq = torch.tensor(train_text['input_ids'].tolist())


cuda


In [None]:
import gc
import torch
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

best_accuracy = 0.0  # Initialize the best accuracy to a low value
best_model_state = None  # Store the state of the best model

# Define the number of epochs
epochs = 5
current = 1

# Training loop
while current <= epochs:
    print(f'\nEpoch {current} / {epochs}:')

    # Train model
    train_loss, _, train_acc = train()

    # Evaluate model
    valid_loss, _, valid_acc = evaluate()

    # Check if the current epoch's accuracy is the best so far
    if valid_acc > best_accuracy:
        best_accuracy = valid_acc
        best_model_state = model.state_dict()

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')
    print(f'\n\nTraining Accuracy: {train_acc:.3f}')
    print(f'Validation Accuracy: {valid_acc:.3f}')

    current = current + 1

# Save the model with the best accuracy
if best_model_state is not None:
    torch.save(best_model_state, 'bert_cnn_model.pth')

# Get predictions for test data
gc.collect()
torch.cuda.empty_cache()


Epoch 1 / 5:
Batch 1019/1019 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.00, accuracy=0.921875

Evaluating...
Batch 128/128 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.00, accuracy=0.8828125

Training Loss: 0.003
Validation Loss: 0.003


Training Accuracy: 0.835
Validation Accuracy: 0.883

Epoch 2 / 5:
Batch 1019/1019 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.00, accuracy=0.8984375

Evaluating...
Batch 128/128 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.00, accuracy=0.8828125

Training Loss: 0.002
Validation Loss: 0.002


Training Accuracy: 0.894
Validation Accuracy: 0.891

Epoch 3 / 5:
Batch 1019/1019 |██████████████████████████████████████████████████

In [None]:
if best_model_state is not None:
    model.load_state_dict(best_model_state)

with torch.no_grad():
    model.eval()  # Set model to evaluation mode
    preds = []    # List to store predictions

    # Iterate over batches
    for batch_seq, batch_mask in zip(test_seq.split(batch_size), test_mask.split(batch_size)):
        batch_seq = batch_seq.to(device)
        batch_mask = batch_mask.to(device)

        # Compute predictions
        batch_preds = model(batch_seq, batch_mask)
        batch_preds = batch_preds.detach().cpu().numpy()

        preds.extend(batch_preds)

print("Performance:")
# Model's performance
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_y, preds))
print("Accuracy: " + str(accuracy_score(test_y, preds)))

Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.80      0.80      2915
           1       0.88      0.89      0.89      4548
           2       0.89      0.89      0.89      5964
           3       1.00      1.00      1.00      2870

    accuracy                           0.89     16297
   macro avg       0.89      0.89      0.89     16297
weighted avg       0.89      0.89      0.89     16297

Accuracy: 0.8922501073817267


In [None]:
import torch
import gradio as gr
import gc

# Define your model architecture
# Assuming you have defined your model class somewhere
# Here I'll just create a placeholder class
class YourModel(torch.nn.Module):
    def __init__(self):
        super(YourModel, self).__init__()
        # Define your model layers here

    def forward(self, x):
        # Implement forward pass
        pass

# Load the model from the .pth file
model = YourModel()
model.load_state_dict(torch.load('bert_cnn_model.pth'))
model.eval()  # Set the model to evaluation mode

# Define a function to make predictions
def predict(text):
    # Preprocess the input text if needed
    # Convert text to tensor
    tensor_input = torch.tensor(text)  # Example: Convert text to tensor if needed
    # Pass the tensor through the model to get predictions
    with torch.no_grad():
        output = model(tensor_input)
    # Post-process the output if needed
    # Return the prediction result
    return output

# Create a Gradio interface
inputs = gr.inputs.Textbox(lines=5, label="Input Text")
outputs = gr.outputs.Textbox(label="Output Prediction")

# Create Gradio app
app = gr.Interface(fn=predict, inputs=inputs, outputs=outputs, title="Your Model Prediction App",
                   description="Enter text and get predictions.")
app.launch()
