In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import numpy as np
import pandas as pd
import os


## Import the necessary libraries

In [21]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,TensorDataset

## Load the dataset from the directory and save it into the file

In [22]:
df = pd.read_csv('/content/drive/MyDrive/Twitter_Data.csv')

In [23]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [24]:
df.shape

(162980, 2)

In [25]:
df = df.iloc[:5000,:]

In [26]:
df.shape

(5000, 2)

## Check for null values

In [27]:
df.isna().sum()

Unnamed: 0,0
clean_text,1
category,0


## Drop the null values in the dataset

In [28]:
df = df.dropna()

In [29]:
df.isna().sum()

Unnamed: 0,0
clean_text,0
category,0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4999 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   clean_text  4999 non-null   object 
 1   category    4999 non-null   float64
dtypes: float64(1), object(1)
memory usage: 117.2+ KB


In [31]:
df.describe()

Unnamed: 0,category
count,4999.0
mean,0.206441
std,0.773841
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## Preprocess and make tokenize into the text

In [36]:
!pip install nltk
from nltk.corpus import stopwords
import re



In [42]:
!pip install nltk
from nltk.corpus import stopwords
import re
import nltk # make sure to import nltk

nltk.download('stopwords') # Download the stopwords resource

def func(x):
    x = re.sub("[^a-zA-Z]"," ",x)
    x = x.split()
    x = [i.strip() for i in x if i not in set(stopwords.words('english'))]
    return " ".join(x)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [43]:
df['clean_text'] = df['clean_text'].astype('str')
df.clean_text = df.clean_text.apply(func)

In [44]:
df.category = df.category.astype('int')

In [46]:
df.head()

Unnamed: 0,clean_text,category
0,modi promised minimum government maximum gover...,-1
1,talk nonsense continue drama vote modi,0
2,say vote modi welcome bjp told rahul main camp...,1
3,asking supporters prefix chowkidar names modi ...,1
4,answer among powerful world leader today trump...,1


## Build the Vocabulary from the dataset

In [47]:
from collections import Counter

In [48]:
vals = df.clean_text.to_list()
texts = [i.split(" ") for i in vals]

In [49]:
def build_vocab(tokenized_data):
    all_words = [word for sentence in tokenized_data for word in sentence]
    word_counts = Counter(all_words)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    vocab.update({word: idx + 2 for idx, (word, _) in enumerate(word_counts.items())})
    return vocab

In [50]:
vocab = build_vocab(texts)

## Make the text into the encoded format

In [51]:
def encode_text(tokenized_data, vocab):
    encoded_data = []
    for sentence in tokenized_data:
        encoded_sentence = [vocab.get(word, vocab['<UNK>']) for word in sentence]
        encoded_data.append(encoded_sentence)
    return encoded_data

# Encode the tokenized sentences
encoded_data = encode_text(texts, vocab)
print("Encoded data:", encoded_data[0])


Encoded data: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 13, 18, 19, 20, 21]


## Make padding the data

In [52]:
import torch
from torch.nn.utils.rnn import pad_sequence

sequence_lengths = [len(seq) for seq in encoded_data]
# print("Original sequence lengths:", sequence_lengths)
encoded_tensors = [torch.tensor(seq) for seq in encoded_data]
padded_data = pad_sequence(encoded_tensors, batch_first=True, padding_value=vocab['<PAD>'])
# print("Padded data:\n", padded_data)

## Take away the target

In [53]:
target = df.category

## Build the sentiment analysis model

In [54]:
class LSTMSentimentModel(nn.Module):
    def __init__(self,vocab_size,embed_dim,hid_dim,out_dim):
        super(LSTMSentimentModel,self).__init__()
        self.embedding = nn.Embedding(vocab_size,embed_dim,padding_idx=vocab['<PAD>'])
        self.lstm = nn.LSTM(embed_dim,hid_dim,batch_first=True)
        self.fc = nn.Linear(hid_dim,out_dim)

    def forward(self,text,lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,lengths,batch_first=True,enforce_sorted=False)
        packed_output,(hidden,cell) = self.lstm(packed_embedded)
        hidden = hidden[-1]
        output = self.fc(hidden)
        return output

In [55]:
labels = torch.tensor(target,dtype=torch.float)

## Make the dataset from the sequence lengths and make it into the dataset

In [56]:
lengths_tensor = torch.tensor(sequence_lengths)
dataset = TensorDataset(padded_data,lengths_tensor,labels)

In [57]:
dataloader = DataLoader(dataset,batch_size=2,shuffle=True)

## Define parameters for the model

In [58]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

## Make the sentiment analysis model

In [59]:
model = LSTMSentimentModel(vocab_size,embedding_dim,hidden_dim,output_dim)

## Make the criterion for loss calcultion and optimizer for backpropagation and update the weights

In [60]:
citerion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

## Iterate over the epochs, calculated loss and make optimize the value

In [61]:
n_epochs = 5
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    for text_batch,lengths_batch,labels_batch in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch,lengths_batch).squeeze(1)
        loss = citerion(pred,labels_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1} , Loss :{ avg_loss: .4f}")

Epoch 1 , Loss : 0.5262
Epoch 2 , Loss : 0.4019
Epoch 3 , Loss :-1.6291
Epoch 4 , Loss :-17.8948
Epoch 5 , Loss :-49.6362


## Sample text for the trained model prediction

In [62]:
text = "i look very good "
text = [vocab.get(i.lower(),vocab["<UNK>"]) for i in text.split(" ")]
sss = len(text)
text = torch.tensor(text).unsqueeze(0)
# print(text)
# padd = pad_sequence(text, padding_value=vocab['<PAD>'])

In [63]:
with torch.no_grad():
    op = model(text,lengths=[sss])

In [64]:
op = torch.sigmoid(op).round()

In [65]:
op

tensor([[1.]])

In [66]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
  for text_batch, lengths_batch, labels_batch in dataloader:
    pred = model(text_batch, lengths_batch).squeeze(1)
    pred = torch.sigmoid(pred).round().cpu().numpy()
    all_preds.extend(pred)
    all_labels.extend(labels_batch.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.6823
