In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import re
import fasttext

In [24]:
# Load the pretrained fastText binary used later to embed each word.
fasttext_model = fasttext.load_model('../Embeddings/devanagari_fasttext_cbow.bin')

# Read the training CSV; the first CSV column becomes the DataFrame index.
df = pd.read_csv("datasets/trainv1.csv", index_col=0)

# Preview the first rows.
df.head()

Unnamed: 0,index,tweet,label
0,10001,अमर उजाला दैनिक समाचार में हेड लाइन में प्रकाश...,0
1,10002,बीजेपी @BJP4India केन्द्रीय चुनाव समिति की मीट...,0
2,10003,#AssemblyElections2022 प्रभारी निरीक्षक गौर श्...,0
3,10004,#ब्रेकिंग - मध्यप्रदेश सीहोर में हैवानियत 10 स...,0
4,10006,#AmitShah ने कहा उत्तर प्रदेश में पूर्ववर्ती स...,0


In [27]:
# Split the frame by label value so the two classes can be balanced separately.
df_0 = df.loc[df['label'] == 0]  # rows whose label is 0
df_1 = df.loc[df['label'] == 1]  # rows whose label is 1

In [28]:
# Draw (without replacement, fixed seed) as many label-0 rows as there are label-1 rows.
df_0_downsampled = df_0.sample(
    n=len(df_1),
    replace=False,
    random_state=42,
)

In [29]:
# Stack the down-sampled label-0 rows on top of all label-1 rows.
balanced_parts = [df_0_downsampled, df_1]
df_balanced = pd.concat(balanced_parts)

In [30]:
# Shuffle every row with a fixed seed, then discard the old index
# so the frame is numbered 0..n-1 again.
df_balanced = (
    df_balanced
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [32]:
# Continue with the balanced frame; after this assignment `df` and
# `df_balanced` are two names for the same DataFrame object.
df = df_balanced
# Display the number of rows per label value.
df["label"].value_counts()

label
0    7942
1    7942
Name: count, dtype: int64

In [33]:
# Drop rows with any missing value. Rebinding `df` to a new frame (instead of
# `inplace=True`) keeps the cell free of hidden side effects on `df_balanced`,
# which refers to the same object as `df`; resetting the index keeps row labels
# contiguous after rows are removed.
df = df.dropna().reset_index(drop=True)

In [34]:
# Display the column labels of the working frame.
df.columns

Index(['index', 'tweet', 'label'], dtype='object')

In [35]:
def preprocess_text(text):
    """Remove digits and punctuation from ``text`` and trim surrounding whitespace.

    Both Devanagari (०-९) and ASCII (0-9) digits are deleted, followed by the
    punctuation/symbol characters listed in ``pattern`` (this includes ``#`` and
    ``@``, so hashtags and mentions keep only their word part).

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
        Whitespace between words is left as-is.
    """
    pattern = r'[॥।॰،۔؟٪×÷!@#$%^&*()_+={}\[\]:;"\'<>,.?/~`|-]'
    text = re.sub(r'[०१२३४५६७८९0-9]', '', text)
    clean_text = re.sub(pattern, '', text)
    # str.strip() returns a new string; its result must be returned, not discarded.
    return clean_text.strip()

# Clean every tweet, then trim any whitespace left at either end.
df['tweet'] = (
    df['tweet']
    .apply(preprocess_text)
    .str.strip()
)

In [36]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df["tweet"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

In [37]:
# Display the number of held-out rows per label value.
y_test.value_counts()

label
0    1589
1    1588
Name: count, dtype: int64

In [38]:
class DevanagariDataset(Dataset):
    """Dataset that turns each text into a fixed-length matrix of word vectors.

    Every text is split on whitespace, each word is embedded with a
    fastText-style model, and the sequence is truncated or zero-padded to
    exactly ``max_length`` rows.

    Args:
        texts: Sequence of strings exposing ``.tolist()`` (e.g. a pandas Series).
        labels: Sequence of integer class ids exposing ``.tolist()``.
        max_length: Number of word positions kept per text.
        embedding_model: Object providing ``get_word_vector(word)`` and
            ``get_dimension()``. Defaults to the notebook-level ``fasttext_model``.
    """

    def __init__(self, texts, labels, max_length=50, embedding_model=None):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.max_length = max_length
        # Fall back to the notebook's global model so existing callers keep working.
        self.embedding_model = fasttext_model if embedding_model is None else embedding_model
        self.embedding_dim = self.embedding_model.get_dimension()

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(text_tensor, label_tensor)`` for the sample at ``idx``.

        ``text_tensor`` has shape ``(max_length, embedding_dim)``; rows beyond the
        number of words in the text are zeros. ``label_tensor`` is a scalar
        ``torch.long`` tensor.
        """
        # Truncate before embedding so words past max_length are never looked up.
        words = self.texts[idx].split()[:self.max_length]

        # Start from an all-zero matrix: unfilled rows are the padding, and an
        # empty text (or max_length == 0) needs no special handling.
        text_tensor = torch.zeros(self.max_length, self.embedding_dim)
        for position, word in enumerate(words):
            text_tensor[position] = torch.as_tensor(
                self.embedding_model.get_word_vector(word)
            )

        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return text_tensor, label_tensor


In [39]:
# Training batches are reshuffled every epoch.
train_dataset = DevanagariDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Evaluation does not benefit from shuffling; a fixed order keeps per-batch
# results reproducible and comparable between epochs.
test_dataset = DevanagariDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [40]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [41]:
class Attention(nn.Module):
    """Attention pooling that collapses dimension 1 of a 3-D input.

    For an input of shape ``(batch, steps, hidden_size)`` a scalar score is
    computed per step, the scores are soft-maxed over ``steps``, and the input
    is averaged with those weights.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Two projections of the same input are summed before the tanh.
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        # Maps each projected step to a single unnormalised score.
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, lstm_output):
        """Return ``(context_vector, attention_weights)`` for ``lstm_output``.

        ``context_vector`` is the weighted sum over dimension 1 with the size-1
        axis squeezed away; ``attention_weights`` has its trailing size-1 axis
        squeezed away.
        """
        projected = self.Wa(lstm_output) + self.Ua(lstm_output)
        raw_scores = self.Va(torch.tanh(projected))

        # Normalise across dimension 1 so each sample's weights sum to one.
        weights = torch.softmax(raw_scores, dim=1)

        # (batch, 1, steps) @ (batch, steps, hidden) -> (batch, 1, hidden)
        row_weights = weights.permute(0, 2, 1)
        pooled = torch.bmm(row_weights, lstm_output)
        return pooled.squeeze(1), weights.squeeze(2)

class BidirectionalLSTMModel(nn.Module):
    """Bidirectional LSTM encoder, attention pooling, then a linear classifier.

    Args:
        input_size: Feature size of each input step.
        hidden_size: Hidden size of the LSTM per direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        # Forward and backward states are concatenated, hence hidden_size * 2.
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for input ``x``.

        ``x`` is passed to a ``batch_first`` LSTM, i.e. it is laid out as
        ``(batch, steps, input_size)``.
        """
        # When no initial state is passed, nn.LSTM starts from zeros created on
        # the input's own device. This matches the explicit zero h0/c0 used
        # before, without depending on a module-level `device` variable.
        out, _ = self.lstm(x)

        # The attention weights are not needed for classification.
        context_vector, _ = self.attention(out)

        return self.fc(context_vector)

In [42]:
# Training schedule.
num_epochs = 10
learning_rate = 0.001

# Network shape.
hidden_size = 256
num_layers = 3

# Sizes derived from the data: one input feature per embedding dimension and
# one output per distinct label value.
input_size = fasttext_model.get_dimension()
num_classes = len(df['label'].unique())

In [43]:
# Build the network and move its parameters to the selected device
# (nn.Module.to returns the module itself, so the chained form is equivalent).
model = BidirectionalLSTMModel(
    input_size,
    hidden_size,
    num_layers,
    num_classes,
).to(device)
# Display the module summary.
model

BidirectionalLSTMModel(
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, bidirectional=True)
  (attention): Attention(
    (Wa): Linear(in_features=512, out_features=512, bias=True)
    (Ua): Linear(in_features=512, out_features=512, bias=True)
    (Va): Linear(in_features=512, out_features=1, bias=True)
  )
  (fc): Linear(in_features=512, out_features=2, bias=True)
)

In [44]:
def _run_epoch(model, loader, criterion, run_device, desc, optimizer=None):
    """Make one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of the
        per-batch losses and ``accuracy`` is correct predictions divided by
        ``len(loader.dataset)``. Both are ``nan`` when there is nothing to
        average over.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    correct = 0
    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        for texts, labels in tqdm(loader, desc=desc, leave=False):
            texts, labels = texts.to(run_device), labels.to(run_device)
            if training:
                optimizer.zero_grad()

            outputs = model(texts)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            correct += (outputs.argmax(dim=1) == labels).sum().item()

    num_batches = len(loader)
    num_samples = len(loader.dataset)
    mean_loss = total_loss / num_batches if num_batches else float("nan")
    accuracy = correct / num_samples if num_samples else float("nan")
    return mean_loss, accuracy


def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs, evaluating after each one.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a module-level ``device`` variable.

    Args:
        model: The network to optimise.
        train_loader: DataLoader yielding ``(inputs, labels)`` training batches.
        test_loader: DataLoader yielding ``(inputs, labels)`` evaluation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``,
        ``train_accuracy``, ``test_loss`` and ``test_accuracy``.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    history = []
    for epoch in range(num_epochs):
        train_loss, train_accuracy = _run_epoch(
            model, train_loader, criterion, run_device,
            f"Epoch {epoch + 1}/{num_epochs} - Training", optimizer,
        )
        test_loss, test_accuracy = _run_epoch(
            model, test_loader, criterion, run_device,
            f"Epoch {epoch + 1}/{num_epochs} - Testing",
        )

        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "test_loss": test_loss,
            "test_accuracy": test_accuracy,
        })

    return history


# Adam over all model parameters, paired with cross-entropy on the class logits.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [45]:
# Run the full training/evaluation loop, then write the model's state_dict
# (as it stands after the last epoch) to a file in the working directory.
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs)
torch.save(model.state_dict(), "lstm_attention_model-st2.pth")

                                                                        

Epoch 1/10
Train Loss: 0.4810, Train Accuracy: 0.7769
Test Loss: 0.4470, Test Accuracy: 0.7954


                                                                        

Epoch 2/10
Train Loss: 0.4205, Train Accuracy: 0.8067
Test Loss: 0.4011, Test Accuracy: 0.8096


                                                                        

Epoch 3/10
Train Loss: 0.3995, Train Accuracy: 0.8134
Test Loss: 0.3924, Test Accuracy: 0.8193


                                                                        

Epoch 4/10
Train Loss: 0.3882, Train Accuracy: 0.8237
Test Loss: 0.3858, Test Accuracy: 0.8237


                                                                        

Epoch 5/10
Train Loss: 0.3712, Train Accuracy: 0.8342
Test Loss: 0.3901, Test Accuracy: 0.8193


                                                                        

Epoch 6/10
Train Loss: 0.3483, Train Accuracy: 0.8464
Test Loss: 0.3865, Test Accuracy: 0.8253


                                                                        

Epoch 7/10
Train Loss: 0.3338, Train Accuracy: 0.8525
Test Loss: 0.3929, Test Accuracy: 0.8275


                                                                        

Epoch 8/10
Train Loss: 0.3227, Train Accuracy: 0.8596
Test Loss: 0.3892, Test Accuracy: 0.8275


                                                                        

Epoch 9/10
Train Loss: 0.2956, Train Accuracy: 0.8749
Test Loss: 0.4177, Test Accuracy: 0.8200


                                                                         

Epoch 10/10
Train Loss: 0.2722, Train Accuracy: 0.8879
Test Loss: 0.4476, Test Accuracy: 0.8215


