In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import re
import fasttext

In [30]:
# Location of the pretrained fastText binary. This is an absolute,
# machine-specific path and has to be adjusted when running elsewhere.
FASTTEXT_MODEL_PATH = 'D:/Chipsal/Embeddings/devanagari_fasttext_cbow.bin'
fasttext_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

In [31]:
# Network architecture. These values are handed to BidirectionalLSTMModel, so
# they have to agree with the checkpoint whose weights are loaded into it.
input_size = fasttext_model.get_dimension()  # width of one word vector
num_classes = 2
hidden_size, num_layers = 256, 3

# Optimisation settings; none of the visible cells reads them.
learning_rate = 1e-3
num_epochs = 10

In [32]:
class Attention(nn.Module):
    """Attention pooling that collapses a sequence into a single vector.

    Each timestep ``h`` receives the score ``Va(tanh(Wa(h) + Ua(h)))``. The
    scores are normalised with a softmax over dimension 1 and used as the
    weights of a sum over the timesteps.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Attribute names and creation order are part of the checkpoint
        # layout / RNG consumption, so they are kept as Wa, Ua, Va.
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, lstm_output):
        """Pool ``lstm_output`` along dimension 1.

        Args:
            lstm_output: 3-D tensor laid out as (batch, seq_len, hidden_size).

        Returns:
            A pair ``(context, weights)``: the weighted sum of the timesteps,
            shaped (batch, hidden_size), and the softmax weights, shaped
            (batch, seq_len).
        """
        projected = self.Wa(lstm_output) + self.Ua(lstm_output)
        energies = self.Va(torch.tanh(projected))  # (batch, seq_len, 1)
        attention_weights = energies.softmax(dim=1)

        # (batch, 1, seq_len) @ (batch, seq_len, hidden) -> (batch, 1, hidden)
        pooled = torch.bmm(attention_weights.transpose(1, 2), lstm_output)
        return pooled.squeeze(1), attention_weights.squeeze(2)

class BidirectionalLSTMModel(nn.Module):
    """Bidirectional multi-layer LSTM followed by attention pooling and a linear head.

    Args:
        input_size: size of each input feature vector (last dimension of ``x``).
        hidden_size: hidden size of the LSTM, per direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BidirectionalLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        # Both directions are concatenated, so downstream layers see 2 * hidden_size.
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return class logits for a batch of embedded sequences.

        Args:
            x: tensor laid out as (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Build the zero initial states on the same device/dtype as the input
        # instead of reading a notebook-global ``device``; the module then works
        # wherever the caller has placed the model and its inputs.
        state_shape = (self.num_layers * 2, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))

        # The attention weights are not needed for classification.
        context_vector, _ = self.attention(out)

        return self.fc(context_vector)

In [33]:
class DevanagariDataset(Dataset):
    """Dataset that turns whitespace-tokenised texts into fixed-length embedding matrices.

    Args:
        texts: a pandas Series / array (anything exposing ``tolist()``) or a
            plain iterable of strings.
        max_length: number of token rows in every returned tensor; longer
            texts are truncated and shorter ones are zero-padded.
        embedding_model: object exposing ``get_word_vector(word)`` and
            ``get_dimension()``. When omitted, the notebook-level
            ``fasttext_model`` is used.
    """

    def __init__(self, texts, max_length=50, embedding_model=None):
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.max_length = max_length
        # Only fall back to the global when no model was supplied explicitly.
        self.embedding_model = fasttext_model if embedding_model is None else embedding_model
        self.embedding_dim = self.embedding_model.get_dimension()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a ``(max_length, embedding_dim)`` tensor for the text at ``idx``."""
        text = self.texts[idx]
        # A missing value (e.g. NaN read from a CSV) is treated as an empty text,
        # which produces an all-padding tensor instead of an AttributeError.
        # Truncating before the lookup avoids embedding words that are dropped anyway.
        words = text.split()[:self.max_length] if isinstance(text, str) else []

        # Start from the padding and overwrite the leading rows with word vectors.
        text_tensor = torch.zeros(self.max_length, self.embedding_dim)
        for row, word in enumerate(words):
            text_tensor[row] = torch.as_tensor(self.embedding_model.get_word_vector(word))

        return text_tensor

In [34]:
# Test split; later cells read its `tweet` and `index` columns.
TEST_CSV_PATH = "../datasets/test.csv"
test_df = pd.read_csv(TEST_CSV_PATH)

In [35]:
def preprocess_text(text):
    """Remove digits and punctuation from ``text`` and trim surrounding whitespace.

    ASCII and Devanagari digits are deleted first, then every character listed
    in ``pattern`` (Devanagari, Arabic-script and ASCII punctuation/symbols).

    Args:
        text: the raw string to clean. Non-string values (``None``, NaN) are
            treated as missing text.

    Returns:
        The cleaned, stripped string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on a missing value; map it to empty text
        # so the row is kept.
        return ""

    pattern = r'[॥।॰،۔؟٪×÷!@#$%^&*()_+={}\[\]:;"\'<>,.?/~`|-]'
    without_digits = re.sub(r'[०१२३४५६७८९0-9]', '', text)
    clean_text = re.sub(pattern, '', without_digits)
    # str.strip() returns a new string; its result must be returned, not discarded.
    return clean_text.strip()

# Clean every tweet, then trim the whitespace left at either end.
test_df["tweet"] = test_df["tweet"].apply(preprocess_text).str.strip()

In [36]:
# Batches are served in frame order (no shuffling) so that the collected
# predictions line up row-for-row with test_df.
BATCH_SIZE = 32
test_dataset = DevanagariDataset(texts=test_df["tweet"])
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [37]:
# Choose the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BidirectionalLSTMModel(input_size, hidden_size, num_layers, num_classes)
# map_location lets a checkpoint that was saved from a GPU session load on a
# CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load("../models/lstm_attention_model-st2.pth", map_location=device)
model.load_state_dict(state_dict)
model.to(device)
# eval() returns the module, so the cell still displays the model summary.
model.eval()

BidirectionalLSTMModel(
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, bidirectional=True)
  (attention): Attention(
    (Wa): Linear(in_features=512, out_features=512, bias=True)
    (Ua): Linear(in_features=512, out_features=512, bias=True)
    (Va): Linear(in_features=512, out_features=1, bias=True)
  )
  (fc): Linear(in_features=512, out_features=2, bias=True)
)

In [38]:
# Score the test batches without tracking gradients and collect, for every
# row, the index of the largest logit.
predictions = []
with torch.no_grad():
    for batch in test_loader:
        logits = model(batch.to(device))
        predicted = logits.argmax(dim=1)
        predictions.extend(predicted.cpu().numpy())

# Attach the predicted class indices to the frame as a new column
test_df['prediction'] = predictions

In [39]:
# Show the frame, which now includes the `prediction` column.
test_df

Unnamed: 0,index,tweet,prediction
0,10000,मतदान प्रक्रिया के दौरान DmAmethi व SPअमेठी द्...,0
1,10005,यूपी में फीसदी मतदाता पहली बार मतदान करेंगे म...,0
2,10024,DIGSpdeoria द्वारा AssemblyElections के दृष्टि...,0
3,10027,BJP राज्यों में बहुमत से जीतेगी पीएम मोदी\n\n...,0
4,10053,लोकतंत्र के प्रति जगे AssemblyElections,0
...,...,...,...
4071,37146,महोत्तरीको बर्दिबास नगरपालिकाको अहिलेको मतगणना...,0
4072,37148,RaameshKoirala असभ्यअनुशासनहीन र अाडम्बरि हौं ...,0
4073,37153,नतिजा संगै जनमत र जनअपेक्षा बीच एकमना छ अब राष...,0
4074,37167,हाम्रो वडाका एमाले अध्यक्षका उमेदवारले भोट आँ...,1


In [40]:
# The tweet text is not written to the output file, so drop it here.
# Reassigning with errors='ignore' (instead of inplace=True) lets this cell be
# re-run without a KeyError once the column is already gone.
test_df = test_df.drop(columns=['tweet'], errors='ignore')

In [41]:
# Order the rows by their `index` value, then preview the first five.
test_df = test_df.sort_values("index")
test_df.head(5)

Unnamed: 0,index,prediction
0,10000,0
1,10005,0
2,10024,0
3,10027,0
4,10053,0


In [42]:
output_file_path = 'output.txt'

# Write one JSON object per line: {"index": <int>, "prediction": <int>}.
# The two columns are iterated directly instead of using DataFrame.iterrows,
# which builds a Series per row and can upcast values to a common dtype; the
# explicit int() cast guarantees the numbers are never rendered as floats
# such as 10000.0.
with open(output_file_path, 'w', encoding='utf-8') as f:
    for row_index, prediction in zip(test_df["index"], test_df["prediction"]):
        f.write(f'{{"index": {int(row_index)}, "prediction": {int(prediction)}}}\n')