In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [23]:
class LSTMModel(nn.Module):
    """Sequence classifier: stacked LSTM followed by a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Float tensor of shape (batch, seq_len, input_size); the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # No explicit (h0, c0): nn.LSTM initialises both to zeros on the same
        # device as its input, so the module no longer depends on a
        # module-level `device` variable existing before it is called.
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # Take the output from the last time step
        return self.fc(out)

In [24]:
class DevanagariDataset(Dataset):
    """Dataset that encodes each text as a fixed-length tensor of code points.

    Every character is mapped to ``ord(char)``; sequences are truncated to
    ``max_length`` or right-padded with zeros up to ``max_length``.

    Args:
        texts: Collection of strings. Anything exposing ``.tolist()`` (for
            example a pandas Series) or any plain iterable is accepted.
        max_length: Length of every returned tensor.
    """

    def __init__(self, texts, max_length=100):
        # Accept plain lists/tuples as well as array-likes with .tolist().
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _as_text(value):
        """Coerce one entry to str; None/NaN (missing cells) become ''."""
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Return a 1-D ``torch.long`` tensor of length ``max_length``."""
        text = self._as_text(self.texts[idx])
        codes = [ord(char) for char in text[:self.max_length]]
        # Right-pad with zeros so every item has the same length.
        codes.extend([0] * (self.max_length - len(codes)))
        return torch.tensor(codes, dtype=torch.long)

In [25]:
# Read the test CSV and wrap its "text" column for batched inference.
# shuffle=False keeps batches in file order so predictions line up with rows.
test_df = pd.read_csv("../datasets/test.csv")  # Adjust the path as necessary
test_dataset = DevanagariDataset(texts=test_df["text"])
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [26]:
# Model hyperparameters passed to LSTMModel; load_state_dict below requires
# the resulting parameter shapes to match the saved checkpoint.
input_size = 1  # Each character is represented by a single integer
hidden_size = 128
num_layers = 2
num_classes = 5

# Choose the device before loading so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMModel(input_size, hidden_size, num_layers, num_classes)
# map_location lets a checkpoint that was saved from a CUDA device load on a
# CPU-only machine instead of raising during deserialization.
state_dict = torch.load("../models/lstm_model.pth", map_location=device)
model.load_state_dict(state_dict)
model.eval()  # inference mode
model.to(device)

LSTMModel(
  (lstm): LSTM(1, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)

In [27]:
# Run inference batch by batch and collect the predicted class index per row.
predictions = []
with torch.no_grad():
    for batch in test_loader:
        # Add a trailing feature axis and cast to float for the LSTM input.
        features = batch.to(device).unsqueeze(-1).float()
        logits = model(features)
        predicted = logits.argmax(dim=1)
        predictions.extend(predicted.cpu().numpy())

# Save predictions to a DataFrame
test_df['prediction'] = predictions

In [28]:
# Display the test frame, now including the 'prediction' column.
test_df

Unnamed: 0,index,text,prediction
0,10004,त्यो आँ गरेको अन्वार मा झिँगा पस्न सक्छ है? हि...,0
1,10005,निर्वाचन परिणाम ले बल्ल बुद्धि आयो?,1
2,10007,पोखराको मेयर उम्मेदवारी दिन कृष्ण थापाले दिए स...,0
3,10011,दलित महिला सदस्यमा उम्मेदवारी परेन,0
4,10018,पार्टी निर्णय बिपरीत उम्मेद्वारी दिने दिनेलाई ...,1
...,...,...,...
11229,84860,राज्य सरकारला पाच वर्षे होत आले आहेत मात्र हे ...,3
11230,84862,"महाविकास आघाडी बाबुभैया, “हेराफेरी”तो है!राज्य...",1
11231,84865,एकात्मिक बालविकास कार्यक्रमाअंतर्गत कोल्हापूर ...,2
11232,84876,अत्यल्प भाव मिळालेल्या शेतकऱ्यांना किमान कोटी ...,1


In [29]:
# Drop the raw text so only identifiers and predictions remain.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError for 'text'.
test_df = test_df.drop(columns=['text'], errors='ignore')

In [37]:
# Order rows by the "index" column (ascending) and preview the first five.
test_df = test_df.sort_values("index", ascending=True)
test_df.head(5)

Unnamed: 0,index,prediction
0,10004,0
1,10005,1
2,10007,0
3,10011,0
4,10018,1


In [38]:
output_file_path = 'output.txt'

# Write one JSON object per line: {"index": <int>, "prediction": <int>}.
# The two columns are iterated directly rather than via iterrows(): iterrows()
# builds a Series per row, which can upcast integers to floats when any
# remaining column is float and would then emit values such as 10004.0.
with open(output_file_path, 'w', encoding='utf-8') as f:
    for row_index, prediction in zip(test_df["index"], test_df["prediction"]):
        f.write(f'{{"index": {int(row_index)}, "prediction": {int(prediction)}}}\n')