In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Step 1: Load your dataset
# Fixed seed so that a fresh "Restart & Run All" reproduces the same row order
# and the same PyTorch random stream (torch.manual_seed seeds torch's global RNG).
SEED = 42
torch.manual_seed(SEED)

parquet_file_path1 = "/kaggle/input/sentiment-dataset/train-00000-of-00001 (1).parquet"
parquet_file_path2 = "/kaggle/input/sentiment-dataset/train-00000-of-00001.parquet"

df1 = pd.read_parquet(parquet_file_path1)
df2 = pd.read_parquet(parquet_file_path2)
# NOTE(review): the two file names differ only by a " (1)" suffix -- confirm they
# are distinct shards and not two copies of the same file (which would
# duplicate every training row).
df = pd.concat([df1, df2], ignore_index=True)
# Shuffle all rows once; random_state pins the permutation so runs are comparable.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Step 2: Tokenization
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

# Step 3: Create Dataset and DataLoader
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Rows are read positionally from ``dataframe``; each row must provide a
    ``'text'`` and a ``'label'`` entry.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the inputs unchanged.

        Args:
            dataframe: frame holding the ``'text'`` and ``'label'`` columns.
            tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
                tokenizer).
            max_length: value forwarded to the tokenizer as ``max_length``.
        """
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'label'`` (a
            ``torch.long`` tensor built from the row's label).
        """
        row = self.dataframe.iloc[idx]
        text, label = row['text'], row['label']

        # Pad/truncate to max_length and request PyTorch tensors.
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        # reshape(-1) collapses the tokenizer output to a single dimension so
        # the DataLoader's default collation stacks items into (batch, length).
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item

# Tunable constants
MAX_LENGTH = 128  # token length every example is padded/truncated to; adjust as needed
BATCH_SIZE = 32  # examples per training/evaluation batch; adjust as needed

# Wrap the training frame in a dataset and serve it in shuffled mini-batches
dataset = CustomDataset(dataframe=df, tokenizer=tokenizer, max_length=MAX_LENGTH)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

# Step 4: Define the Model
class MyClassifier(torch.nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        num_classes: number of output logits.
        model_name: checkpoint passed to ``AutoModel.from_pretrained``.
            Defaults to ``"ai4bharat/indic-bert"``; it should match the
            checkpoint used to build the tokenizer.
        dropout: dropout probability applied to the pooled encoder output
            before the linear head. Defaults to ``0.1``.
    """

    def __init__(self, num_classes, model_name="ai4bharat/indic-bert", dropout=0.1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Head input width is taken from the encoder config so it follows
        # whichever checkpoint was loaded.
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            The output of the linear head applied to the (dropout-regularised)
            ``pooler_output`` of the encoder.

        Raises:
            ValueError: if the encoder output has no ``pooler_output``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = getattr(outputs, "pooler_output", None)
        if pooled_output is None:
            # Fail with a clear message instead of an opaque error inside dropout.
            raise ValueError(
                "The encoder did not return a pooler_output; "
                "choose a checkpoint whose model provides one."
            )
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

# Step 5: Training Loop
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyClassifier(num_classes=3)  # assuming 3 classes
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(5):
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Move the batch onto the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # One optimisation step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        loss = criterion(
            model(input_ids=input_ids, attention_mask=attention_mask),
            labels,
        )
        loss.backward()
        optimizer.step()

        # Show the most recent batch loss next to the progress bar.
        progress_bar.set_postfix({'loss': loss.item()})


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Epoch 1:  61%|██████    | 128/209 [40:19<25:18, 18.75s/it, loss=1.09] 

In [None]:
# Step 6: Evaluation (using a separate test dataset)
# Mirrors the training loop, but the model is put in evaluation mode and no
# gradients are computed or applied.

# Step 6a: Load the test dataset
test_parquet_file_path = "/kaggle/input/testing/test-00000-of-00001.parquet"
test_df = pd.read_parquet(test_parquet_file_path)

# Step 6b: Create DataLoader for testing (no shuffling needed for evaluation)
test_dataset = CustomDataset(test_df, tokenizer, max_length=MAX_LENGTH)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Step 6c: Evaluation loop
model.eval()
# Both counters must start at zero: a non-zero starting value for the
# correct-prediction count would inflate the reported accuracy.
correct_predictions = 0
total_test_samples = 0

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        predicted_labels = logits.argmax(dim=1)

        correct_predictions += (predicted_labels == labels).sum().item()
        total_test_samples += labels.size(0)

if total_test_samples == 0:
    # Accuracy is undefined without samples; fail with a clear message
    # instead of a bare ZeroDivisionError.
    raise ValueError("Test dataset is empty; accuracy cannot be computed.")

test_accuracy = correct_predictions / total_test_samples
print(f"Test Accuracy: {test_accuracy}")
