In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm  

import numpy as np 
import pandas as pd 
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed

# Pin the random seed up front so that repeated runs give consistent results
SEED = 42
set_seed(SEED)


In [None]:
# Change paths here
# NOTE: the source path is called `input_path` (not `input`) so that Python's
# built-in input() stays usable for the rest of the kernel session.
input_path = "kaggle_environment_input_path"  # CSV to classify; must have a 'text' column
output = "senti_split_11"  # path the labelled DataFrame is written to

# Read the rows to classify into a DataFrame
df = pd.read_csv(input_path)


In [None]:
# Hugging Face checkpoint that provides both the classifier and its tokenizer
checkpoint = 'siebert/sentiment-roberta-large-english'

# Sequence-classification model for the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Tokenizer matching the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# Define dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sized collection of strings addressed by integer position,
            e.g. a list or a pandas Series (with any index).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            truncation=True, padding='max_length', max_length=max_length)``.
        max_length: Length passed to the tokenizer for truncation and
            padding. Defaults to 512.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize and return the text at position ``idx``.

        Args:
            idx: Zero-based position of the item.

        Returns:
            Whatever the tokenizer returns for that single text.
        """
        texts = self.texts
        # A pandas Series resolves ``series[idx]`` by index *label*, which
        # raises KeyError (or silently picks another row) when the index is
        # not 0..n-1, e.g. after filtering. ``.iloc`` is always positional,
        # which is what a DataLoader sampler expects.
        text = texts.iloc[idx] if hasattr(texts, 'iloc') else texts[idx]
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        return inputs


In [None]:
# Run on CUDA when torch reports it as available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)
model.eval()  # evaluation mode for inference

# Wrap the text column in a Dataset and iterate it in order, 32 items at a time
texts = df['text'].copy()
dataset = SentimentDataset(texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# One predicted class index per row, collected in DataLoader order
predictions = []

# Gradients are not needed for inference
with torch.no_grad():
    # tqdm shows a progress bar over the batches
    for batch in tqdm(dataloader, desc="Classifying"):
        inputs = {}
        for key, val in batch.items():
            # Drop the size-1 axis at position 1, then move the tensor to the device
            inputs[key] = val.squeeze(1).to(device)
        logits = model(**inputs).logits
        # Highest-scoring class per row
        predictions += logits.argmax(dim=1).tolist()

print("Classification complete.")


In [None]:

# Store the predicted class indices in a 'label' column
df['label'] = predictions
# Write the labelled frame to the configured output path
df.to_csv(path_or_buf=output)