In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm  # Import tqdm

import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed
from collections import Counter

In [2]:
# Read the checkpoint CSV into a DataFrame (point the path at your own copy of the file)
df = pd.read_csv(filepath_or_buffer='/kaggle/input/checkpoint1/checkpoint1.csv')
df

In [4]:
# Set random seeds for reproducible and consistent results
set_seed(42)

# Identifier of the pretrained checkpoint. The tokenizer and the
# sequence-classification model are both loaded from this same identifier so
# that the vocabulary and the model weights belong together.
# NOTE(review): the checkpoint name indicates a sentiment model, while the
# dataset class below is called ScamDataset — confirm what the predicted
# class ids are supposed to mean downstream.
checkpoint = 'siebert/sentiment-roberta-large-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

class ScamDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Any iterable of texts (list, tuple, pandas Series, ...). The
            values are copied into a plain list in iteration order, so items
            are always addressed by *position* 0..len-1, regardless of any
            index labels the container carried.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=True,
            padding='max_length', max_length=max_length)``.
        max_length: Length every item is truncated/padded to.

    Each item is whatever the tokenizer returns for a single text. Every item
    is padded to the same ``max_length`` so items can be stacked into a batch
    without a custom collate function.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        # A pandas Series indexes by label, not by position: with a filtered,
        # shuffled or otherwise non-0..n-1 index, ``texts[idx]`` raises
        # KeyError or returns the wrong row. A list is always positional.
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize and return the text at position ``idx``."""
        text = self.texts[idx]
        if not isinstance(text, str):
            # Empty CSV cells are read back as float NaN; tokenizers reject
            # non-string input, which would abort a long run part-way through.
            # Missing values become the empty string, anything else is
            # stringified.
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = '' if is_missing else str(text)
        return self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )

# Run on the GPU when one is available; the model and every batch must be on
# the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)
model.eval()  # Set the model to evaluation mode

# Create a Dataset and DataLoader.
# reset_index gives the Series a fresh 0..n-1 index, so the positional lookups
# issued by the DataLoader always hit the intended row.
texts = df['text'].reset_index(drop=True)
dataset = ScamDataset(texts, tokenizer)
# shuffle=False keeps predictions in the same order as the rows of df.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

predictions = []

with torch.no_grad():
    # Wrap the dataloader with tqdm to track progress
    for batch in tqdm(dataloader, desc="Classifying"):
        # Each dataset item is tokenized with return_tensors='pt', so it has
        # shape (1, max_length); the default collate stacks items to
        # (batch, 1, max_length). Drop the singleton axis.
        inputs = {key: val.squeeze(1) for key, val in batch.items()}

        # Every item is padded to max_length, so most of each batch is
        # padding. Cut the trailing columns that are padding for *all* rows
        # before the forward pass. This is only valid when padding is
        # appended on the right, hence the padding_side check.
        if 'attention_mask' in inputs and tokenizer.padding_side == 'right':
            longest = int(inputs['attention_mask'].sum(dim=1).max())
            inputs = {key: val[:, :longest] for key, val in inputs.items()}

        # Move batch data to the model's device
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        # Highest-scoring class id per row
        batch_predictions = torch.argmax(logits, dim=1).tolist()
        predictions.extend(batch_predictions)

print("Classification complete.")

2024-07-08 21:56:27.807516: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 21:56:27.807614: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 21:56:27.945423: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


cuda


Classifying:   0%|          | 14/65440 [00:27<35:04:54,  1.93s/it]


KeyboardInterrupt: 

In [None]:
# Store the predicted class ids as a new column, then write the labelled frame to disk
df['label'] = predictions
df.to_csv(path_or_buf='second_preprocess.csv', index=False)  # Please send me this file

## Optional: split the dataset in two if classification takes too long (you can skip this section)

In [5]:
# Reload the checkpoint CSV so the split below starts from the file as stored on disk
df = pd.read_csv(filepath_or_buffer='/kaggle/input/checkpoint1/checkpoint1.csv')

In [6]:
# Cut the frame into two parts by row position. The cut point is the row count
# halved and rounded down, so with an odd number of rows the second part gets
# the extra row.
split = len(df) // 2
print(split)
df1, df2 = df.iloc[:split], df.iloc[split:]

1047038


In [11]:
# Write each half to its own CSV file, without the index column
df1.to_csv(path_or_buf='senti_split1.csv', index=False)
df2.to_csv(path_or_buf='senti_split2.csv', index=False)
