In [None]:
# Install necessary libraries
# !pip install pandas transformers torch 

In [11]:
import pandas as pd
from transformers import AutoTokenizer
import torch

# Load HateXplain model

In [6]:
import sys

# Make the project root importable so the local `models` module can be found.
# NOTE(review): the path is relative to the kernel's working directory —
# presumably the notebook is launched from its own folder; confirm.
file_directory = '../../'
if file_directory not in sys.path:  # keep the cell idempotent on re-run
    sys.path.append(file_directory)

# Import only the class this notebook uses rather than `from models import *`,
# so the origin of the name is explicit and no notebook names get shadowed.
from models import Model_Rational_Label

In [9]:
# Load the HateXplain tokenizer and model from Hugging Face.
# The checkpoint id lives in a single constant so both loads always agree.
HATEXPLAIN_CHECKPOINT = "Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two"

tokenizer = AutoTokenizer.from_pretrained(HATEXPLAIN_CHECKPOINT)
model = Model_Rational_Label.from_pretrained(
    HATEXPLAIN_CHECKPOINT,
    attn_implementation="eager",
)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the chosen device and put it in evaluation mode, since the
# notebook only runs inference. The trailing semicolon stops the notebook from
# echoing the full module repr as the cell output.
model.to(device).eval();

# Function to get toxicity scores

In [None]:
# Define a function to get toxicity scores
def get_toxicity_score(text):
    """Return the probability the model assigns to class 1 for ``text``.

    The text is tokenized with truncation enabled, moved to the module-level
    ``device``, and run through the module-level ``model`` without gradient
    tracking. The logits are turned into probabilities with a softmax over
    the last axis.

    Args:
        text: The comment to score.

    Returns:
        float: Probability of class index 1 for the first sequence in the
            batch. Class 1 is assumed to be "toxic" and class 0 "non-toxic" —
            TODO confirm against the checkpoint's label mapping.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        # NOTE(review): assumes the model output exposes `.logits`; confirm
        # against the forward() of Model_Rational_Label in the local module.
        class_logits = model(**encoded).logits

    class_probs = torch.softmax(class_logits, dim=-1)
    return class_probs[0, 1].item()

# Function to process data in batches
We save only the `index` of each comment and its corresponding `toxicity_score`.

In [None]:
# Batch processing function
def process_in_batches(input_csv, output_csv, chunk_size=1000, skip_rows=0):
    """Score a CSV of comments chunk by chunk and stream the results to disk.

    Each chunk's ``text`` column is scored with ``get_toxicity_score`` and only
    the ``index`` and ``toxicity_score`` columns are written to ``output_csv``.

    Args:
        input_csv: CSV to read; must contain ``index`` and ``text`` columns.
        output_csv: CSV to write. With ``skip_rows == 0`` the file is
            overwritten; otherwise results are appended to it.
        chunk_size: Number of rows read and scored per batch.
        skip_rows: Number of data rows (after the header) to skip, e.g. to
            resume an interrupted run.

    Raises:
        KeyError: If a chunk lacks the ``index`` or ``text`` column. A missing
            ``index`` column is detected before any scoring of that chunk
            starts.
    """
    import os  # local import: only needed for the header check below

    # Line 0 of the file is the header, so data rows are lines 1..skip_rows.
    rows_to_skip = range(1, skip_rows + 1)

    with pd.read_csv(input_csv, chunksize=chunk_size, skiprows=rows_to_skip) as reader:
        for chunk_idx, chunk in enumerate(reader):
            batch_label = chunk_idx + 1
            print(f'Processing batch {batch_label}...')

            # Build the two-column result without mutating the chunk itself.
            scores_df = chunk[['index']].assign(
                toxicity_score=chunk['text'].apply(get_toxicity_score)
            )

            if chunk_idx == 0 and skip_rows == 0:
                # Fresh run: start the output file over, header included.
                scores_df.to_csv(output_csv, index=False, mode='w')
            else:
                # Appending. When resuming into an output file that does not
                # exist yet (or is empty), still emit the header so the result
                # is a well-formed CSV. Non-path targets keep the old behaviour.
                is_path = isinstance(output_csv, (str, os.PathLike))
                write_header = is_path and (
                    not os.path.exists(output_csv)
                    or os.path.getsize(output_csv) == 0
                )
                scores_df.to_csv(output_csv, index=False, mode='a', header=write_header)

            print(f'Batch {batch_label} processed and saved.')

# Call the batch processing function

In [None]:
# Input CSV to score and output CSV for the results; both paths are relative
# to the kernel's working directory.
input_file = '../../combined_data.csv'
output_file = '../../hateXplain_scores.csv'
# skip_rows=0 starts from the first data row, so the first batch (re)creates
# the output file with a header; pass a larger value to resume a partial run.
process_in_batches(input_file, output_file, chunk_size=500, skip_rows=0)