In [1]:
import torch
import pandas as pd
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import json
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import os

In [2]:
# Check if a GPU is available and set PyTorch to use the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
def save_checkpoint(predictions, current_index, checkpoint_path):
    with open(checkpoint_path, 'w') as file:
        json.dump({'last_index': current_index, 'predictions': predictions}, file)

In [4]:
def load_checkpoint(checkpoint_path):
    if os.path.exists(checkpoint_path):
        with open(checkpoint_path, 'r') as file:
            return json.load(file)
    return None

In [5]:
classifier = transformers.pipeline("zero-shot-classification", model="cross-encoder/nli-deberta-base")

text = "suddenlywestan country mentally ill terrorist whoever say different enabler ignorant blind endlesscountless war aggression death rate american terrorist act fuck american scum"
labels = ['No Hate Speech', 'Mild Hate Speech', 'Severe Hate Speech']

prediction = classifier(text, labels)

print(prediction)
print(prediction['sequence'])
print(prediction['labels'])
print(prediction['scores'])

{'sequence': 'suddenlywestan country mentally ill terrorist whoever say different enabler ignorant blind endlesscountless war aggression death rate american terrorist act fuck american scum', 'labels': ['Severe Hate Speech', 'Mild Hate Speech', 'No Hate Speech'], 'scores': [0.6529615521430969, 0.18778380751609802, 0.15925461053848267]}
suddenlywestan country mentally ill terrorist whoever say different enabler ignorant blind endlesscountless war aggression death rate american terrorist act fuck american scum
['Severe Hate Speech', 'Mild Hate Speech', 'No Hate Speech']
[0.6529615521430969, 0.18778380751609802, 0.15925461053848267]


In [6]:
# Load the CSV file
df = pd.read_csv('ucberkeley_measuring_hate_speech_dataset_testing.csv')

In [7]:
checkpoint_path = 'deberta_sequential_checkpoint.json'
checkpoint_data = load_checkpoint(checkpoint_path)

In [8]:
# Define your labels
labels = ['No Hate Speech', 'Mild Hate Speech', 'Severe Hate Speech']
label_mapping = {'No Hate Speech': 0, 'Maybe Hate Speech': 1, 'Hate Speech': 2}

In [9]:
# Function to classify text and convert to numerical label
def classify_and_convert(text):
    prediction = classifier(text, labels)
    # Get the label with the highest score
    predicted_label = prediction['labels'][0]
    # Convert to numerical label
    return label_mapping[predicted_label]

In [10]:
# Function to process and save in batches
def process_and_save_batch(df, start_index, batch_size, output_file_path):
    end_index = min(start_index + batch_size, len(df))
    df_slice = df[start_index:end_index]
    df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)
    df_slice.to_csv(output_file_path.format(start_index), index=False)

In [11]:
# Set batch size and output file path pattern
batch_size = 10000
output_file_path = 'deberta_sequential_output_csv_file_{}.csv'

In [12]:
for start_index in range(0, len(df), batch_size):
    process_and_save_batch(df, start_index, batch_size, output_file_path)
    print(f"Checkpoint saved for batch starting at index {start_index}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 10000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 20000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 30000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 40000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 50000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 60000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 70000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 80000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 90000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 100000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 110000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


Checkpoint saved for batch starting at index 120000
Checkpoint saved for batch starting at index 130000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice['predicted_label'] = df_slice['text'].apply(classify_and_convert)


In [13]:
combined_df = pd.concat([pd.read_csv(output_file_path.format(i)) for i in range(0, len(df), batch_size)], ignore_index=True)
combined_df.to_csv('final_combined_debert_sequential_output.csv', index=False)