In [None]:
import pandas as pd
import random

# Downsample the processed Jigsaw training CSV: remove half of the rows at
# random and write the remainder to a smaller file for the training cell.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
csv_file = '/Users/yb/Coding Projects Fixed/Toxicfiltering/jigsaw-toxic-comment-train-processed-seqlen128 copy.csv'
df = pd.read_csv(csv_file)

# Integer division: for an odd row count, one more row is kept than removed.
num_rows_to_delete = len(df) // 2

# Seeded sampling so that "Restart & Run All" always removes the same rows.
# Selecting index *labels* (rather than positions) keeps this correct even if
# the frame's index is not the default 0..n-1 range.
rows_to_delete = df.sample(n=num_rows_to_delete, random_state=42).index

# drop() preserves the original order of the surviving rows.
df_dropped = df.drop(index=rows_to_delete)

output_file = 'jigsaw-toxic.csv'
df_dropped.to_csv(output_file, index=False)

print(f"50% of the rows have been deleted and saved to {output_file}")


In [None]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the downsampled CSV and keep only the text and its binary label column.
df = pd.read_csv('jigsaw-toxic.csv').loc[:, ['comment_text', 'toxic']]

# 80/20 train/validation split with a fixed seed.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap both pandas frames as Hugging Face datasets under named splits.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df),
})
train_dataset = dataset['train']
val_dataset = dataset['validation']

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(examples):
    """Tokenize a batch of comments.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; its
            ``'comment_text'`` entry holds the texts to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length but *not* padded.
    """
    # No padding here: padding='max_length' would inflate every example to the
    # tokenizer's maximum length. Because the Trainer is given the tokenizer,
    # it pads each batch to that batch's longest sequence at collation time.
    return tokenizer(examples['comment_text'], truncation=True)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# The Trainer expects the target column to be called "labels".
tokenized_datasets = tokenized_datasets.rename_column("toxic", "labels")

# Pretrained DistilBERT with a fresh 2-class classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Prefer Apple's MPS backend when present, but fall back to CUDA and then CPU
# so the notebook also runs on machines without MPS instead of raising.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Fine-tuning hyperparameters; checkpoints and logs are written under ./results.
# NOTE(review): newer transformers releases rename `evaluation_strategy` and the
# Trainer's `tokenizer` argument; confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    evaluation_strategy="epoch",
)

# Bind model, data splits and tokenizer into a single training driver.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

trainer.train()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can later be reloaded with from_pretrained().
save_dir = "toxic-filter-distilbert2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Directory written by the save cell (model weights + tokenizer files).
model_dir = "toxic-filter-distilbert2"

model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Prefer Apple's MPS backend when present, but fall back to CUDA and then CPU
# so inference also works on machines without MPS instead of raising.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def predict(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: A single string, or a list of strings to score as one batch.

    Returns:
        A ``(predicted_class, probabilities)`` tuple. ``probabilities`` is the
        softmax over the model's logits, one row per input. ``predicted_class``
        is the arg-max class index: an ``int`` when there is exactly one input
        row, otherwise a list of ints (one per row).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Move the tensors the model consumes onto the same device as the model.
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    best = torch.argmax(probabilities, dim=-1)
    # .item() only works on one-element tensors; for a batch, return a list
    # instead of raising. Single-row calls keep returning a plain int.
    predicted_class = best.item() if best.numel() == 1 else best.tolist()

    return predicted_class, probabilities

# Smoke-test the classifier; `text` is left empty here, so fill in a comment to score.
text = ""
predicted_class, probabilities = predict(text)
print(
    f"Predicted class: {predicted_class}",
    f"Probabilities: {probabilities}",
    sep="\n",
)