In [None]:
# Fine-tune a model on English data
# You will need to add the attached datasets to your Google Drive

# Install necessary libraries
!pip install --upgrade datasets transformers huggingface_hub

# Import required packages
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import os
import numpy as np
import pandas as pd
from huggingface_hub import notebook_login
from google.colab import drive

# Authenticate with Hugging Face Hub
# (the training arguments further down set push_to_hub=True)
notebook_login()

# Disable W&B logging (optional)
os.environ["WANDB_MODE"] = "disabled"

# Mount Google Drive with error handling
# NOTE(review): a mount failure is only printed, not re-raised, so execution
# continues and the pd.read_csv calls below are the first place a missing
# Drive would surface as an error.
try:
    drive.mount('/content/drive')
except Exception as e:
    print(f"Error mounting Google Drive: {e}. Please check your permissions and try again.")

# Define paths for the datasets in Google Drive
data_folder = '/content/drive/My Drive/colab_data/'
train_path = os.path.join(data_folder, 'true_train.csv')
test_path = os.path.join(data_folder, 'true_test.csv')

# Load datasets using pandas
# Both files are read for a 'text' column (tokenized later) and a 'labels' column.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Verify that labels are correctly encoded
# (the classifier below is created with num_labels=2)
print("Train labels:", train_df['labels'].unique())
print("Test labels:", test_df['labels'].unique())

# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the pretrained tokenizer and model
# Tokenizer and model come from the same base checkpoint; the classification
# head is configured for two classes.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=2)

# Tokenize the datasets
def tokenize_function(examples):
    """Encode the 'text' entries of a batch with the module-level tokenizer.

    Truncation is enabled and padding uses the 'max_length' strategy, so every
    encoded example in the returned batch has the same length.

    Args:
        examples: Batch mapping that provides the raw strings under 'text'.

    Returns:
        The tokenizer output for the batch.
    """
    raw_texts = examples['text']
    encoded_batch = tokenizer(raw_texts, truncation=True, padding='max_length')
    return encoded_batch

# Apply tokenization and remove the 'text' column
# (batched=True hands tokenize_function a batch of rows at a time)
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Set format for PyTorch (include only necessary columns)
# Only these three columns are exposed as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',      # evaluate at the end of every epoch
    logging_strategy='epoch',   # log the training loss once per epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    push_to_hub=True,
    # The *string* "none" turns every logging integration off. The Python value
    # None does not: in transformers 4.x it falls back to the default of "all",
    # which is why TensorBoard event files were still being written and uploaded.
    report_to="none",
    hub_model_id='xabackus/sexism-detector-English-8832e-501',  # Replace with your Hugging Face Hub model ID
)

# Define compute metrics function
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and weighted-F1 scores.

    Args:
        eval_pred: Two-item iterable of (logits, labels); the predicted class
            is the arg-max over the last axis of the logits.

    Returns:
        dict: {'accuracy': ..., 'f1': ...} where 'f1' uses average='weighted'.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1_score(labels, predicted_classes, average='weighted'),
    }

# Initialize Trainer
# NOTE(review): the tokenizer is not handed to the Trainer, so presumably only
# the model weights/config end up in the Hub repo — confirm if the tokenizer
# should be uploaded alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # scored once per epoch (eval_strategy='epoch')
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save and push the model to Hugging Face Hub
trainer.push_to_hub()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train labels: [0 1]
Test labels: [0 1]


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5696,0.556322,0.7575,0.65298
2,0.5631,0.557803,0.7575,0.65298
3,0.5497,0.486334,0.7635,0.758797


events.out.tfevents.1732600148.a978c963c651.2451.3:   0%|          | 0.00/7.20k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/xabackus/sexism-detector-English-8832e-501/commit/0bfde27b03fcc936f0d96b71e94701142eead0cd', commit_message='End of training', commit_description='', oid='0bfde27b03fcc936f0d96b71e94701142eead0cd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/xabackus/sexism-detector-English-8832e-501', endpoint='https://huggingface.co', repo_type='model', repo_id='xabackus/sexism-detector-English-8832e-501'), pr_revision=None, pr_num=None)

In [None]:
# Fine-tune a model on Spanish data
# You will need to add the attached datasets to your Google Drive

# Install necessary libraries
!pip install --upgrade datasets transformers huggingface_hub

# Import required packages
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import os
import numpy as np
import pandas as pd
from huggingface_hub import notebook_login
from google.colab import drive

# Authenticate with Hugging Face Hub
# (the training arguments further down set push_to_hub=True)
notebook_login()

# Disable W&B logging (optional)
os.environ["WANDB_MODE"] = "disabled"

# Mount Google Drive with error handling
# NOTE(review): a mount failure is only printed, not re-raised, so execution
# continues and pd.read_csv below is the first place a missing Drive would
# surface as an error.
try:
    drive.mount('/content/drive')
except Exception as e:
    print(f"Error mounting Google Drive: {e}. Please check your permissions and try again.")

# Define paths for the dataset in Google Drive
data_folder = '/content/drive/My Drive/colab_data/'
train_path = os.path.join(data_folder, 'true_train_spanish.csv')

# Load the dataset using pandas
# A single CSV is read here; it is split into train/validation parts afterwards.
train_df = pd.read_csv(train_path)

# The first TRAIN_ROWS rows are used for training and every remaining row for
# validation, so the file must hold strictly MORE than TRAIN_ROWS rows: with
# exactly TRAIN_ROWS rows the validation split would be empty and per-epoch
# evaluation would have nothing to score.
TRAIN_ROWS = 1800
total_rows = len(train_df)
if total_rows <= TRAIN_ROWS:
    raise ValueError(
        f"The training dataset contains only {total_rows} rows; more than {TRAIN_ROWS} rows are required "
        f"so that the rows after the first {TRAIN_ROWS} form a non-empty validation set."
    )

# Split the DataFrame into training and validation sets
# NOTE(review): the split is positional (no shuffling); if the CSV is ordered in
# any meaningful way the two parts may not be comparable — confirm the file order.
train_df_train = train_df.iloc[:TRAIN_ROWS].reset_index(drop=True)  # First TRAIN_ROWS rows for training
train_df_val = train_df.iloc[TRAIN_ROWS:].reset_index(drop=True)    # Remaining rows for validation

# Verify the split
print(f"Training set size: {len(train_df_train)}")
print(f"Validation set size: {len(train_df_val)}")

# Verify that labels are correctly encoded
# (the classifier below is created with num_labels=2)
print("Train labels:", train_df_train['labels'].unique())
print("Validation labels:", train_df_val['labels'].unique())

# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df_train)
val_dataset = Dataset.from_pandas(train_df_val)

# Load the pretrained tokenizer and model
# Tokenizer and model come from the same base checkpoint; the classification
# head is configured for two classes.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=2)

# Tokenize the datasets
def tokenize_function(examples):
    """Encode the 'text' entries of a batch with the module-level tokenizer.

    Truncation is enabled and padding uses the 'max_length' strategy, so every
    encoded example in the returned batch has the same length.

    Args:
        examples: Batch mapping that provides the raw strings under 'text'.

    Returns:
        The tokenizer output for the batch.
    """
    batch_texts = examples['text']
    tokenized = tokenizer(batch_texts, truncation=True, padding='max_length')
    return tokenized

# Apply tokenization and remove the 'text' column
# (batched=True hands tokenize_function a batch of rows at a time)
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Set format for PyTorch (include only necessary columns)
# Only these three columns are exposed as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',      # Evaluation is done at the end of each epoch
    logging_strategy='epoch',   # Log the training loss once per epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.001,
    push_to_hub=True,
    # Disable reporting to other services. This must be the *string* "none":
    # the Python value None falls back to the transformers 4.x default of
    # "all", which leaves every installed logging integration switched on.
    report_to="none",
    hub_model_id='xabackus/sexism-detector-Spanish-8822e-50001',  # Replace with your Hugging Face Hub model ID
)

# Define compute metrics function
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and weighted-F1 scores.

    Args:
        eval_pred: Two-item iterable of (logits, labels); the predicted class
            is the arg-max over the last axis of the logits.

    Returns:
        dict: {'accuracy': ..., 'f1': ...} where 'f1' uses average='weighted'.
    """
    logits, labels = eval_pred
    class_ids = np.argmax(logits, axis=-1)
    scores = {}
    scores['accuracy'] = accuracy_score(labels, class_ids)
    scores['f1'] = f1_score(labels, class_ids, average='weighted')
    return scores

# Initialize Trainer
# NOTE(review): the tokenizer is not handed to the Trainer, so presumably only
# the model weights/config end up in the Hub repo — confirm if the tokenizer
# should be uploaded alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use validation dataset instead of test dataset
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save and push the model to Hugging Face Hub
trainer.push_to_hub()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training set size: 1800
Validation set size: 114
Train labels: [1 0]
Validation labels: [0 1]


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5005,0.590039,0.824561,0.745277
2,0.4722,0.474503,0.824561,0.745277


CommitInfo(commit_url='https://huggingface.co/xabackus/sexism-detector-Spanish-8822e-50001/commit/6b18f522cbcd0f551500fb2378088a4f0713d449', commit_message='End of training', commit_description='', oid='6b18f522cbcd0f551500fb2378088a4f0713d449', pr_url=None, repo_url=RepoUrl('https://huggingface.co/xabackus/sexism-detector-Spanish-8822e-50001', endpoint='https://huggingface.co', repo_type='model', repo_id='xabackus/sexism-detector-Spanish-8822e-50001'), pr_revision=None, pr_num=None)

In [None]:
# Run your classification model on any piece of text you desire

# Install necessary libraries
!pip install --upgrade transformers torch

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model from the Hugging Face Hub
model_name = 'xabackus/sexism-detector-English-8812e-5001'  # Replace with your model's repository and name

# The tokenizer is taken from the base checkpoint rather than from model_name.
print("Loading the tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('FacebookAI/xlm-roberta-large')
print("Tokenizer loaded successfully.")

print("\nLoading the model...")
model = AutoModelForSequenceClassification.from_pretrained(model_name)
print("Model loaded successfully.")

# Move model to GPU if available
# `device` is reused by classify_text so inputs land on the same device as the model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")
model.to(device)
print("Model moved to device successfully.")

# Define the label mapping (adjust according to your training)
# Keys are the class indices produced by arg-max over the model's logits.
label_mapping = {0: 'non-sexist', 1: 'sexist'}

# Function to classify text
def classify_text(text):
    """
    Predict whether a piece of text is 'sexist' or 'non-sexist'.

    Args:
        text (str): The text to classify.

    Returns:
        str: The label that ``label_mapping`` assigns to the highest-scoring
            class, or "Unknown" when that class index has no entry.
    """
    # Encode the text and place every tensor on the model's device
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass only; gradients are not needed for inference
    with torch.no_grad():
        logits = model(**encoded).logits

    # Highest-scoring class index, translated to its human-readable label
    class_idx = torch.argmax(logits, dim=1).item()
    return label_mapping.get(class_idx, "Unknown")

# Example inputs to run through the classifier
sample_texts = ["me gustan los perros", "woman bad"]

# Classify the samples one by one and print each text with its label
for text in sample_texts:
    prediction = classify_text(text)
    print(f"\nText: {text}")
    print(f"Predicted label: {prediction}")


Loading the tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizer loaded successfully.

Loading the model...


model.safetensors:  10%|#         | 231M/2.24G [00:00<?, ?B/s]

Model loaded successfully.

Using device: cuda
Model moved to device successfully.

Text: me gustan los perros
Predicted label: non-sexist

Text: woman bad
Predicted label: non-sexist


In [None]:
# Test your model against an evaluation dataset
# The dataset is not provided because EXIST is a closed-source dataset

# Mount Google Drive to access the CSV file
from google.colab import drive
drive.mount('/content/drive')

# Install necessary libraries
!pip install transformers

# Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Read the evaluation CSV from Google Drive
data_path = '/content/drive/My Drive/colab_data/exist2021_true.csv'
df = pd.read_csv(data_path)

# One frame per language, selected through the 'language' column and re-indexed from 0
is_english = df['language'] == 'en'
is_spanish = df['language'] == 'es'
df_en = df[is_english].reset_index(drop=True)
df_es = df[is_spanish].reset_index(drop=True)

# Define the Dataset class
class SexismDataset(Dataset):
    """Dataset that tokenizes all texts once and serves per-example tensors.

    Args:
        texts: The raw strings handed to ``tokenizer`` in a single call.
        labels: One label per text, indexable by position.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``;
            its result must offer ``.items()`` with indexable values.
        max_length: Truncation limit forwarded to the tokenizer (default 128).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # The whole corpus is encoded up front, so __getitem__ only has to index.
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields plus 'labels' for example ``idx`` as tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Load the model and tokenizer
# The fine-tuned weights come from model_name; the tokenizer is taken from the
# base checkpoint instead.
model_name = 'xabackus/sexism-detector-Spanish-8842e-310'
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to device
# `device` is also read by evaluate() to place each batch next to the model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Create datasets
def create_dataset(df, tokenizer):
    """Wrap the 'text' and 'labels' columns of ``df`` in a SexismDataset.

    Args:
        df: DataFrame providing 'text' and 'labels' columns.
        tokenizer: Tokenizer forwarded to SexismDataset.

    Returns:
        SexismDataset: Built from the two columns converted to plain lists.
    """
    return SexismDataset(df['text'].tolist(), df['labels'].tolist(), tokenizer)

# English: tokenized dataset and its loader (batches of 16)
dataset_en = create_dataset(df_en, tokenizer)
dataloader_en = DataLoader(dataset_en, batch_size=16)

# Spanish: tokenized dataset and its loader (batches of 16)
dataset_es = create_dataset(df_es, tokenizer)
dataloader_es = DataLoader(dataset_es, batch_size=16)

# Define evaluation function
def evaluate(model, dataloader):
    """
    Score ``model`` on every batch of ``dataloader`` and return summary metrics.

    Args:
        model: Classification model. It is called with the batch tensors plus
            ``labels`` and must return an object exposing ``loss`` and ``logits``.
        dataloader: Iterable of dict batches containing a 'labels' tensor; every
            other entry is forwarded to the model as a keyword argument.

    Returns:
        dict: 'loss' (average loss per example), 'accuracy', 'f1', 'precision',
        'recall' and 'auc'. 'auc' is a float, or an explanatory string when it
        cannot be computed because only one class is present in the labels.

    Raises:
        ValueError: If the dataloader yields no examples.
    """
    model.eval()
    total_loss = 0.0
    n_examples = 0
    preds = []
    pos_scores = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=labels)
            logits = outputs.logits

            # Weight each batch loss by its size so that a shorter final batch
            # does not count as much as a full one (assumes outputs.loss is the
            # mean over the batch — TODO confirm for custom loss settings).
            batch_size = labels.size(0)
            total_loss += outputs.loss.item() * batch_size
            n_examples += batch_size

            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Probability of class 1, kept as a continuous score for ROC-AUC
            # (assumes a two-class head, as the binary metrics below also do).
            pos_scores.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    if n_examples == 0:
        raise ValueError("evaluate() received an empty dataloader; there is nothing to score.")

    avg_loss = total_loss / n_examples
    accuracy = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds)
    precision = precision_score(true_labels, preds)
    recall = recall_score(true_labels, preds)
    try:
        # ROC-AUC is a ranking metric: it needs scores, not thresholded 0/1
        # predictions, otherwise it collapses to a single operating point.
        auc = roc_auc_score(true_labels, pos_scores)
    except ValueError:
        auc = 'Undefined (only one class present in y_true)'

    metrics = {
        'loss': avg_loss,
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    }
    return metrics

def _print_metrics(heading, metrics):
    """Print a heading followed by one 'name: value' line per metric."""
    print(heading)
    for key, value in metrics.items():
        print(f'{key}: {value}')

# Score the English split, then report it
metrics_en = evaluate(model, dataloader_en)
_print_metrics('English Dataset Metrics:', metrics_en)

# Score the Spanish split, then report it
metrics_es = evaluate(model, dataloader_es)
_print_metrics('\nSpanish Dataset Metrics:', metrics_es)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


English Dataset Metrics:
loss: 0.9324476803126542
accuracy: 0.47554347826086957
f1: 0.0
precision: 0.0
recall: 0.0
auc: 0.5

Spanish Dataset Metrics:
loss: 0.9263935852933813
accuracy: 0.48009259259259257
f1: 0.0
precision: 0.0
recall: 0.0
auc: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
