In [19]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import (
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertTokenizer
)

# Read data
data = []
with open("farm-ads") as f:
    for line in f:
        label, *words = line.strip().split()
        text = " ".join(words)
        data.append((int(label), text))

df = pd.DataFrame(data, columns=["label", "text"])
df['label'] = df['label'].apply(lambda x: 1 if x == 1 else 0)

# Split into train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# Dataset class
class FarmAdsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = FarmAdsDataset(train_encodings, train_labels)
test_dataset = FarmAdsDataset(test_encodings, test_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",  # Use "steps" for more frequent evaluation
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return classification_report(labels, preds, output_dict=True)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate
results = trainer.evaluate()
print(results)

# Make predictions
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(test_labels, pred_labels))

model.save_pretrained("my-finetuned-bert")
tokenizer.save_pretrained("my-finetuned-bert")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,0,1,Accuracy,Macro avg,Weighted avg
1,0.2278,0.287744,"{'precision': 0.9058171745152355, 'recall': 0.8582677165354331, 'f1-score': 0.8814016172506739, 'support': 381.0}","{'precision': 0.8846153846153846, 'recall': 0.9241071428571429, 'f1-score': 0.9039301310043668, 'support': 448.0}",0.893848,"{'precision': 0.89521627956531, 'recall': 0.8911874296962881, 'f1-score': 0.8926658741275204, 'support': 829.0}","{'precision': 0.8943595124221918, 'recall': 0.893848009650181, 'f1-score': 0.8935762543576152, 'support': 829.0}"
2,0.1711,0.28497,"{'precision': 0.9175824175824175, 'recall': 0.8766404199475065, 'f1-score': 0.8966442953020134, 'support': 381.0}","{'precision': 0.8989247311827957, 'recall': 0.9330357142857143, 'f1-score': 0.9156626506024096, 'support': 448.0}",0.907117,"{'precision': 0.9082535743826066, 'recall': 0.9048380671166104, 'f1-score': 0.9061534729522115, 'support': 829.0}","{'precision': 0.9074996147995097, 'recall': 0.9071170084439083, 'f1-score': 0.9069220072134457, 'support': 829.0}"
3,0.0714,0.367036,"{'precision': 0.8968253968253969, 'recall': 0.889763779527559, 'f1-score': 0.8932806324110671, 'support': 381.0}","{'precision': 0.9068736141906873, 'recall': 0.9129464285714286, 'f1-score': 0.9098998887652948, 'support': 448.0}",0.902292,"{'precision': 0.9018495055080421, 'recall': 0.9013551040494938, 'f1-score': 0.9015902605881809, 'support': 829.0}","{'precision': 0.9022555553050713, 'recall': 0.902291917973462, 'f1-score': 0.9022618469426642, 'support': 829.0}"


{'eval_loss': 0.2849701941013336, 'eval_0': {'precision': 0.9175824175824175, 'recall': 0.8766404199475065, 'f1-score': 0.8966442953020134, 'support': 381.0}, 'eval_1': {'precision': 0.8989247311827957, 'recall': 0.9330357142857143, 'f1-score': 0.9156626506024096, 'support': 448.0}, 'eval_accuracy': 0.9071170084439083, 'eval_macro avg': {'precision': 0.9082535743826066, 'recall': 0.9048380671166104, 'f1-score': 0.9061534729522115, 'support': 829.0}, 'eval_weighted avg': {'precision': 0.9074996147995097, 'recall': 0.9071170084439083, 'f1-score': 0.9069220072134457, 'support': 829.0}, 'eval_runtime': 5.7876, 'eval_samples_per_second': 143.238, 'eval_steps_per_second': 2.246, 'epoch': 3.0}
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       381
           1       0.90      0.93      0.92       448

    accuracy                           0.91       829
   macro avg       0.91      0.90      0.91       829
weighted avg       0.91      0.9

('my-finetuned-bert/tokenizer_config.json',
 'my-finetuned-bert/special_tokens_map.json',
 'my-finetuned-bert/vocab.txt',
 'my-finetuned-bert/added_tokens.json')

In [16]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Initialize device (automatically detects MPS for Apple Silicon)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def classify_ad(text):
    # Tokenize and move to device
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)

    with torch.no_grad():
        output = model(**tokens)

    pred = torch.argmax(output.logits, dim=1).item()
    return "Accepted" if pred == 1 else "Rejected"

# Example usage
print(classify_ad("giving head behind a dumpster"))


Using device: mps


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accepted
