In [1]:
from transformers.integrations import accelerate
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import (
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertTokenizer
)

# Read data
data = []
with open("farm-ads") as f:
    for line in f:
        label, *words = line.strip().split()
        text = " ".join(words)
        data.append((int(label), text))

df = pd.DataFrame(data, columns=["label", "text"])
df['label'] = df['label'].apply(lambda x: 1 if x == 1 else 0)

# Split into train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# Dataset class
class FarmAdsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = FarmAdsDataset(train_encodings, train_labels)
test_dataset = FarmAdsDataset(test_encodings, test_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",  # Use "steps" for more frequent evaluation
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return classification_report(labels, preds, output_dict=True)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate
results = trainer.evaluate()
print(results)

# Make predictions
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

print(classification_report(test_labels, pred_labels))

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,0,1,Accuracy,Macro avg,Weighted avg
1,0.2277,0.314929,"{'precision': 0.8622448979591837, 'recall': 0.8871391076115486, 'f1-score': 0.8745148771021992, 'support': 381.0}","{'precision': 0.9016018306636155, 'recall': 0.8794642857142857, 'f1-score': 0.8903954802259887, 'support': 448.0}",0.882992,"{'precision': 0.8819233643113996, 'recall': 0.8833016966629171, 'f1-score': 0.882455178664094, 'support': 829.0}","{'precision': 0.8835137831842566, 'recall': 0.8829915560916767, 'f1-score': 0.8830969159435234, 'support': 829.0}"
2,0.1773,0.279083,"{'precision': 0.9430199430199431, 'recall': 0.868766404199475, 'f1-score': 0.9043715846994536, 'support': 381.0}","{'precision': 0.895397489539749, 'recall': 0.9553571428571429, 'f1-score': 0.9244060475161987, 'support': 448.0}",0.915561,"{'precision': 0.919208716279846, 'recall': 0.912061773528309, 'f1-score': 0.9143888161078262, 'support': 829.0}","{'precision': 0.9172842866156886, 'recall': 0.9155609167671894, 'f1-score': 0.9151984114086236, 'support': 829.0}"
3,0.1116,0.350175,"{'precision': 0.9023746701846965, 'recall': 0.8976377952755905, 'f1-score': 0.9, 'support': 381.0}","{'precision': 0.9133333333333333, 'recall': 0.9174107142857143, 'f1-score': 0.9153674832962138, 'support': 448.0}",0.908323,"{'precision': 0.9078540017590149, 'recall': 0.9075242547806524, 'f1-score': 0.907683741648107, 'support': 829.0}","{'precision': 0.9082968427909562, 'recall': 0.9083232810615199, 'f1-score': 0.9083047436872181, 'support': 829.0}"


{'eval_loss': 0.2790827453136444, 'eval_0': {'precision': 0.9430199430199431, 'recall': 0.868766404199475, 'f1-score': 0.9043715846994536, 'support': 381.0}, 'eval_1': {'precision': 0.895397489539749, 'recall': 0.9553571428571429, 'f1-score': 0.9244060475161987, 'support': 448.0}, 'eval_accuracy': 0.9155609167671894, 'eval_macro avg': {'precision': 0.919208716279846, 'recall': 0.912061773528309, 'f1-score': 0.9143888161078262, 'support': 829.0}, 'eval_weighted avg': {'precision': 0.9172842866156886, 'recall': 0.9155609167671894, 'f1-score': 0.9151984114086236, 'support': 829.0}, 'eval_runtime': 5.8348, 'eval_samples_per_second': 142.079, 'eval_steps_per_second': 2.228, 'epoch': 3.0}
              precision    recall  f1-score   support

           0       0.94      0.87      0.90       381
           1       0.90      0.96      0.92       448

    accuracy                           0.92       829
   macro avg       0.92      0.91      0.91       829
weighted avg       0.92      0.92   

In [10]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Initialize device (automatically detects MPS for Apple Silicon)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def classify_ad(text):
    # Tokenize and move to device
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)

    with torch.no_grad():
        output = model(**tokens)

    pred = torch.argmax(output.logits, dim=1).item()
    return "Accepted" if pred == 1 else "Rejected"

# Example usage
print(classify_ad("sell cows cheap"))


Using device: mps


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Rejected
