In [1]:
pip install transformers torch

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

In [3]:
# Load DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)  # Adjust num_labels based on your task


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import Dataset

# Sample data (text and labels)
data = {
    "text": ["I love this!", "This is so bad", "Amazing product", "I hate this", "So good!", "Terrible experience"],
    "label": [1, 0, 1, 0, 1, 0]  # Labels: 1 = positive, 0 = negative
}

# Convert data to Hugging Face Dataset format
dataset = Dataset.from_dict(data)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Apply the tokenizer to the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [7]:
# Split dataset into train and validation sets
train_size = int(0.8 * len(dataset))  # 80% for training
train_dataset = tokenized_dataset.shuffle(seed=42).select(range(train_size))  # Select first 80% for training
eval_dataset = tokenized_dataset.shuffle(seed=42).select(range(train_size, len(dataset)))  # Select last 20% for validation


In [10]:
training_args = TrainingArguments(
    output_dir='./results',           # Where to save results
    evaluation_strategy="epoch",      # Evaluate every epoch
    save_strategy="epoch",            # Save model every epoch
    num_train_epochs=5,               
    per_device_train_batch_size=4,    # Batch size for training (smaller for faster training)
    per_device_eval_batch_size=4,     # Batch size for evaluation
    logging_dir='./logs',             # Log directory
)




In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Start training
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,0.692884
2,No log,0.694373
3,No log,0.697361
4,No log,0.700968
5,No log,0.701742


TrainOutput(global_step=5, training_loss=0.5916055679321289, metrics={'train_runtime': 19.1817, 'train_samples_per_second': 1.043, 'train_steps_per_second': 0.261, 'total_flos': 2649347973120.0, 'train_loss': 0.5916055679321289, 'epoch': 5.0})

In [12]:
# Evaluate the model on the evaluation dataset
trainer.evaluate()


{'eval_loss': 0.7017420530319214,
 'eval_runtime': 0.365,
 'eval_samples_per_second': 5.479,
 'eval_steps_per_second': 2.74,
 'epoch': 5.0}

In [13]:
# Save the model
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')

('./final_model\\tokenizer_config.json',
 './final_model\\special_tokens_map.json',
 './final_model\\vocab.txt',
 './final_model\\added_tokens.json')

In [17]:
from sklearn.metrics import accuracy_score
import torch

# Define the evaluation metrics (e.g., accuracy)
def compute_metrics(pred):
    logits, labels = pred
    logits = torch.tensor(logits)  # Convert logits to a PyTorch tensor
    predictions = torch.argmax(logits, dim=-1)  # Use argmax to get predicted class
    return {'accuracy': accuracy_score(labels, predictions)}


In [18]:
trainer = Trainer(
    model=model,                         # Your trained model
    args=training_args,                  # Training arguments
    eval_dataset=eval_dataset,           # Validation dataset
    compute_metrics=compute_metrics      # Metrics for evaluation
)

# Evaluate the model
eval_results = trainer.evaluate()

# Print evaluation results
print("Validation Results:", eval_results)


Validation Results: {'eval_loss': 0.7017420530319214, 'eval_model_preparation_time': 0.0011, 'eval_accuracy': 0.0, 'eval_runtime': 0.3226, 'eval_samples_per_second': 6.199, 'eval_steps_per_second': 3.1}


In [19]:
# Example of making predictions on new data
new_data = ["Example sentence to classify.", "Another example sentence."]
tokenized_input = tokenizer(new_data, padding=True, truncation=True, return_tensors="pt")

# Get predictions from the model
with torch.no_grad():
    model.eval()  # Set the model to evaluation mode
    outputs = model(**tokenized_input)
    predictions = outputs.logits.argmax(axis=-1)

print("Predictions:", predictions)


Predictions: tensor([0, 0])
