In [1]:
import torch
# Ask PyTorch whether a CUDA device is usable in this environment; the value
# is displayed as the cell's output.
torch.cuda.is_available()

False

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Read the review CSV and encode the textual sentiment as an integer class id
# (negative -> 0, positive -> 1), then wrap the frame in a `datasets.Dataset`.
df = pd.read_csv("data/IMDB_Dataset.csv").assign(
    sentiment=lambda reviews: reviews['sentiment'].map({'positive': 1, 'negative': 0})
)
dataset = Dataset.from_pandas(df)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load pre-trained distilbert and tokenizer
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
# Model and tokenizer are loaded from the same checkpoint name; the
# classification head is sized for two labels.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch of reviews for sequence classification.

    Args:
        examples: Batch mapping whose "review" entry holds the raw review
            texts (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer's encoding for the batch, truncated to at most 512
        tokens per review and left unpadded.
    """
    # Padding is deliberately NOT applied here. Padding every review to 512
    # tokens at map time makes each training step process full-length
    # sequences; the DataCollatorWithPadding handed to the Trainer pads each
    # batch to its own longest sequence instead.
    # `return_tensors` is also omitted: unpadded sequences are ragged, and
    # `Dataset.map` stores plain lists anyway.
    return tokenizer(examples["review"], truncation=True, max_length=512)

# Run tokenize_function over the whole dataset; batched=True hands it many
# rows per call instead of one row at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 50000/50000 [03:28<00:00, 239.26 examples/s]


In [4]:
# split data 80/20
# Shuffle once and slice that single permutation for both splits, so train and
# test are disjoint by construction rather than because two separate shuffles
# happen to share a seed.
shuffled_dataset = tokenized_dataset.shuffle(seed=42)
split_index = int(0.8 * len(shuffled_dataset))
# `select` accepts a range directly; no need to build Python index lists.
train_dataset = shuffled_dataset.select(range(split_index))
test_dataset = shuffled_dataset.select(range(split_index, len(shuffled_dataset)))
# rename sentiment column to label (the column name the Trainer pipeline
# uses for targets)
train_dataset = train_dataset.rename_column("sentiment", "label")
test_dataset = test_dataset.rename_column("sentiment", "label")

In [5]:
# import a data collator
from transformers import DataCollatorWithPadding
# Collator that uses the tokenizer to pad the examples of each batch and
# returns them as PyTorch tensors ('pt'); it is passed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

In [6]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# metrics to be computed
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: Pair that unpacks into (predictions, labels). `predictions` must
            support `.argmax(axis=-1)`; the arg-max over the last axis is
            taken as the predicted class.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1';
        precision/recall/F1 are computed with average='binary'.
    """
    from sklearn.metrics import precision_recall_fscore_support

    scores, y_true = p
    y_pred = scores.argmax(axis=-1)

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}

    # The fourth returned element (support) is not reported.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    metrics['precision'] = prec
    metrics['recall'] = rec
    metrics['f1'] = f_score
    return metrics

In [7]:
# training model on data
# Fine-tune for two epochs; a checkpoint is saved and the eval set is scored
# at the end of every epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    evaluation_strategy="epoch",
)
# NOTE(review): newer transformers releases deprecate the `tokenizer=`
# argument of Trainer in favour of `processing_class=` — confirm before
# upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model('./fine_tuned_distilbert')
# `evaluate` must be CALLED: without the parentheses `results` is just the
# bound method and no evaluation is run.
results = trainer.evaluate()
print(results)

  trainer = Trainer(
  1%|          | 53/5000 [22:34<34:31:28, 25.12s/it]

KeyboardInterrupt: 