In [10]:
# Install required libraries
!pip install kagglehub datasets
# Make folder for logs
!mkdir /kaggle/working/logs

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import kagglehub


# TASK 1

# Fetch the IMDB movie-review dataset through kagglehub and read its CSV into a DataFrame.
# (For a quick smoke run, the frame can be truncated with .head(n) right after loading.)
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
csv_file = f"{path}/IMDB Dataset.csv"
df = pd.read_csv(csv_file)

In [4]:
# Sanity check: number of rows currently held in df
print(len(df))

100


In [22]:
# TASK 2

# Encode the textual sentiment as an integer class id (positive -> 1, negative -> 0).
mapping = {"positive": 1, "negative": 0}
df["label"] = df["sentiment"].map(mapping)

# Series.map yields NaN for every value that is missing from `mapping`, which
# would silently turn the label column into floats. Fail fast here instead of
# letting malformed labels reach training.
unmapped = df.loc[df["label"].isna(), "sentiment"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected sentiment values: {unmapped.tolist()}")
df["label"] = df["label"].astype(int)

# The textual column is no longer needed once the numeric label exists.
df = df.drop(columns=["sentiment"])

In [23]:
from sklearn.model_selection import train_test_split

# Shuffle the rows with a fixed seed, then create the training, validation and
# testing sets: 30% is held out first and that hold-out is halved into
# validation and test (roughly 70% / 15% / 15% overall).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)


In [24]:
# TASK 3

# Load the pretrained tokenizer and the sequence-classification model.
# Both are created from the same checkpoint name so they stay in sync.
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                 review  label
9069  Everything is idyllic in Suburbia when the lit...      1
2603  Pretty twisted Horror film, that has a few goo...      0
7738  After watching the trailer I was surprised thi...      1
1579  Okay... for the most part, and all its cheesin...      0
5058  This movie has got to be the biggest disappoin...      0
                                                 review  label
8127  I don't know, maybe I just wasn't in the mood ...      0
4321  ...the opportunity it gave me to look at Irela...      1
450   I find it sad that just because Edward Norton ...      1
5798  Ex-reporter Jacob Asch (Eric Roberts) is hired...      1
5181  What kind of a documentary about a musician fa...      0
                                                 review  label
2697  I'm not sure why the producers needed to trade...      0
6871  A pity, nobody seems to know this little thril...      1
3487  If you are an insomniac and you cant get anyth...

In [25]:
# Tokenize the data
def tokenizer_function(data, max_length=256, text_column="review"):
    """Tokenize the texts of one DataFrame split into fixed-length PyTorch tensors.

    Args:
        data: DataFrame that holds the raw texts in ``text_column``.
        max_length: Length every sequence is padded or truncated to
            (defaults to 256, the value this notebook uses).
        text_column: Name of the column containing the texts
            (defaults to ``"review"``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch; the
        notebook later reads its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer(
        data[text_column].tolist(),
        padding="max_length",   # pad every example up to max_length
        truncation=True,        # cut off texts longer than max_length
        return_tensors="pt",
        max_length=max_length,
    )

# Run the tokenizer over the three splits, in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    tokenizer_function(split) for split in (train_df, val_df, test_df)
)

In [26]:
from datasets import Dataset

# Wrap each tokenized split together with its labels in a Dataset so that it
# can be passed to the trainer in the appropriate form.
def _to_dataset(encodings, frame):
    """Combine tokenizer output with the labels of the matching DataFrame."""
    columns = {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': frame['label'].tolist(),
    }
    return Dataset.from_dict(columns)


train_dataset = _to_dataset(train_encodings, train_df)
val_dataset = _to_dataset(val_encodings, val_df)
test_dataset = _to_dataset(test_encodings, test_df)

In [27]:
# TASK 4
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch

def compute_metrics(p):
    """Score one evaluation pass.

    ``p`` unpacks into ``(predictions, labels)``. The predictions are logits;
    the arg-max over their last axis gives the predicted class per example.
    Returns a dict with accuracy, precision, recall and F1.
    """
    logits, labels = p
    predicted = torch.tensor(logits).argmax(dim=-1)  # logits -> predicted labels
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {name: scorer(labels, predicted) for name, scorer in scorers.items()}

# Training configuration: 2 epochs, evaluating and checkpointing after each one.
training_args = TrainingArguments(
    output_dir='/kaggle/working/logs',
    num_train_epochs=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    # Without an explicit metric, the "best" checkpoint is the one with the
    # lowest validation loss. Select on the F1 score returned by
    # compute_metrics instead, so the model restored at the end of training is
    # the one with the best classification quality.
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    logging_steps=5,
    logging_first_step=True,
    disable_tqdm=False,
)


# The Trainer fine-tunes `model` on the training split and evaluates on the
# validation split, scoring each evaluation pass with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [30]:
# Fine-tune the model with the Trainer configured above.
trainer.train()
# Validation metrics logged after epoch 2: Acc 0.92, Prec 0.91, Recall 0.93, F1 0.92

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2394,0.220915,0.9124,0.928709,0.895085,0.911587
2,0.1097,0.280009,0.9212,0.913923,0.931554,0.922654


TrainOutput(global_step=4376, training_loss=0.20976995940265286, metrics={'train_runtime': 1735.547, 'train_samples_per_second': 40.333, 'train_steps_per_second': 2.521, 'total_flos': 4636358952960000.0, 'train_loss': 0.20976995940265286, 'epoch': 2.0})

In [31]:
# Task 5
# Persist the model weights/config and the tokenizer files side by side in one
# local directory.
save_dir = '/content/my_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/content/my_model/tokenizer_config.json',
 '/content/my_model/special_tokens_map.json',
 '/content/my_model/vocab.txt',
 '/content/my_model/added_tokens.json')

In [32]:
import shutil

# Pack the saved model directory into /content/my_model.zip; the archive
# contains the top-level folder `my_model`.
shutil.make_archive(
    base_name='/content/my_model',
    format='zip',
    root_dir='/content',
    base_dir='my_model',
)

'/content/my_model.zip'

In [33]:
from google.colab import files

# Download the zip file locally.
# NOTE(review): relies on google.colab, so this presumably only works inside a Colab runtime — confirm.
files.download('/content/my_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
!pip install huggingface_hub



In [35]:
from huggingface_hub import login

# Log in to your Hugging Face account.
# Called without arguments, so no token is stored in the notebook; the captured
# output shows an input widget being rendered for the credentials.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
from huggingface_hub import HfApi

# Initialize Hugging Face API object
api = HfApi()

# Make sure the target model repository exists so the upload also works on a
# fresh run; exist_ok=True makes this a no-op when it is already there.
api.create_repo(
    repo_id="jannetas/distilbert-imdb",
    repo_type="model",
    exist_ok=True,
)

# Upload model and tokenizer to the ROOT of the model repository.
# Nesting the files in a sub-folder (path_in_repo) would hide config.json and
# the weights from `from_pretrained("jannetas/distilbert-imdb")`, which looks
# for them at the repository root unless `subfolder=` is passed.
api.upload_folder(
    folder_path="/content/my_model",
    repo_id="jannetas/distilbert-imdb",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jannetas/distilbert-imdb/commit/d5db7fc78718debe4e9fd9fd07bb732907d77237', commit_message='Upload folder using huggingface_hub', commit_description='', oid='d5db7fc78718debe4e9fd9fd07bb732907d77237', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jannetas/distilbert-imdb', endpoint='https://huggingface.co', repo_type='model', repo_id='jannetas/distilbert-imdb'), pr_revision=None, pr_num=None)

In [None]:
# REPO URL: https://huggingface.co/jannetas/distilbert-imdb
