<a href="https://colab.research.google.com/github/tanvircr7/meh/blob/master/Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Dependencies**

In [43]:
# # Install dependencies (this is mostly for Google Colab, as the other dependences are available by default in Colab)
# try:
#   import datasets, evaluate, accelerate
#   import gradio as gr
# except ModuleNotFoundError:
#   !pip install -U datasets evaluate accelerate gradio # -U stands for "upgrade" so we'll get the latest version by default
#   import datasets, evaluate, accelerate
#   import gradio as gr

try:
  import datasets, evaluate, accelerate
  import gradio as gr
except ModuleNotFoundError:
  !pip install -U datasets evaluate accelerate gradio
  import datasets, evaluate, accelerate
  import gradio as gr


import random

import numpy as np
import pandas as pd

import torch
import transformers

print(f"Using transformers version: {transformers.__version__}")
print(f"Using datasets version: {datasets.__version__}")
print(f"Using torch version: {torch.__version__}")

Using transformers version: 4.44.2
Using datasets version: 3.0.1
Using torch version: 2.4.1+cu121


# **Hugging Face Token**

In [44]:
from google.colab import userdata
HF_TOKEN_MASTER = userdata.get('HF_TOKEN_MASTER')
HF_TOKEN_MASTER

'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'

# **Create Dataset**

In [45]:
# make my own dataset on huggingface using mistral api

# **Getting Dataset**

In [46]:

from datasets import load_dataset
dataset = load_dataset(path="mrdbourke/learn_hf_food_not_food_image_captions")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})

In [47]:
# dataset['test']

# ***EDA***

In [48]:
dataset['train'][0]

{'text': 'Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
 'label': 'food'}

In [49]:
import random
random_idx = random.sample(range(len(dataset['train'])), 5)
print(random_idx)
print('-----------------')
random_samples = dataset['train'][random_idx]
print(random_samples)
print('-----------------')

print(f"[INFO] Random Samples from dataset:\n")
for text, label in zip(random_samples['text'], random_samples['label']):
  print(f"Text: {text} | Label: {label}")

[184, 222, 141, 1, 78]
-----------------
{'text': ['Tangy fish curry bowl, featuring delicate fish pieces in a zesty sauce made with tamarind and curry leaves, ideal for a light meal.', 'A close-up shot of a ripe and juicy peach with a sprinkle of cinnamon', 'A slice of pizza with a spicy buffalo chicken topping and a drizzle of ranch dressing', 'Set of books stacked on a desk', 'Low-carb sushi roll with cucumber or seaweed wraps instead of rice.'], 'label': ['food', 'food', 'food', 'not_food', 'food']}
-----------------
[INFO] Random Samples from dataset:

Text: Tangy fish curry bowl, featuring delicate fish pieces in a zesty sauce made with tamarind and curry leaves, ideal for a light meal. | Label: food
Text: A close-up shot of a ripe and juicy peach with a sprinkle of cinnamon | Label: food
Text: A slice of pizza with a spicy buffalo chicken topping and a drizzle of ranch dressing | Label: food
Text: Set of books stacked on a desk | Label: not_food
Text: Low-carb sushi roll with cu

In [50]:
dataset['train'].unique('label')

['food', 'not_food']

In [51]:
from collections import Counter
c = Counter(dataset['train']['label'])
c

Counter({'food': 125, 'not_food': 125})

In [52]:
# Dataset into Dataframe
food_df = pd.DataFrame(dataset['train'])
food_df

Unnamed: 0,text,label
0,"Creamy cauliflower curry with garlic naan, fea...",food
1,Set of books stacked on a desk,not_food
2,"Watching TV together, a family has their dog s...",not_food
3,Wooden dresser with a mirror reflecting the room,not_food
4,Lawn mower stored in a shed,not_food
...,...,...
245,Standing floor lamp providing light next to an...,not_food
246,Luxurious coconut shrimp curry on a generous p...,food
247,Barbecue grill waiting on a patio,not_food
248,"Family gathered around a dining table, laughin...",not_food


In [53]:
food_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
food,125
not_food,125


# **Prep Dataset for Text Classification**

In [54]:
dataset['train'].unique('label')

['food', 'not_food']

In [55]:
# mappings from dataset
id2Label = {idx: label for idx, label in enumerate(reversed(dataset['train'].unique('label')))}
id2Label

{0: 'not_food', 1: 'food'}

In [56]:
id2Label = {}
for idx, label in enumerate(reversed(dataset['train'].unique('label'))):
  print(idx, label)
  id2Label[idx]=label


0 not_food
1 food


In [57]:
id2Label

{0: 'not_food', 1: 'food'}

In [58]:
label2Id = {label: idx for idx, label in id2Label.items()}
label2Id

{'not_food': 0, 'food': 1}

now map labels to the numbers

In [59]:
def map_label_to_number(example):
  example['label'] = label2Id[example['label']]
  return example

example_sample = {'text': 'love blueberry pie', 'label': 'food'}
map_label_to_number(example_sample)

{'text': 'love blueberry pie', 'label': 1}

In [60]:
# map our entire dataset labels to numbers using MAP
# dataset.map
dataset = dataset['train'].map(map_label_to_number)
dataset[:5]

{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed'],
 'label': [1, 0, 0, 0, 0]}

In [61]:


dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

In [62]:
random_idx_train = random.randint(0, len(dataset['train']))
random_sample_train = dataset['train'][random_idx_train]
random_sample_train

{'text': 'A child playing with a golden retriever in the backyard', 'label': 0}

In [63]:
random_idx_test = random.randint(0, len(dataset['test']))
random_sample_test = dataset['train'][random_idx_test]
random_sample_test

{'text': 'A basket of fresh strawberries with a sprinkle of powdered sugar',
 'label': 1}

# ***Tokenize***

In [64]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="distilbert/distilbert-base-uncased", use_fast=True)
tokenizer



DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [65]:
# test out tokenizer
tokenizer('How are you?')

{'input_ids': [101, 2129, 2024, 2017, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

# ***explore tokenizer***

In [66]:
lenght_of_tokenizer_vocab = len(tokenizer.vocab)
print(f"[INFO] number of items in our tokenizer vocab: {lenght_of_tokenizer_vocab}")
max_tokenizer_input_sequence_length = tokenizer.model_max_length
print(f"[INFO] Max tokenizer input sequence length: {max_tokenizer_input_sequence_length}")

[INFO] number of items in our tokenizer vocab: 30522
[INFO] Max tokenizer input sequence length: 512


In [67]:
tokenizer.vocab['hasan']

17000

In [68]:
tokenizer('hasan')

{'input_ids': [101, 17000, 102], 'attention_mask': [1, 1, 1]}

In [69]:
tokenizer.convert_ids_to_tokens(tokenizer('hasan').input_ids)

['[CLS]', 'hasan', '[SEP]']

if the tokenizer deosn't know a word..

classification token word seprator token


In [70]:
sorted(tokenizer.vocab.items())[:5]

[('!', 999), ('"', 1000), ('#', 1001), ('##!', 29612), ('##"', 29613)]

In [71]:
import random
random.sample(tokenizer.vocab.items(), k=5)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(tokenizer.vocab.items(), k=5)


[('##თ', 29980),
 ('term', 2744),
 ('fucked', 21746),
 ('infection', 8985),
 ('sect', 17831)]

# ***Making a preprocessing function to tokenize text***

In [72]:
def tokenize_text(examples):
  """
  Tokenize given example text and return tokenized text
  """
  return tokenizer(examples['text'],
                   padding=True,
                   truncation=True)

In [73]:
example_sample_2 = {'text': 'I love pizza', 'label': 1}
tokenize_text(example_sample_2)

{'input_ids': [101, 1045, 2293, 10733, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [74]:
long_text = 'I love pizza'*1000
len(long_text)

12000

In [75]:
tokenize_long_text = tokenize_text({'text': long_text, 'label': 1})
len(tokenize_long_text['input_ids'])

512

In [76]:
tokenized_dataset = dataset.map(function=tokenize_text,
                                batched=True,
                                batch_size=1000)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50
    })
})

In [77]:
# Get two samples from tokenized dataset
train_tokenized_sample = tokenized_dataset['train'][0]
test_tokenized_sample = tokenized_dataset['test'][0]

for key in train_tokenized_sample.keys():
  print(f"[INFO] {key}")
  print(f"Train sample: {train_tokenized_sample[key]}")
  print(f"Test sampel: {test_tokenized_sample[key]}")

[INFO] text
Train sample: Set of headphones placed on a desk
Test sampel: A slice of pepperoni pizza with a layer of melted cheese
[INFO] label
Train sample: 0
Test sampel: 1
[INFO] input_ids
Train sample: [101, 2275, 1997, 2132, 19093, 2872, 2006, 1037, 4624, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Test sampel: [101, 1037, 14704, 1997, 11565, 10698, 10733, 2007, 1037, 6741, 1997, 12501, 8808, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[INFO] attention_mask
Train sample: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Test sampel: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# ***Evaluation Meric***

In [78]:
import evaluate
import numpy as np
from typing import Tuple

accuracy_metric = evaluate.load("accuracy")
def compute_accuracy(predictions_and_labels: Tuple[np.array, np.array]):
  """
  Computes the accuracy of a model by comparing predictions and labels
  """
  predictions, labels = predictions_and_labels
  return accuracy_metric.compute(predictions=predictions, references=labels)

In [79]:
example_preds_all_correct = np.array([0,0,0,0,0,0,0,0,0,0])
example_preds_one_incorrect = np.array([0,0,0,0,0,1,0,0,0,0])
example_labels = np.array([0,0,0,0,0,0,0,0,0,0])

print(f"Accuracy when all predictions are correct: {compute_accuracy((example_preds_all_correct, example_labels))}")
print(f"Accuracy when one prediction is incorrect: {compute_accuracy((example_preds_one_incorrect, example_labels))}")

Accuracy when all predictions are correct: {'accuracy': 1.0}
Accuracy when one prediction is incorrect: {'accuracy': 0.9}


1. ✅ Create and preprocess data.
2. Define the model we’d like use with transformers.
3. AutoModelForSequenceClassification (or another similar model class).
4. Define training arguments (these are hyperparameters for our model) with transformers.TrainingArguments.
5. Pass TrainingArguments from 3 and target datasets to an instance of transformers.Trainer.
6. Train the model by calling Trainer.train().
7. Save the model (to our local machine or to the Hugging Face Hub).
8. Evaluate the trained model by making and inspecting predctions on the test data.
9. Turn the model into a shareable demo.

# ***Setup Model for Training***

In [83]:
# Get id and label mappings
print(f"id2label: {id2Label}")
print(f"label2id: {label2Id}")

id2label: {0: 'not_food', 1: 'food'}
label2id: {'not_food': 0, 'food': 1}


In [84]:
from transformers import AutoModelForSequenceClassification

# setup model for fine-tuning with classification head (top layers of network)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = "distilbert/distilbert-base-uncased",
    num_labels = 2,
    id2label = id2Label,
    label2id = label2Id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [88]:
test_sample = tokenized_dataset['train'][0]


In [91]:
# model(**test_sample)

In [92]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

create a directory for saving models

In [95]:
# create model output directory
from pathlib import Path

models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

model_save_name = "learn_hf_food_not_food_text_classifier-distilbert-base-uncased"

model_save_dir = Path(models_dir, model_save_name)

model_save_dir

PosixPath('models/learn_hf_food_not_food_text_classifier-distilbert-base-uncased')