<a href="https://colab.research.google.com/github/tanvircr7/meh/blob/master/Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Dependencies**

In [None]:
# # Install dependencies (this is mostly for Google Colab, as the other dependences are available by default in Colab)
# try:
#   import datasets, evaluate, accelerate
#   import gradio as gr
# except ModuleNotFoundError:
#   !pip install -U datasets evaluate accelerate gradio # -U stands for "upgrade" so we'll get the latest version by default
#   import datasets, evaluate, accelerate
#   import gradio as gr

try:
  import datasets, evaluate, accelerate
  import gradio as gr
except ModuleNotFoundError:
  !pip install -U datasets evaluate accelerate gradio
  import datasets, evaluate, accelerate
  import gradio as gr


import random

import numpy as np
import pandas as pd

import torch
import transformers

print(f"Using transformers version: {transformers.__version__}")
print(f"Using datasets version: {datasets.__version__}")
print(f"Using torch version: {torch.__version__}")

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py

# **Hugging Face Token**

In [None]:
from google.colab import userdata
HF_TOKEN_MASTER = userdata.get('HF_TOKEN_MASTER')
HF_TOKEN_MASTER

'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'

# **Create Dataset**

In [None]:
# make my own dataset on huggingface using mistral api

# **Getting Dataset**

In [None]:

from datasets import load_dataset
dataset = load_dataset(path="mrdbourke/learn_hf_food_not_food_image_captions")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})

In [None]:
# dataset['test']

# ***EDA***

In [None]:
dataset['train'][0]

{'text': 'Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
 'label': 'food'}

In [None]:
import random
random_idx = random.sample(range(len(dataset['train'])), 5)
print(random_idx)
print('-----------------')
random_samples = dataset['train'][random_idx]
print(random_samples)
print('-----------------')

print(f"[INFO] Random Samples from dataset:\n")
for text, label in zip(random_samples['text'], random_samples['label']):
  print(f"Text: {text} | Label: {label}")

[95, 142, 147, 175, 197]
-----------------
{'text': ['Set of keys hanging on a hook by the door', 'A slice of pizza with a generous amount of shredded parmesan cheese on top', 'Eggplant in a bowl, sprinkled with feta cheese and served with a side of tomato sauce for a tasty, Mediterranean-inspired dish.', 'Set of pillows arranged on a couch', 'Pizza with a stuffed crust, oozing with cheese'], 'label': ['not_food', 'food', 'food', 'not_food', 'food']}
-----------------
[INFO] Random Samples from dataset:

Text: Set of keys hanging on a hook by the door | Label: not_food
Text: A slice of pizza with a generous amount of shredded parmesan cheese on top | Label: food
Text: Eggplant in a bowl, sprinkled with feta cheese and served with a side of tomato sauce for a tasty, Mediterranean-inspired dish. | Label: food
Text: Set of pillows arranged on a couch | Label: not_food
Text: Pizza with a stuffed crust, oozing with cheese | Label: food


In [None]:
dataset['train'].unique('label')

['food', 'not_food']

In [None]:
from collections import Counter
c = Counter(dataset['train']['label'])
c

Counter({'food': 125, 'not_food': 125})

In [None]:
# Dataset into Dataframe
food_df = pd.DataFrame(dataset['train'])
food_df

Unnamed: 0,text,label
0,"Creamy cauliflower curry with garlic naan, fea...",food
1,Set of books stacked on a desk,not_food
2,"Watching TV together, a family has their dog s...",not_food
3,Wooden dresser with a mirror reflecting the room,not_food
4,Lawn mower stored in a shed,not_food
...,...,...
245,Standing floor lamp providing light next to an...,not_food
246,Luxurious coconut shrimp curry on a generous p...,food
247,Barbecue grill waiting on a patio,not_food
248,"Family gathered around a dining table, laughin...",not_food


In [None]:
food_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
food,125
not_food,125


# **Prep Dataset for Text Classification**

In [None]:
dataset['train'].unique('label')

['food', 'not_food']

In [None]:
# mappings from dataset
id2Label = {idx: label for idx, label in enumerate(reversed(dataset['train'].unique('label')))}
id2Label

{0: 'not_food', 1: 'food'}

In [None]:
id2Label = {}
for idx, label in enumerate(reversed(dataset['train'].unique('label'))):
  print(idx, label)
  id2Label[idx]=label


0 not_food
1 food


In [None]:
id2Label

{0: 'not_food', 1: 'food'}

In [None]:
label2Id = {label: idx for idx, label in id2Label.items()}
label2Id

{'not_food': 0, 'food': 1}

now map labels to the numbers

In [None]:
def map_label_to_number(example):
  example['label'] = label2Id[example['label']]
  return example

example_sample = {'text': 'love blueberry pie', 'label': 'food'}
map_label_to_number(example_sample)

{'text': 'love blueberry pie', 'label': 1}

In [None]:
# map our entire dataset labels to numbers using MAP
# dataset.map
dataset = dataset['train'].map(map_label_to_number)
dataset[:5]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed'],
 'label': [1, 0, 0, 0, 0]}

In [None]:


dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

In [None]:
random_idx_train = random.randint(0, len(dataset['train']))
random_sample_train = dataset['train'][random_idx_train]
random_sample_train

{'text': 'A colorful bowl of mixed carrots, including orange and purple.',
 'label': 1}

In [None]:
random_idx_test = random.randint(0, len(dataset['test']))
random_sample_test = dataset['train'][random_idx_test]
random_sample_test

{'text': 'Fresh cherry tomatoes in a basket, sprinkled with sea salt for a savory snack.',
 'label': 1}

# ***Tokenize***

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="distilbert/distilbert-base-uncased", use_fast=True)
tokenizer

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# test out tokenizer
tokenizer('How are you?')

{'input_ids': [101, 2129, 2024, 2017, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

# ***explore tokenizer***

In [None]:
lenght_of_tokenizer_vocab = len(tokenizer.vocab)
print(f"[INFO] number of items in our tokenizer vocab: {lenght_of_tokenizer_vocab}")
max_tokenizer_input_sequence_length = tokenizer.model_max_length
print(f"[INFO] Max tokenizer input sequence length: {max_tokenizer_input_sequence_length}")

[INFO] number of items in our tokenizer vocab: 30522
[INFO] Max tokenizer input sequence length: 512


In [38]:
tokenizer.vocab['hasan']

17000

In [39]:
tokenizer('hasan')

{'input_ids': [101, 17000, 102], 'attention_mask': [1, 1, 1]}

In [41]:
tokenizer.convert_ids_to_tokens(tokenizer('hasan').input_ids)

['[CLS]', 'hasan', '[SEP]']

if the tokenizer deosn't know a word..

classification token word seprator token


In [42]:
sorted(tokenizer.vocab.items())[:5]

[('!', 999), ('"', 1000), ('#', 1001), ('##!', 29612), ('##"', 29613)]

In [43]:
import random
random.sample(tokenizer.vocab.items(), k=5)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(tokenizer.vocab.items(), k=5)


[('trial', 3979),
 ('conditional', 18462),
 ('立', 1931),
 ('conduct', 6204),
 ('##nais', 28020)]

# ***Making a preprocessing function to tokenize text***

In [44]:
def tokenize_text(examples):
  """
  Tokenize given example text and return tokenized text
  """
  return tokenizer(examples['text'],
                   padding=True,
                   truncation=True)

In [45]:
example_sample_2 = {'text': 'I love pizza', 'label': 1}
tokenize_text(example_sample_2)

{'input_ids': [101, 1045, 2293, 10733, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [50]:
long_text = 'I love pizza'*1000
len(long_text)

12000

In [51]:
tokenize_long_text = tokenize_text({'text': long_text, 'label': 1})
len(tokenize_long_text['input_ids'])

512

In [52]:
tokenized_dataset = dataset.map(function=tokenize_text,
                                batched=True,
                                batch_size=1000)
tokenized_dataset

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50
    })
})

In [None]:
# Get two samples from tokenized dataset