<a href="https://colab.research.google.com/github/yogesh-bhattarai/GEN_AI/blob/main/gen_ai_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Config

In [5]:
import torch
class Config:
  DATASET_ID= 'emad12/stock_tweets_sentiment'
  MODEL_CKPT= 'distilbert/distilbert-base-uncased'
  SRC_COLUMN='tweet'
  TGT_COLUMN='sentiment'
  TEST_SIZE= 0.2
  SEED= 42
  MAX_LEN= 32
  DEVICE= "cuda" if torch.cuda.is_available() else "cpu"
  ID2LABEL={0:'NEUTRAL',1:'POSITIVE',2:'NEGATIVE'}
  LABEL2ID={'NEUTRAL':0,'POSITIVE':1,'NEGATIVE':2}
  EVAL_METRIC='accuracy'
  MODEL_OUT_DIR='distilbert-stock-tweet-sentiment-analysis'
  LR= 2e-5
  NUM_EPOCHS= 3
  BATCH_SIZE= 16
  WEIGHT_DECAY= 0.01
  EVAL_STRATEGY= 'epoch'
  SAVE_STRATEGY= 'epoch'
  LOGGING_STRATEGY= 'epoch'
  PUSH_TO_HUB= True

config= Config()

datasets

In [6]:
from transformers import AutoTokenizer,AutoModel, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

class TextclassificationDataset:
  def __init__(self):
    self.dataset_id= config.DATASET_ID
    self.model_ckpt= config.MODEL_CKPT
    self.src_column= config.SRC_COLUMN
    self.tgt_column= config.TGT_COLUMN
    self.test_size= config.TEST_SIZE
    self.seed= config.SEED
    self.max_len= config.MAX_LEN
    self.model_out_dir= config.MODEL_OUT_DIR
    self.lr= config.LR
    self.num_epochs= config.NUM_EPOCHS
    self.batch_size=config.BATCH_SIZE
    self.weight_decay= config.WEIGHT_DECAY
    self.eval_strategy= config.EVAL_STRATEGY
    self.save_strategy= config.SAVE_STRATEGY
    self.logging_strategy= config.LOGGING_STRATEGY
    self.push_to_hub= config.PUSH_TO_HUB

    self.tokenizer= AutoTokenizer.from_pretrained(self.model_ckpt)



  def create_data(self):
    self.data= load_dataset(self.dataset_id,split='train')
    self.df= self.data.to_pandas()
    self.df= self.df[[self.src_column, self.tgt_column]]
    self.df[self.tgt_column]= self.df[self.tgt_column].apply(lambda x: 2 if x==-1 else x)
    self.df[self.src_column]=self.df[self.src_column].apply(lambda x: x.lower())
    self.df= self.df.sample(20000)
    self.train_df, self.test_df= train_test_split(self.df,test_size=self.test_size, shuffle=True, random_state=self.seed)
    self.train_data= Dataset.from_pandas(self.train_df)
    self.test_data= Dataset.from_pandas(self.test_df)
    return self.train_data,self.test_data


  def tokenize_function(self, example):
    model_inp= self.tokenizer(example[self.src_column],truncation=True,padding=True, max_length=self.max_len)
    labels= torch.tensor(example[self.tgt_column],dtype= torch.int)
    model_inp['labels']= labels
    return model_inp


  def preprocess_function(self,data):
    model_inp= data.map(self.tokenize_function,batched=True,remove_columns = data.column_names)
    return model_inp


  def gen_classification_dataset(self):
    train_data, test_data= self.create_data()
    train_tokenized_data=self.preprocess_function(train_data)
    test_tokenized_data= self.preprocess_function(test_data)
    return train_tokenized_data, test_tokenized_data





model

In [7]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments,DataCollatorWithPadding
import evaluate
import numpy as np
class TextClassificationModelTrainer:
  def __init__(self,train_data,test_data):
    self.train_data= train_data
    self.test_data= test_data
    self.model_ckpt= config.MODEL_CKPT
    self.id2label= config.ID2LABEL
    self.label2id= config.LABEL2ID
    self.device= config.DEVICE
    self.eval_metric= config.EVAL_METRIC
    self.model_out_dir = config.MODEL_OUT_DIR
    self.num_epochs = config.NUM_EPOCHS
    self.lr = config.LR
    self.batch_size = config.BATCH_SIZE
    self.weight_decay = config.WEIGHT_DECAY
    self.eval_strategy = config.EVAL_STRATEGY
    self.save_strategy = config.SAVE_STRATEGY
    self.logging_strategy = config.LOGGING_STRATEGY
    self.push_to_hub = config.PUSH_TO_HUB
    self.model= AutoModelForSequenceClassification.from_pretrained(
        self.model_ckpt,
        id2label= self.id2label,
        label2id= self.label2id,
    ).to(self.device)
    self.tokenizer= AutoTokenizer.from_pretrained(self.model_ckpt)
    self.eval_metrics_computer= evaluate.load(self.eval_metric)
    self.data_collator= DataCollatorWithPadding(self.tokenizer)


  def compute_metrics(self,eval_pred):
    logits,labels= eval_pred
    predictions= np.argmax(logits,axis=-1)
    return self.eval_metrics_computer.compute(predictions=predictions,references=labels)


  def set_training_args(self):
    return TrainingArguments(
        output_dir= self.model_out_dir,
        learning_rate= self.lr,
        num_train_epochs= self.num_epochs,
        weight_decay= self.weight_decay,
        evaluation_strategy= self.eval_strategy,
        save_strategy= self.save_strategy,
        logging_strategy= self.logging_strategy,
        per_device_train_batch_size= self.batch_size,
        per_device_eval_batch_size= self.batch_size,
        push_to_hub= self.push_to_hub,
        hub_token='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'
    )
  def model_trainer(self):
    return Trainer(
        model= self.model,
        args= self.set_training_args(),
        data_collator= self.data_collator,
        train_dataset= self.train_data,
        eval_dataset= self.test_data,
        compute_metrics= self.compute_metrics
    )
  def train_and_save_and_push_to_hub(self):
    trainer= self.model_trainer()
    trainer.train()
    trainer.push_to_hub()




In [8]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
if __name__ == "__main__":
    textclassificationdataset = TextclassificationDataset()
    train_data, test_data = textclassificationdataset.gen_classification_dataset()
    textclassificationtrainer = TextClassificationModelTrainer(train_data, test_data)
    textclassificationtrainer.train_and_save_and_push_to_hub()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



README.md:   0%|          | 0.00/806 [00:00<?, ?B/s]

(…)-00000-of-00001-49baa0648effea14.parquet:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

(…)-00000-of-00001-cb0233e05c1cc1c9.parquet:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/96000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.689,0.647076,0.75075
2,0.484,0.598971,0.7675
3,0.3706,0.626841,0.7755


In [20]:
from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
# Assuming config.MODEL_CKPT points to the location of the DistilBertTokenizer
model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_OUT_DIR)
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_CKPT) # Load tokenizer from checkpoint
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
classifier("i wanna kill youu")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.9797311425209045}]

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
