In [1]:
# Cell 1: Importing Libraries
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import random
import logging
import time
import os
import json
import shutil
from huggingface_hub import HfApi, HfFolder

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Cell 2: Setting Up Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authenticate with Hugging Face Model Hub
token = "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"
if not HfFolder.get_token():
    HfFolder.save_token(token)

# Function to log elapsed time
def log_time(start_time, message):
    elapsed_time = time.time() - start_time
    logger.info(f"{message} took {elapsed_time:.2f} seconds")

In [3]:
# Cell 3: Simulating Labels and Tokenization Functions
def simulate_labels(examples):
    labels = [random.randint(0, 1) for _ in range(len(examples['summary']))]
    examples['labels'] = labels
    return examples

# Function to tokenize dataset
def tokenize_function(examples):
    summaries = [summary['text'] for summary in examples['summary']]
    tokenized_inputs = tokenizer(summaries, padding="max_length", truncation=True)
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

In [4]:
# Cell 4: Loading Dataset
try:
    start_time = time.time()
    logger.info("Loading dataset...")
    dataset = datasets.load_dataset("openai/summarize_from_feedback", "axis", split='validation[:5%]', trust_remote_code=True)
    log_time(start_time, "Loading dataset")
except Exception as e:
    logger.error(f"Error loading dataset: {e}")
    raise

INFO:__main__:Loading dataset...
INFO:__main__:Loading dataset took 4.20 seconds


In [5]:
# Cell 5: Preprocessing Dataset
train_dataset = dataset.map(simulate_labels, batched=True)
val_dataset = dataset.map(simulate_labels, batched=True)

In [6]:
# Cell 6: Loading Tokenizer and Model
try:
    start_time = time.time()
    model_name = "distilbert-base-uncased"
    logger.info(f"Loading tokenizer and model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))
    log_time(start_time, "Loading tokenizer and model")
except Exception as e:
    logger.error(f"Error loading tokenizer and model: {e}")
    raise

INFO:__main__:Loading tokenizer and model: distilbert-base-uncased
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Loading tokenizer and model took 1.02 seconds


In [7]:
# Cell 7: Tokenizing Dataset
start_time = time.time()
logger.info("Tokenizing dataset...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
log_time(start_time, "Tokenizing dataset")

INFO:__main__:Tokenizing dataset...
INFO:__main__:Tokenizing dataset took 0.02 seconds


In [8]:
# Cell 8: Checking GPU/MPS Support
use_cuda = torch.cuda.is_available()
use_mps = hasattr(torch._C, "_has_mps") and getattr(torch._C, "_has_mps", False)
logger.info(f"Using CUDA: {use_cuda}, Using MPS: {use_mps}")
device = torch.device("cuda" if use_cuda else "mps" if use_mps else "cpu")
model.to(device)

INFO:__main__:Using CUDA: False, Using MPS: False


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [9]:
# Cell 9: Function to Upload Model to Hugging Face Hub
def upload_model_to_hub(model_dir):
    logger.info(f"Uploading the model {model_dir} to Hugging Face Hub...")
    api = HfApi()

    repo_name = model_dir.split("/")[-1]
    
    # Create repo if not exist
    try:
        repo_url = api.create_repo(repo_id=repo_name, private=False, token=HfFolder.get_token())
    except Exception as e:
        logger.warning(f"Repository {repo_name} may already exist: {e}")
        repo_url = f"https://huggingface.co/{repo_name}"

    # Delete non-git directory if exists
    if os.path.exists(model_dir) and not os.path.exists(os.path.join(model_dir, ".git")):
        logger.warning(f"Deleting non-git directory: {model_dir}")
        shutil.rmtree(model_dir)

    # Upload model to Hugging Face Hub
    try:
        api.upload_folder(
            repo_id=repo_name,
            folder_path=model_dir,
            path_in_repo="",
            token=HfFolder.get_token(),
        )
        logger.info(f"Model successfully uploaded to {repo_url}")
    except Exception as e:
        logger.error(f"Error pushing model to Hugging Face Hub: {e}")

In [10]:
# Cell 10: Hyperparameter Tuning and Training
learning_rates = [3e-5, 5e-5]
batch_sizes = [8, 16]
num_epochs = [1]

results = []

for lr in learning_rates:
    for bs in batch_sizes:
        for epoch in num_epochs:
            logger.info(f"Training with lr={lr}, batch_size={bs}, epochs={epoch}")

            # Set training arguments
            training_args = TrainingArguments(
                output_dir="./results",
                evaluation_strategy="epoch",
                learning_rate=lr,
                per_device_train_batch_size=bs,
                per_device_eval_batch_size=bs,
                num_train_epochs=epoch,
                weight_decay=0.01,
                logging_dir='./logs',
                logging_steps=50,
                fp16=use_cuda,
            )

            # Initialize Trainer
            try:
                start_time = time.time()
                logger.info("Initializing Trainer...")
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=val_dataset,
                )
                log_time(start_time, "Initializing Trainer")
            except Exception as e:
                logger.error(f"Error initializing Trainer: {e}")
                continue

            # Train the model
            try:
                start_time = time.time()
                logger.info("Starting training...")
                trainer.train()
                log_time(start_time, "Training")
            except Exception as e:
                logger.error(f"Error during training: {e}")
                continue

            # Evaluate the model
            try:
                start_time = time.time()
                logger.info("Evaluating model...")
                eval_result = trainer.evaluate()
                log_time(start_time, "Evaluating model")
            except Exception as e:
                logger.error(f"Error during evaluation: {e}")
                continue

            # Save results
            results.append({
                'learning_rate': lr,
                'batch_size': bs,
                'num_epochs': epoch,
                'eval_loss': eval_result['eval_loss'],
                'eval_accuracy': eval_result.get('eval_accuracy', 'N/A'),
            })

            # Save model
            model_dir = f"dpo_model_lr{lr}_bs{bs}_epoch{epoch}"
            start_time = time.time()
            logger.info(f"Saving the model in {model_dir}...")
            trainer.save_model(model_dir)
            log_time(start_time, "Saving the model")

            # Ensure model is saved to the directory
            if os.path.exists(model_dir):
                print(f"Directory {model_dir} exists.")
                
                # Verify model files are present
                model_files = ["model.safetensors", "config.json", "training_args.bin"]
                missing_files = [file for file in model_files if not os.path.exists(os.path.join(model_dir, file))]
                
                if missing_files:
                    print(f"Missing files in {model_dir}: {missing_files}")
                else:
                    print(f"All required files are present in {model_dir}. Ready to upload.")
                    # Upload the model after validation
                    upload_model_to_hub(model_dir)
            else:
                print(f"Directory {model_dir} does not exist. Please check the model saving process.")

logger.info("Hyperparameter tuning complete.")

# Save results to a file
with open('hyperparameter_tuning_results.json', 'w') as f:
    json.dump(results, f, indent=4)

# Cell 11: Uploading Models for Each Result
for result in results:
    model_dir = f"dpo_model_lr{result['learning_rate']}_bs{result['batch_size']}_epoch{result['num_epochs']}"

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    upload_model_to_hub(model_dir)

INFO:__main__:Training with lr=3e-05, batch_size=8, epochs=1
INFO:__main__:Initializing Trainer...
INFO:__main__:Initializing Trainer took 0.06 seconds
INFO:__main__:Starting training...


Epoch,Training Loss,Validation Loss
1,0.6977,0.686896


INFO:__main__:Training took 426.78 seconds
INFO:__main__:Evaluating model...


INFO:__main__:Evaluating model took 116.90 seconds
INFO:__main__:Saving the model in dpo_model_lr3e-05_bs8_epoch1...
INFO:__main__:Saving the model took 0.27 seconds
INFO:__main__:Uploading the model dpo_model_lr3e-05_bs8_epoch1 to Hugging Face Hub...


Directory dpo_model_lr3e-05_bs8_epoch1 exists.
All required files are present in dpo_model_lr3e-05_bs8_epoch1. Ready to upload.


ERROR:__main__:Error pushing model to Hugging Face Hub: Provided path: 'C:\Users\VUONGLOCTRUONG\Documents\GitHub\lab5_NLP\task\dpo_model_lr3e-05_bs8_epoch1' is not a directory
INFO:__main__:Training with lr=3e-05, batch_size=16, epochs=1
INFO:__main__:Initializing Trainer...
INFO:__main__:Initializing Trainer took 0.08 seconds
INFO:__main__:Starting training...


Epoch,Training Loss,Validation Loss
1,No log,0.657712


INFO:__main__:Training took 406.61 seconds
INFO:__main__:Evaluating model...


INFO:__main__:Evaluating model took 113.80 seconds
INFO:__main__:Saving the model in dpo_model_lr3e-05_bs16_epoch1...
INFO:__main__:Saving the model took 0.26 seconds
INFO:__main__:Uploading the model dpo_model_lr3e-05_bs16_epoch1 to Hugging Face Hub...


Directory dpo_model_lr3e-05_bs16_epoch1 exists.
All required files are present in dpo_model_lr3e-05_bs16_epoch1. Ready to upload.


ERROR:__main__:Error pushing model to Hugging Face Hub: Provided path: 'C:\Users\VUONGLOCTRUONG\Documents\GitHub\lab5_NLP\task\dpo_model_lr3e-05_bs16_epoch1' is not a directory
INFO:__main__:Training with lr=5e-05, batch_size=8, epochs=1
INFO:__main__:Initializing Trainer...
INFO:__main__:Initializing Trainer took 0.05 seconds
INFO:__main__:Starting training...


Epoch,Training Loss,Validation Loss
1,0.666,0.556283


INFO:__main__:Training took 426.57 seconds
INFO:__main__:Evaluating model...


INFO:__main__:Evaluating model took 116.87 seconds
INFO:__main__:Saving the model in dpo_model_lr5e-05_bs8_epoch1...
INFO:__main__:Saving the model took 0.24 seconds
INFO:__main__:Uploading the model dpo_model_lr5e-05_bs8_epoch1 to Hugging Face Hub...


Directory dpo_model_lr5e-05_bs8_epoch1 exists.
All required files are present in dpo_model_lr5e-05_bs8_epoch1. Ready to upload.


ERROR:__main__:Error pushing model to Hugging Face Hub: Provided path: 'C:\Users\VUONGLOCTRUONG\Documents\GitHub\lab5_NLP\task\dpo_model_lr5e-05_bs8_epoch1' is not a directory
INFO:__main__:Training with lr=5e-05, batch_size=16, epochs=1
INFO:__main__:Initializing Trainer...
INFO:__main__:Initializing Trainer took 0.04 seconds
INFO:__main__:Starting training...


Epoch,Training Loss,Validation Loss
1,No log,0.460349


INFO:__main__:Training took 407.04 seconds
INFO:__main__:Evaluating model...


INFO:__main__:Evaluating model took 114.23 seconds
INFO:__main__:Saving the model in dpo_model_lr5e-05_bs16_epoch1...
INFO:__main__:Saving the model took 0.30 seconds
INFO:__main__:Uploading the model dpo_model_lr5e-05_bs16_epoch1 to Hugging Face Hub...


Directory dpo_model_lr5e-05_bs16_epoch1 exists.
All required files are present in dpo_model_lr5e-05_bs16_epoch1. Ready to upload.


ERROR:__main__:Error pushing model to Hugging Face Hub: Provided path: 'C:\Users\VUONGLOCTRUONG\Documents\GitHub\lab5_NLP\task\dpo_model_lr5e-05_bs16_epoch1' is not a directory
INFO:__main__:Hyperparameter tuning complete.
INFO:__main__:Uploading the model dpo_model_lr3e-05_bs8_epoch1 to Hugging Face Hub...

You already created this model repo
ERROR:__main__:Error pushing model to Hugging Face Hub: Provided path: 'C:\Users\VUONGLOCTRUONG\Documents\GitHub\lab5_NLP\task\dpo_model_lr3e-05_bs8_epoch1' is not a directory
INFO:__main__:Uploading the model dpo_model_lr3e-05_bs16_epoch1 to Hugging Face Hub...

You already created this model repo
ERROR:__main__:Error pushing model to Hugging Face Hub: Provided path: 'C:\Users\VUONGLOCTRUONG\Documents\GitHub\lab5_NLP\task\dpo_model_lr3e-05_bs16_epoch1' is not a directory
INFO:__main__:Uploading the model dpo_model_lr5e-05_bs8_epoch1 to Hugging Face Hub...

You already created this model repo
ERROR:__main__:Error pushing model to Hugging Face Hub

In [13]:
import time
model_dir = f"dpo_model_lr{lr}_bs{bs}_epoch{epoch}_{int(time.time())}"


In [14]:
if os.path.exists(model_dir):
    print(f"Directory {model_dir} exists.")
    # Verify model files are present
    model_files = ["pytorch_model.bin", "config.json", "training_args.bin"]
    missing_files = [file for file in model_files if not os.path.exists(os.path.join(model_dir, file))]
    
    if missing_files:
        print(f"Missing files in {model_dir}: {missing_files}")
    else:
        print(f"All required files are present in {model_dir}. Ready to upload.")
        # Proceed to upload
        upload_model_to_hub(model_dir)


In [15]:
model_dir = 'dpo_model_lr5e-05_bs16_epoch1'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)  # Save tokenizer if needed


('dpo_model_lr5e-05_bs16_epoch1\\tokenizer_config.json',
 'dpo_model_lr5e-05_bs16_epoch1\\special_tokens_map.json',
 'dpo_model_lr5e-05_bs16_epoch1\\vocab.txt',
 'dpo_model_lr5e-05_bs16_epoch1\\added_tokens.json',
 'dpo_model_lr5e-05_bs16_epoch1\\tokenizer.json')

In [16]:
import os

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save model and tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)


('dpo_model_lr5e-05_bs16_epoch1\\tokenizer_config.json',
 'dpo_model_lr5e-05_bs16_epoch1\\special_tokens_map.json',
 'dpo_model_lr5e-05_bs16_epoch1\\vocab.txt',
 'dpo_model_lr5e-05_bs16_epoch1\\added_tokens.json',
 'dpo_model_lr5e-05_bs16_epoch1\\tokenizer.json')

In [19]:
from huggingface_hub import HfApi
from git import Repo
import os

# Initialize Hugging Face API
api = HfApi()

# Model directory path
model_dir = 'dpo_model_lr5e-05_bs16_epoch1'

# Check if the model directory is a valid git repo
if not os.path.exists(os.path.join(model_dir, '.git')):
    Repo.init(model_dir)  # Initialize git repo in the directory

# Commit and push the model
api.upload_folder(
    folder_path=model_dir,  # The directory to upload
    repo_id="vltruong01/dpo_model_lr5e-05_bs16_epoch1",  # Model repository name on Hugging Face Hub
    commit_message="Upload model dpo_model_lr5e-05_bs16_epoch1"
)


model.safetensors: 100%|██████████| 268M/268M [00:22<00:00, 12.0MB/s] 


CommitInfo(commit_url='https://huggingface.co/vltruong01/dpo_model_lr5e-05_bs16_epoch1/commit/41fa77d17bf8b4b71cee318efc86e41d3a4f2909', commit_message='Upload model dpo_model_lr5e-05_bs16_epoch1', commit_description='', oid='41fa77d17bf8b4b71cee318efc86e41d3a4f2909', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vltruong01/dpo_model_lr5e-05_bs16_epoch1', endpoint='https://huggingface.co', repo_type='model', repo_id='vltruong01/dpo_model_lr5e-05_bs16_epoch1'), pr_revision=None, pr_num=None)