This notebook demonstrates the process of fine-tuning a pre-trained causal language model (Google Gemma 7B model) to generate job descriptions based on input job titles. The workflow includes data preparation, model fine-tuning using Low-Rank Adaptation (LoRA)

In [2]:
#installing pre requisite libraries
!pip install transformers trl datasets peft bitsandbytes accelerate sentencepiece


Collecting trl
  Downloading trl-0.11.1-py3-none-any.whl.metadata (12 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.11-py3-none-any.whl.metadata (8.4 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3

Login to Huggingface to access gated/restricted models

In [3]:
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('hugging_face_accesstoken')

# Log in to Hugging Face
login(token = hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Connecting google drive to access dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset
#file_path = '/content/drive/My Drive/Datasets/cleaned_50k-merged_job_data.csv'
# Load the dataset
file_path = '/content/drive/My Drive/Datasets/cleaned_50k-merged_job_data.csv'

Mounted at /content/drive


Importing all the required libraries

In [5]:
import torch
import os
from trl import SFTTrainer
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from trl import SFTTrainer

loading and formating the data in trainable format for fine tuning

In [6]:
df = pd.read_csv(file_path, nrows=250)

# Prepare data for fine-tuning
def format_data(row):
    return {
        "input": f"Job title: {row['job_title']} Generate job posting description:",
        "output": row['job_summary']
    }


# Applying formatting function and convert to DataFrame
formatted_data = df.apply(format_data, axis=1).apply(pd.Series)

# Create Dataset object from DataFrame
dataset = Dataset.from_pandas(formatted_data)

Loading model and toenizer and defined tokenize function

In [7]:
# Load tokenizer
model_id = 'google/gemma-7b'

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def tokenize_function(examples):
    inputs = tokenizer(examples["input"], truncation=True, padding="max_length", max_length=128)
    outputs = tokenizer(examples["output"], truncation=True, padding="max_length", max_length=128)
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": outputs["input_ids"]
    }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Setup BitsAndBytesConfig for 4-bit quantization and defining LORA Config

In [8]:

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",            # NormalFloat 4
    bnb_4bit_use_double_quant=True,       # Double quantization for better memory efficiency
    bnb_4bit_compute_dtype=torch.float16  # Mixed precision for faster computation
)

# Load the LLaMA 2 model with 4-bit quantization
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,       # Apply 4-bit quantization
    device_map="auto"                     # Automatically map model to GPUs if available
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Configure LoRA for fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,   # Causal language modeling task
    r=8,                            # Rank of low-rank adaptation
    lora_alpha=32,                  # LoRA scaling
    lora_dropout=0.1,               # Dropout for LoRA layers
    target_modules=["q_proj", "v_proj"]  # Applied LoRA on query and value projection layers
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

trainable params: 3,211,264 || all params: 8,540,892,160 || trainable%: 0.0376


In [9]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Set up training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=100,
    output_dir="./lora_llama_finetuned",
    optim="paged_adamw_32bit",
    save_total_limit=2,
    save_steps=500,
    report_to="tensorboard",
)

# Initialize Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    dataset_text_field="input",
    peft_config=lora_config,
    tokenizer=tokenizer,
    max_seq_length=256,
    packing=True,
    formatting_func=format_data
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./Job_Description_Finetuned_Gemma")
tokenizer.save_pretrained("./Job_Description_Finetuned_Gemma")


Map:   0%|          | 0/250 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


('./Job_Description_Finetuned_Gemma/tokenizer_config.json',
 './Job_Description_Finetuned_Gemma/special_tokens_map.json',
 './Job_Description_Finetuned_Gemma/tokenizer.model',
 './Job_Description_Finetuned_Gemma/added_tokens.json',
 './Job_Description_Finetuned_Gemma/tokenizer.json')

In [10]:

# Add a distinct pad token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Define a function to generate job descriptions
def generate_text(prompt, max_length=128):
    # Tokenize the input and explicitly pass the attention mask
    inputs = tokenizer(prompt, return_tensors="pt",max_length=max_length,padding=True, truncation=True).to(device)

    # Ensure attention_mask is passed along with the input_ids
    with torch.no_grad():
        output_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # Explicitly pass attention mask
            max_length=max_length,
            do_sample=True,
            temperature=0.7
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)




In [11]:
# Example usage
prompt = "Job title: Senior Web Developer Generate job posting description:"
generated_text = generate_text(prompt)

print("Generated Text:\n", generated_text)

Generated Text:
 Job title: Senior Web Developer Generate job posting description: Senior Web Developer 

Salary: $ 100,000 - $150,000

Job description:

We are looking for an experienced and skilled Senior Web Developer. We are a small and fast-growing startup with a friendly and dynamic work environment.

We want to hire a senior web developer
