In [None]:
# Install Pytorch & other libraries
%pip install "torch>=2.4.0" tensorboard

# Install Gemma release branch from Hugging Face
%pip install "transformers>=4.51.3"

# Install Hugging Face libraries
# FIX: Pinned fsspec and gcsfs to 2024.12.0 to satisfy datasets==3.3.2 requirements
%pip install  --upgrade \
  "datasets==3.3.2" \
  "accelerate==1.4.0" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.45.3" \
  "trl==0.21.0" \
  "peft==0.14.0" \
  "protobuf>=3.20.3,<6.0.0dev" \
  "fsspec==2024.12.0" \
  "gcsfs==2024.12.0" \
  sentencepiece

# Optional: flash-attn is only usable on GPUs that support the BF16 data type and flash attention (Ampere or newer, such as NVIDIA L4 or NVIDIA A100). Comment out the next line on older GPUs.
%pip install flash-attn

Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn


In [5]:
import pandas as pd
from getpass import getpass
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
import threading
from tqdm import tqdm
import json
import time
from datetime import datetime
import evaluate

# Load both recruitment datasets straight from the Hugging Face Hub (hf:// paths).
print("Loading descriptions dataset...")
descriptions = pd.read_parquet("hf://datasets/lang-uk/recruitment-dataset-job-descriptions-english/data/train-00000-of-00001.parquet")
print("Descriptions head:")
print(descriptions.head())
print("-" * 30)

print("Loading profiles dataset...")
profile = pd.read_parquet("hf://datasets/lang-uk/recruitment-dataset-candidate-profiles-english/data/train-00000-of-00001.parquet")
print("Profiles head:")
print(profile.head())
print("-" * 30)

# Pick the compute device once; later cells can reuse `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Authenticate against the Hugging Face Hub. The token is read interactively so it
# never ends up hard-coded in the notebook. Later cells download model weights and
# push the trained model to the Hub, so a valid token is expected.
print("Logging in to Hugging Face Hub...")
try:
    HF_TOKEN = getpass("Hugging Face token: ")
    login(HF_TOKEN)
    print("Login successful.")
except Exception as e:
    # Best-effort: report the failure but keep the already-loaded data usable.
    print(f"Login failed: {e}")
print("-" * 30)

Loading descriptions dataset...
Descriptions head:
                                            Position  \
0      10 + Blockchain Nodes / Masternodes to set up   
1       10 .NET Developers (Middle and Senior level)   
2  10X Engineer (co-founder, #4 employee, USD 11-...   
3                          16 - Amazon Brand Manager   
4                          16 - Amazon Brand Manager   

                                    Long Description    Company Name  \
0  *Requirements*\r\n\r\nWe're looking for a long...    MyCointainer   
1  Greetings! My name is Maria, I am in urgent ne...  TechScout.tech   
2  **Product**\r\nThe product is a live video cha...        Innoteka   
3  Currently, TCM expanding its activities to Ukr...       FirstFive   
4  Hello,\r\nWe, MIMIRB2B, are an outstaff compan...        MimirB2B   

  Exp Years Primary Keyword English Level                  Published  \
0        2y        Sysadmin  intermediate  2020-10-01T00:00:00+03:00   
1        2y            .NET  interm

In [6]:
from datasets import Dataset

# Strategy bullets that are interpolated into the system prompt below.
complex_cot = (
    "- Identify key skills from the candidate's past roles.\n"
    "- Match these skills to the job description keywords.\n"
    "- Prioritize experiences that show measurable achievements."
)

# System prompt used for every chat example: describes the task, embeds the
# strategy bullets from `complex_cot`, and asks for a "Plan:" section followed
# by a "Resume:" section.
# NOTE(review): the assistant targets built further down come directly from
# profile["CV"], so they may not follow this Plan/Resume layout — confirm intended.
system_instruction = f"""Below is an instruction that describes a task, paired with an input that provides candidate details and a target job.
Write a professional, ATS-friendly resume tailored to the target role.
### Strategy:
{complex_cot}

First, produce a concise 2–4 bullet **Plan**. Then generate the resume.
Use clear section headers. For experience bullets, use the STAR/impact style.

Output format:
Plan:
- <short bullet 1>
- <short bullet 2>

Resume:
[Use sections: Summary, Experience, Education, Skills, Projects/Certifications]"""

# Template for the user turn: first slot is the candidate details, second slot
# is the free-form extra instructions.
user_prompt = """### Candidate details / Job target:
{}

### Additional instructions:
{}"""


def format_prompts_and_refs(df, extra_instructions="Tone: professional, one-page."):
    """Build one user prompt and one reference text per row of ``df``.

    Args:
        df: DataFrame providing the columns "Position", "Moreinfo",
            "Looking For", "Highlights" and "Primary Keyword". Missing values
            are rendered as empty strings.
        extra_instructions: Text placed in the "Additional instructions"
            section of every prompt.

    Returns:
        A ``(prompts, references)`` tuple of two lists, both in row order.
        Each prompt is ``user_prompt`` filled with the row's details; each
        reference is the row's highlights and more-info text joined by a newline.
    """
    def _filled(column):
        # Replace missing values so they render as '' instead of 'nan'/'None'.
        return df[column].fillna('')

    positions = _filled("Position")
    more_info = _filled("Moreinfo")
    looking_for = _filled("Looking For")
    highlights = _filled("Highlights")
    keywords = _filled("Primary Keyword")

    prompts = []
    for p, m, l, h, k in zip(positions, more_info, looking_for, highlights, keywords):
        details = f"Position: {p}\nMore info: {m}\nLooking For: {l}\nHighlights: {h}\nPrimary Keyword: {k}"
        prompts.append(user_prompt.format(details, extra_instructions))

    references = []
    for h, m in zip(highlights, more_info):
        references.append(f"{h}\n{m}")

    return prompts, references

# Build the user prompts and reference texts for every candidate profile,
# using the default extra instructions.
formatted_prompts, formatted_refs = format_prompts_and_refs(profile)

# Build raw chat-format records (system / user / assistant). Nothing is tokenized
# here; note that the records themselves are materialized eagerly as a Python list.
def create_messages_list(prompts=None, resumes=None, system_prompt=None):
    """Pair each prompt with its target resume as a three-turn chat record.

    All arguments are optional so existing zero-argument calls keep working;
    passing them explicitly avoids relying on notebook-global state.

    Args:
        prompts: Iterable of user prompts. Defaults to the notebook-level
            ``formatted_prompts``.
        resumes: Iterable of assistant targets, aligned with ``prompts``.
            Defaults to ``profile["CV"]``.
        system_prompt: Text of the system turn. Defaults to the notebook-level
            ``system_instruction``.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per (prompt, resume) pair,
        each holding a system, a user and an assistant message in that order.
        Pairs are formed with ``zip``, so the shorter input decides the length.
    """
    if prompts is None:
        prompts = formatted_prompts
    if resumes is None:
        resumes = profile["CV"]
    if system_prompt is None:
        system_prompt = system_instruction

    return [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": resume},
            ]
        }
        for prompt, resume in zip(prompts, resumes)
    ]

# Materialize the chat records and wrap them in a Hugging Face Dataset.
# (The previous log line claimed "lazy loading", but the list is built eagerly.)
print("Creating dataset from chat messages...")
messages_data = create_messages_list()
dataset = Dataset.from_list(messages_data)

# Quick sanity check: dataset size and the start of the first user prompt
# (index 1 of "messages" is the user turn; index 0 is the system turn).
print(f"Dataset size: {len(dataset)} samples")
print("Sample user prompt:")
print(dataset[0]["messages"][1]["content"][:500], "...")

Map:   0%|          | 0/210250 [00:00<?, ? examples/s]

### Candidate details / Job target:
Position: 13 years of exp || Solidity, C#, JavaScript || CTO / VP / Tech Lead Full stack
More info: Who am I:
- 13 years of commercial experience as a software engineer (web projects, customers from Europe and US);
- 5 years in roles of team lead, tech lead, architect (including coding);
- constant learner (books, courses, youtube);
- C1 (Advanced) level of English (IELTS General = 7/9);
- tech languages: C#, JavaScript, Solidity;
- ready to learn Rust/Go.

What can I bring in:
- develop your web/blockchain project from gathering requirements stage to deployment and maintenance;
- build a team of highly qualified and responsible professionals;
- build processes or improve existing ones;
- design architectures, code features, perform code reviews and so on.

What can I technically (in short):
- С#: 12 years of exp; .Net Core, .Net 6, MS SQL, EF, Clean Architecture, anything related to web platforms;
- JavaScript: 7 years of exp; Node.j

In [8]:
# Inspect the first record to check the system / user / assistant message structure.
dataset[0]

{'messages': [{'content': "Below is an instruction that describes a task, paired with an input that provides candidate details and a target job. \nWrite a professional, ATS-friendly resume tailored to the target role. \n### Strategy:\n- Identify key skills from the candidate's past roles.\n- Match these skills to the job description keywords.\n- Prioritize experiences that show measurable achievements.\n\nFirst, produce a concise 2–4 bullet **Plan**. Then generate the resume. \nUse clear section headers. For experience bullets, use the STAR/impact style.\n\nOutput format:\nPlan:\n- <short bullet 1>\n- <short bullet 2>\n\nResume:\n[Use sections: Summary, Experience, Education, Skills, Projects/Certifications]",
   'role': 'system'},
  {'content': '### Candidate details / Job target:\nPosition: 13 years of exp || Solidity, C#, JavaScript || CTO / VP / Tech Lead Full stack\nMore info: Who am I:\r\n- 13 years of commercial experience as a software engineer (web projects, customers from Eur

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, BitsAndBytesConfig

import importlib.util  # stdlib; used only to detect whether flash-attn is installed

model_id = "google/gemma-3-4b-pt" # or `google/gemma-3-12b-pt`, `google/gemma-3-27b-pt`

# Select the model class based on the chosen checkpoint.
if model_id == "google/gemma-3-4b-pt":
    model_class = AutoModelForCausalLM
else:
    model_class = AutoModelForImageTextToText

# Compute capability >= 8 (Ampere or newer) is what this notebook uses as the
# threshold for bfloat16; guard with is_available() so a CUDA-less runtime does
# not crash on the capability query itself.
has_ampere_or_newer = (
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
)
torch_dtype = torch.bfloat16 if has_ampere_or_newer else torch.float16

# Only request FlashAttention-2 when the GPU is Ampere or newer AND the package
# is actually installed; otherwise fall back to the eager attention path
# instead of failing on older GPUs.
use_flash_attention = (
    has_ampere_or_newer and importlib.util.find_spec("flash_attn") is not None
)
attn_implementation = "flash_attention_2" if use_flash_attention else "eager"

model_kwargs = dict(
    attn_implementation=attn_implementation, # "flash_attention_2" on Ampere or newer GPU, "eager" otherwise
    torch_dtype=torch_dtype, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model
)

# 4-bit NF4 quantization with double quantization; compute and storage dtypes
# follow the dtype selected above.
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=model_kwargs['torch_dtype'],
    bnb_4bit_quant_storage=model_kwargs['torch_dtype'],
)

model = model_class.from_pretrained(model_id, **model_kwargs)
# The tokenizer is taken from the instruction-tuned ("-it") checkpoint rather than `model_id`.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
from peft import LoraConfig

# LoRA adapter settings for causal-LM fine-tuning: rank 16 with alpha 16,
# 5% dropout, no bias terms trained, adapters attached to all linear layers.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Left disabled: enable to also save lm_head and embed_tokens when training special tokens.
    # modules_to_save=["lm_head", "embed_tokens"],
)

In [23]:
from trl import SFTConfig

# Training configuration for supervised fine-tuning.
# `max_length` replaces the former `max_seq_length` argument, which the TRL
# version pinned in the install cell (trl==0.21.0) no longer accepts.
args = SFTConfig(
    output_dir="gemma-text-to-sql",         # directory to save and repository id
    max_length=1024,                        # max tokens per sequence; reduced from 2048 to save memory
    packing=True,                           # Groups multiple samples in the dataset into a single sequence
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    gradient_checkpointing_kwargs={"use_reentrant": False},  # use the non-reentrant checkpointing variant
    optim="paged_adamw_8bit",               # use paged adamw optimizer for 8-bit
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    fp16=torch_dtype == torch.float16,      # use float16 precision
    bf16=torch_dtype == torch.bfloat16,     # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # constant learning rate; NOTE(review): verify warmup_ratio takes effect with this scheduler
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    dataset_text_field="",                  # Will use messages field
    dataset_num_proc=4,                     # Parallel processing for dataset
)

In [24]:
from trl import SFTTrainer

# Assemble the supervised fine-tuning trainer from the quantized base model,
# the LoRA settings, the chat dataset and the tokenizer.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    train_dataset=dataset,
    processing_class=tokenizer,
    args=args,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.50 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.45 GiB is free. Process 5906 has 13.29 GiB memory in use. Of the allocated memory 12.64 GiB is allocated by PyTorch, and 524.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Start training. Checkpoints are written to the configured output directory
# (save_strategy="epoch") and, because push_to_hub=True, uploaded to the Hub.
trainer.train()

# Save the final model once more after training has finished
# (this also goes to the Hugging Face Hub when push_to_hub is enabled).
trainer.save_model()

In [18]:
# Free GPU memory held by the training objects.
# Unlike bare `del`, popping from globals() is idempotent: the cell can be re-run,
# and it also works when `trainer` was never created (e.g. after an OOM error).
import gc

for _name in ("model", "trainer"):
    globals().pop(_name, None)

# Collect unreachable objects first so their CUDA tensors are actually released,
# then hand the cached blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from transformers import pipeline

import importlib.util  # stdlib; used only to detect whether flash-attn is installed

model_id = "gemma-text-to-sql"

# FlashAttention-2 is only requested on Ampere-or-newer GPUs (compute capability >= 8)
# with the flash-attn package installed; otherwise use the eager attention path so
# loading also works on older GPUs.
_flash_attention_ok = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
    and importlib.util.find_spec("flash_attn") is not None
)

# Load Model with PEFT adapter
# (`model_class` and `torch_dtype` come from the earlier model-loading cell.)
model = model_class.from_pretrained(
  model_id,
  device_map="auto",
  torch_dtype=torch_dtype,
  attn_implementation="flash_attention_2" if _flash_attention_ok else "eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from random import randint
from transformers import pipeline

# --- Pipeline -----------------------------------------------------------------
# Wrap the model and tokenizer loaded in the previous cell in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# --- Sample selection -----------------------------------------------------------
# Prefer a "test" split when `dataset` exposes one; otherwise fall back to the
# dataset itself (i.e. the training data).
has_test_split = hasattr(dataset, "keys") and "test" in dataset.keys()
data_source = dataset["test"] if has_test_split else dataset

rand_idx = randint(0, len(data_source) - 1)
sample = data_source[rand_idx]

# --- Prompt -----------------------------------------------------------------------
# Keep only the system and user turns; the assistant answer (index 2) is the target.
messages = sample["messages"][:2]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# --- Generation -------------------------------------------------------------------
# Stop on either the tokenizer's EOS token or the end-of-turn marker.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]

generation_kwargs = dict(
    max_new_tokens=1024,   # resumes are long, so allow up to 1024 new tokens
    do_sample=True,        # sample instead of greedy decoding
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    eos_token_id=terminators,
)
outputs = pipe(prompt, **generation_kwargs)

# --- Results ----------------------------------------------------------------------
# The pipeline output starts with the prompt text, so cut it off to keep only the completion.
generated_text = outputs[0]['generated_text'][len(prompt):].strip()
user_input = messages[1]['content']
target_resume = sample["messages"][2]['content']

print(f"--- INPUT PROFILE ---\n{user_input}\n")
print(f"--- TARGET RESUME (Ground Truth) ---\n{target_resume[:500]}...\n[Truncated]\n")
print(f"--- GENERATED RESUME ---\n{generated_text}")