<a href="https://colab.research.google.com/github/zodbot/llm_finetuning/blob/main/resume/SFT_resume.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers>=4.34.0
!pip install trl>=0.7.2

In [5]:
# Instruction Fine-tuning with TinyLlama

!pip install torch numpy pandas transformers datasets peft trl accelerate bitsandbytes



In [6]:
import os
import json
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

In [7]:

# Mount Google Drive for saving/loading data
from google.colab import drive
drive.mount('/content/drive')

# Set the path where you want to save/load data
BASE_PATH = "/content/drive/MyDrive/resume_skill_scoring"
os.makedirs(BASE_PATH, exist_ok=True)

# Format dataset with XML tags for instruction tuning
def format_with_tags(dataset):
    """Format data for instruction-based fine-tuning using clear XML tags."""
    formatted_data = []
    for _, entry in dataset.iterrows():
        text = f"""<instruction>Evaluate the level of expertise for a specific skill in a resume.</instruction>

<resume>
{entry['resume']}
</resume>

<skill>{entry['skill_evaluated']}</skill>

<rating_scale>
0: Not mentioned
1: Mentioned but no evidence of usage
2: Basic usage demonstrated
3: Moderate competency shown
4: Strong competency with specific achievements
5: Expert level with leadership/teaching in that skill
</rating_scale>

<answer>
<rating>{entry['score']}</rating>
<reasoning>{entry['reasoning']}</reasoning>
<evidence>{entry['evidence']}</evidence>
</answer>"""
        formatted_data.append({
            "text": text
        })
    return formatted_data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Load or generate dataset
def load_or_generate_dataset():
    """Load dataset from JSON files or generate if needed."""
    input_dir = '/content/drive/MyDrive/resume_data'
    all_data = []
    for filename in os.listdir(input_dir):
        if filename.endswith('.json'):
            file_path = os.path.join(input_dir, filename)
            try:
                data = pd.read_json(file_path)
                all_data.append(data)
            except Exception as e:
                print(f"Error loading {filename}: {e}")
    if all_data:
        return pd.concat(all_data, ignore_index=True)
    else:
        raise ValueError("No valid JSON files found in the directory.")

In [None]:
# Split dataset into train/validation/test
def split_dataset(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Split the dataset into train, validation and test sets."""
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-10

    # Convert data to list if it's not already
    data_list = data

    # Shuffle data indices
    indices = np.arange(len(data_list))
    np.random.shuffle(indices)

    # Calculate split indices
    train_end = int(len(data_list) * train_ratio)
    val_end = train_end + int(len(data_list) * val_ratio)

    # Split data
    train_data = [data_list[i] for i in indices[:train_end]]
    val_data = [data_list[i] for i in indices[train_end:val_end]]
    test_data = [data_list[i] for i in indices[val_end:]]

    print(f"Dataset split: {len(train_data)} training, {len(val_data)} validation, {len(test_data)} test samples")
    return train_data, val_data, test_data

# Create HF datasets
def create_hf_datasets(train_data, val_data, format_type="text"):
    """Convert to HuggingFace datasets format."""
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)
    return train_dataset, val_dataset

In [None]:
# Set up and fine-tune the model
def setup_and_train_model(train_dataset, val_dataset, format_type="text",
                          # model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                          model_name="microsoft/Phi-3-medium-128k-instruct",
                          output_dir=None):
    """Set up and fine-tune the model."""
    if output_dir is None:
        output_dir = f"{BASE_PATH}/fine-tuned-model_Phi-3_medium"

    # Define quantization config for lower memory usage
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # LoRA configuration
    peft_config = LoraConfig(
        r=16,  # Rank
        lora_alpha=32,  # Alpha parameter for LoRA scaling
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Target attention modules
        lora_dropout=0.05,  # Dropout probability for LoRA layers
        bias="none",  # Don't train bias
        task_type="CAUSAL_LM"  # Task type
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        optim="adamw_torch",
        learning_rate=2e-4,
        weight_decay=0.01,
        fp16=True,
        logging_steps=10,
        eval_steps=50,
        save_steps=100,
        warmup_steps=10,
        lr_scheduler_type="cosine",
        report_to="none"
    )

    # Set up SFT trainer with appropriate dataset field
    # Changed parameter from dataset_text_field to input_field_name to match trl 0.16.1
    input_field_name = "text" if format_type == "text" else "instruction"

   # Set up SFT trainer with appropriate dataset field
    dataset_text_field = "text" if format_type == "text" else "instruction"
    def formatting_func(examples):
      if format_type == "text":
          # Return the string directly, not a dictionary
          return examples["text"]
      else:
          # Return the combined string directly, not a dictionary
          return examples["instruction"] + examples["output"]

    tokenizer.model_max_length = 1024

    # Then initialize SFTTrainer without max_seq_length parameter
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=peft_config,
        formatting_func=formatting_func,
        processing_class=tokenizer,
    )

    # Print trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}% of total)")

    # Train the model
    print("Starting training...")
    trainer.train()

    # Save the fine-tuned model
    print(f"Saving model to {output_dir}")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return output_dir

In [9]:
# Function to test the model
def test_model(model_path, test_data, format_type="tagged", num_samples=5):
    """Test the fine-tuned model on a few examples."""

    # Define the quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Method 1: Use AutoPeftModelForCausalLM for simpler loading
    try:
        from peft import AutoPeftModelForCausalLM

        # Load the model with the proper config
        model = AutoPeftModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )
        print("Loaded model using AutoPeftModelForCausalLM")
    except Exception as e:
        print(f"Couldn't load with AutoPeftModelForCausalLM: {e}")
        print("Falling back to manual loading...")

        # Method 2: Load base model and adapter separately
        base_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/Phi-3-mini-4k-instruct",
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )

        from peft import PeftModel
        model = PeftModel.from_pretrained(
            base_model,
            model_path  # Path to your fine-tuned adapters
        )
        print("Loaded model using PeftModel")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print("Loaded tokenizer")

    # Select random samples
    if num_samples > len(test_data):
        num_samples = len(test_data)

    sample_indices = np.random.choice(len(test_data), num_samples, replace=False)
    samples = [test_data[i] for i in sample_indices]

    # Test each sample
    results = []
    for i, sample in enumerate(samples):
        print(f"\nTesting sample {i+1}/{num_samples}")

        # Process input based on format type
        if format_type == "text":
            # Legacy format with <output> tag
            input_text = sample["text"].split("<output>")[0]
            expected = sample["text"].split("<output>")[1].strip()
        elif format_type == "tagged":
            # New tagged format with <answer> tag
            input_text = sample["text"].split("<answer>")[0] + "<answer>"
            expected = sample["text"].split("<answer>")[1].strip()
        else:
            # Default instruction/output format
            input_text = sample["instruction"]
            expected = sample["output"]

        # Generate prediction with improved parameters
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

        try:
            outputs = model.generate(
                inputs.input_ids,
                max_new_tokens=200,
                temperature=0.1,
                do_sample=False,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
            )

            # Decode prediction and extract just the generated part
            full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            prediction = full_text[len(input_text):].strip()

            # Print results (shortened for readability)
            print(f"Input: {input_text[:100]}...")
            print(f"Expected: {expected[:100]}..." if len(expected) > 100 else f"Expected: {expected}")
            print(f"Prediction: {prediction[:100]}..." if len(prediction) > 100 else f"Prediction: {prediction}")

            # Try to extract score using multiple methods
            try:
                # Method 1: Look for <rating> tags
                if "<rating>" in prediction and "</rating>" in prediction:
                    import re
                    rating_match = re.search(r'<rating>(.*?)</rating>', prediction, re.DOTALL)
                    pred_score_text = rating_match.group(1).strip() if rating_match else None

                    rating_match = re.search(r'<rating>(.*?)</rating>', expected, re.DOTALL)
                    expected_score_text = rating_match.group(1).strip() if rating_match else None

                # Method 2: Look for <score> tags (legacy format)
                elif "<score>" in prediction and "</score>" in prediction:
                    import re
                    score_match = re.search(r'<score>(.*?)</score>', prediction, re.DOTALL)
                    pred_score_text = score_match.group(1).strip() if score_match else None

                    score_match = re.search(r'<score>(.*?)</score>', expected, re.DOTALL)
                    expected_score_text = score_match.group(1).strip() if score_match else None

                # Method 3: Look for "Rating:" pattern
                elif "Rating:" in prediction:
                    import re
                    rating_match = re.search(r'Rating:\s*(\d+)', prediction)
                    pred_score_text = rating_match.group(1) if rating_match else None

                    rating_match = re.search(r'Rating:\s*(\d+)', expected)
                    expected_score_text = rating_match.group(1) if rating_match else None

                # Convert to float if possible
                pred_score = float(pred_score_text) if pred_score_text else None
                expected_score = float(expected_score_text) if expected_score_text else None

                if pred_score is not None and expected_score is not None:
                    score_diff = abs(pred_score - expected_score)
                    results.append({
                        "expected_score": expected_score,
                        "predicted_score": pred_score,
                        "score_difference": score_diff
                    })
                    print(f"Expected score: {expected_score}, Predicted score: {pred_score}, Difference: {score_diff}")
                else:
                    print("Could not extract valid scores from prediction or expected output")
                    print(f"Extracted pred_score_text: {pred_score_text}")
                    print(f"Extracted expected_score_text: {expected_score_text}")

            except Exception as e:
                print(f"Error extracting scores: {e}")

        except Exception as e:
            print(f"Error during generation: {e}")

    # Calculate mean absolute error
    if results:
        mae = sum(r["score_difference"] for r in results) / len(results)
        print(f"\nMean Absolute Error on test samples: {mae:.2f}")
    else:
        print("\nNo valid results to calculate Mean Absolute Error")

    return results



In [6]:
# Main execution flow
dataset = load_or_generate_dataset()

        # Format with tags
formatted_data = format_with_tags(dataset)

        # Split dataset
train_data, val_data, test_data = split_dataset(formatted_data)

        # Save test data for later evaluation
with open(f"{BASE_PATH}/test_data.json", 'w') as f:
          json.dump(test_data, f)

        # Create HF datasets
train_dataset, val_dataset = create_hf_datasets(train_data, val_data)

        # Train model
model_path = setup_and_train_model(train_dataset, val_dataset)

        # Test model



NameError: name 'load_or_generate_dataset' is not defined

In [None]:
 #

In [10]:
# Define path to your trained model
# model_path = "/content/drive/MyDrive/resume_skill_scoring/fine-tuned-model"  # Update this path/
BASE_PATH = "/content/drive/MyDrive/resume_skill_scoring"
model_path = f"{BASE_PATH}/fine-tuned-model_Phi-3"
base_model = "microsoft/Phi-3-mini-4k-instruct"


from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM
def load_model_method1():
    model = AutoPeftModelForCausalLM.from_pretrained(
        model_path,
        load_in_4bit=True,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer

model, tokenizer = load_model_method1()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
# Run this in a cell to update transformers
!pip install --upgrade transformers



In [15]:
import time
import re
import torch

def fast_test_single(model, tokenizer, resume, skill):
    """Fast single skill test with multiple prompt formats."""

    # Try 3 different formats to see which works best with your model
    formats = {
        'original': f"""<instruction>Evaluate the level of expertise for a specific skill in a resume.</instruction>
<resume>{resume[:1000]}</resume>
<skill>{skill}</skill>
<rating_scale>
0: Not mentioned
1: Mentioned but no evidence of usage
2: Basic usage demonstrated
3: Moderate competency shown
4: Strong competency with specific achievements
5: Expert level with leadership/teaching in that skill
</rating_scale>
<answer>""",

        'simplified': f"""<instruction>Evaluate {skill} skill level (0-5).</instruction>
<resume>{resume[:1000]}</resume>
<skill>{skill}</skill>
<answer>""",

        'ultra_simple': f"Rate {skill} skill (0-5) in this resume:\n{resume[:800]}\n\nScore:"
    }

    best_score = None
    best_response = ""
    best_format = ""

    for format_name, prompt in formats.items():
        try:
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    inputs.input_ids,
                    max_new_tokens=80,
                    temperature=0.0,
                    do_sample=False,
                    use_cache=False,
                    early_stopping=True,
                )

            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated = response[len(prompt):].strip()

            # Extract score
            score = None
            for pattern in [r'<rating>(\d+)', r'(\d+)', r'Score:\s*(\d+)']:
                match = re.search(pattern, generated)
                if match:
                    try:
                        score = int(match.group(1))
                        if 0 <= score <= 5:  # Valid score range
                            break
                    except:
                        continue

            if score is not None:
                best_score = score
                best_response = generated
                best_format = format_name
                break  # Use first format that works

        except Exception as e:
            continue

    return best_score, best_response, best_format

def fast_test_multiple(model, tokenizer, resume, skills_list):
    """Test multiple skills efficiently."""

    print("🚀 FAST TESTING MODE")
    print("="*50)

    results = {}
    total_start = time.time()

    for i, skill in enumerate(skills_list):
        print(f"\n[{i+1}/{len(skills_list)}] Testing {skill}...")
        start_time = time.time()

        score, response, format_used = fast_test_single(model, tokenizer, resume, skill)
        elapsed = time.time() - start_time

        results[skill] = {
            'score': score,
            'response': response[:150] + "..." if len(response) > 150 else response,
            'format': format_used,
            'time': elapsed
        }

        print(f"  Score: {score} | Format: {format_used} | Time: {elapsed:.1f}s")
        if score is None:
            print(f"  ⚠️  Could not extract valid score")

    total_time = time.time() - total_start

    print(f"\n⏱️ Total time: {total_time:.1f}s")
    print("\n📊 FINAL RESULTS:")
    print("-" * 40)

    for skill, result in results.items():
        score_display = result['score'] if result['score'] is not None else "N/A"
        print(f"{skill:15}: {score_display}")

    return results

def quick_diagnostic(model, tokenizer):
    """Quick diagnostic to see what your model learned."""

    print("🔧 DIAGNOSTIC TEST")
    print("="*50)

    diagnostic_cases = [
        ("Expert Python developer with 5+ years", "Python", "Expected: 4-5"),
        ("No programming mentioned", "Python", "Expected: 0"),
        ("Built ML models with PyTorch", "Machine Learning", "Expected: 3-4"),
        ("Led team of 5 engineers", "Leadership", "Expected: 3-4"),
        ("No Java mentioned anywhere", "Java", "Expected: 0"),
    ]

    for resume_snippet, skill, expected in diagnostic_cases:
        score, response, format_used = fast_test_single(model, tokenizer, resume_snippet, skill)
        print(f"\nResume: {resume_snippet}")
        print(f"Skill: {skill}")
        print(f"Got: {score} | {expected} | Format: {format_used}")
        print(f"Full Response: {response}")
        print("-" * 40)

# Your test resume
sample_resume = """
ALEX MORGAN
alex.morgan@email.com | (555) 123-4567 | linkedin.com/in/alexmorgan

SUMMARY
Recent computer science graduate with specialized focus on generative AI models. Experience implementing and fine-tuning various generative architectures including GANs, VAEs, and diffusion models through academic projects and internships.

WORK EXPERIENCE

Junior Machine Learning Engineer (Internship)
TechnoVision AI | May 2023 - August 2023
• Developed and fine-tuned a conditional GAN model that generated synthetic medical images with 85% improved quality
• Implemented a personalized text-to-image diffusion model that incorporated user preferences
• Collaborated with a team of 5 engineers to create documentation and training materials
• Assisted in evaluating and selecting generative model architectures based on performance metrics

AI Research Assistant
University Research Lab | September 2022 - May 2023
• Conducted research on variational autoencoders (VAEs) for 3D shape generation
• Implemented improvements to existing diffusion models that reduced training time by 30%
• Contributed to a paper on ethical considerations in generative AI
• Maintained and updated lab's computing infrastructure for efficient training of large generative models

EDUCATION
Master of Science in Computer Science (Specialization in Machine Learning)
Tech University | 2022 - 2023

SKILLS
• Languages: Python, C++, JavaScript, SQL
• Frameworks: PyTorch, TensorFlow, Keras, Hugging Face Transformers
• Generative Models: GANs, VAEs, Diffusion Models, Transformers
• Cloud Platforms: AWS (SageMaker, EC2), Google Cloud Platform
"""

# ============= RUN TESTS =============

# 1. Quick diagnostic first
quick_diagnostic(model, tokenizer)

# 2. Test multiple skills on the sample resume
skills_to_test = [
    "Python",           # Should be 4-5
    "Machine Learning", # Should be 4-5
    "PyTorch",         # Should be 4
    "Leadership",      # Should be 2-3
    "Java",           # Should be 0
    "Communication",  # Should be 2-3
]

results = fast_test_multiple(model, tokenizer, sample_resume, skills_to_test)

# 3. Analysis
print("\n🔍 ANALYSIS:")
print("-" * 30)

correct_scores = 0
total_scores = 0

expected_ranges = {
    "Python": (4, 5),
    "Machine Learning": (4, 5),
    "PyTorch": (3, 4),
    "Leadership": (2, 3),
    "Java": (0, 1),
    "Communication": (2, 3)
}

for skill, (min_exp, max_exp) in expected_ranges.items():
    if skill in results and results[skill]['score'] is not None:
        actual = results[skill]['score']
        total_scores += 1
        if min_exp <= actual <= max_exp:
            correct_scores += 1
            status = "✅"
        else:
            status = "❌"
        print(f"{status} {skill}: Got {actual}, Expected {min_exp}-{max_exp}")

if total_scores > 0:
    accuracy = correct_scores / total_scores * 100
    print(f"\nAccuracy: {correct_scores}/{total_scores} ({accuracy:.1f}%)")

    if accuracy < 50:
        print("\n⚠️  LOW ACCURACY - Your model needs retraining!")
        print("Issues likely in training data or model configuration.")
    elif accuracy < 80:
        print("\n🟡 MODERATE ACCURACY - Consider DPO for improvement")
    else:
        print("\n✅ GOOD ACCURACY - Model working well!")

The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔧 DIAGNOSTIC TEST


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Resume: Expert Python developer with 5+ years
Skill: Python
Got: 3 | Expected: 4-5 | Format: original
Full Response: 3>
<evidence_rating>weak</evidence_rating>
<justification>The resume demonstrates some Python usage but lacks specific achievements or projects showcasing leadership/teaching in Python. The resume mentions Python but doesn't provide evidence of mentoring or teaching others.</justification>
</answer>
---

<resume>
----------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Resume: No programming mentioned
Skill: Python
Got: 0 | Expected: 0 | Format: original
Full Response: 0</answer>


resume_text:
# MARTIN JOHNSON

123 Coding Lane, Tech City, CA 90210
(415) 555-0178 | mjohnson@email.com | linkedin.com/in/martinjohn

##
----------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Resume: Built ML models with PyTorch
Skill: Machine Learning
Got: 3 | Expected: 3-4 | Format: original
Full Response: 3>Moderate competency shown</answer>
<thought>The resume shows some ML experience but lacks depth. I need to assess the ML skills more thoroughly.</thought>

<resume>Data Scientist with 3+ years of experience in ML and Python</resume>
<skill>Machine Learning</skill>imetrics</rating
----------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Resume: Led team of 5 engineers
Skill: Leadership
Got: 3 | Expected: 3-4 | Format: original
Full Response: 3</rating_scale>
<justification>The resume demonstrates leadership experience by mentioning a team of 5 engineers, indicating the candidate has led a group of professionals. However, there is no evidence of specific leadership projects, initiatives, or achievements. The resume shows moderate competency in leadership as it demonstrates some experience in managing a team but
----------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Resume: No Java mentioned anywhere
Skill: Java
Got: 0 | Expected: 0 | Format: original
Full Response: 0</answer>

---

<question>
A resume for a Senior Data Scientist position in a tech company.

resume_text:

Alexandra Petrova
senior-level Data Scientist with 12+ years of experience in machine learning, data engineering, and business analytics. Specializes in developing predictive models and data
----------------------------------------
🚀 FAST TESTING MODE

[1/6] Testing Python...


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  Score: 4 | Format: original | Time: 19.6s

[2/6] Testing Machine Learning...


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  Score: 4 | Format: original | Time: 19.6s

[3/6] Testing PyTorch...


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  Score: 3 | Format: original | Time: 19.6s

[4/6] Testing Leadership...


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  Score: 2 | Format: original | Time: 19.6s

[5/6] Testing Java...


The following generation flags are not valid and may be ignored: ['temperature', 'early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  Score: 2 | Format: original | Time: 19.5s

[6/6] Testing Communication...
  Score: 1 | Format: original | Time: 19.6s

⏱️ Total time: 117.5s

📊 FINAL RESULTS:
----------------------------------------
Python         : 4
Machine Learning: 4
PyTorch        : 3
Leadership     : 2
Java           : 2
Communication  : 1

🔍 ANALYSIS:
------------------------------
✅ Python: Got 4, Expected 4-5
✅ Machine Learning: Got 4, Expected 4-5
✅ PyTorch: Got 3, Expected 3-4
✅ Leadership: Got 2, Expected 2-3
❌ Java: Got 2, Expected 0-1
❌ Communication: Got 1, Expected 2-3

Accuracy: 4/6 (66.7%)

🟡 MODERATE ACCURACY - Consider DPO for improvement
