In [1]:
from data_utils import prepare_data_smart, load_data

In [2]:
_, _, _, _, _, _ = prepare_data_smart()

📂 Found existing data file, loading...
📂 Loading data from ./dataset.pkl...
✅ Data loaded! Sizes: Train=4000, Val=1000, Test=5000


In [3]:
X_train, y_train, X_val, y_val, X_test, y_test = load_data()

📂 Loading data from ./dataset.pkl...
✅ Data loaded! Sizes: Train=4000, Val=1000, Test=5000


In [4]:
from common_utils import cleanup_gpu_memory

In [5]:
cleanup_gpu_memory()

🧹 Starting GPU memory cleanup...
   ℹ️  No model variables found to delete
   🗑️  Running garbage collection...
   🔥 Clearing CUDA cache...
   📊 GPU Memory Status:
      Total: 7.62 GB
      Allocated: 0.00 GB (0.0%)
      Reserved: 0.00 GB
      Available: 7.62 GB
   ✅ Memory successfully cleaned! Safe to load new models.


{'total': 7.62261962890625,
 'allocated': 0.0,
 'free': 7.62261962890625,
 'success': True}

In [6]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from tqdm import tqdm
import numpy as np
import pandas as pd
from transformers import BitsAndBytesConfig

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [8]:
from embedding_utils import get_embeddings

#### BASELINE DISTILBERT ####

In [9]:
# 4-bit NF4 quantization settings for the frozen baseline encoder, with
# float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

# The same tokenizer object is reused later for the DistilBERT fine-tuning cells.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# NOTE(review): this baseline encoder is loaded 4-bit quantized, whereas the
# fine-tuned DistilBERT below is created with `.from_pretrained(...).to(device)`
# and no quantization config. The baseline-vs-fine-tuned comparison therefore
# also differs in weight precision -- confirm this is intended.
base_model = AutoModel.from_pretrained(
    "distilbert-base-uncased",
    quantization_config=quantization_config,
    device_map="auto"
)

In [10]:
X_train_base_embed = get_embeddings(X_train, base_model, tokenizer, device, "baseline")
X_val_base_embed = get_embeddings(X_val, base_model, tokenizer, device, "baseline")
X_test_base_embed = get_embeddings(X_test, base_model, tokenizer, device, "baseline")

Extracting baseline embeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:10<00:00, 46.65it/s]
Extracting baseline embeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:02<00:00, 47.92it/s]
Extracting baseline embeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:13<00:00, 47.34it/s]


In [11]:
xgb_base = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric="logloss"
)
xgb_base.fit(X_train_base_embed, y_train)

In [12]:
y_val_pred_base = xgb_base.predict(X_val_base_embed)
y_test_pred_base = xgb_base.predict(X_test_base_embed)

In [13]:
baseline_val_acc = accuracy_score(y_val, y_val_pred_base)*100
baseline_val_f1 = f1_score(y_val, y_val_pred_base)*100
baseline_test_acc = accuracy_score(y_test, y_test_pred_base)*100
baseline_test_f1 = f1_score(y_test, y_test_pred_base)*100

In [14]:
print(f"Baseline Validation - Accuracy: {baseline_val_acc:.4f}, F1: {baseline_val_f1:.4f}")
print(f"Baseline Test - Accuracy: {baseline_test_acc:.4f}, F1: {baseline_test_f1:.4f}")

Baseline Validation - Accuracy: 82.3000, F1: 82.4926
Baseline Test - Accuracy: 82.6200, F1: 82.4125


#### FINETUNE DISTILBERT ####

In [15]:
class SentimentDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings and their labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            entry per sample (e.g. the output of a batched tokenizer call).
        labels: Indexable collection of per-sample labels; its length defines
            the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The label collection, not the encodings, determines the size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``, including ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [16]:
finetuning_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=2
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=512)

In [18]:
train_dataset = SentimentDataset(train_encodings, y_train)
val_dataset = SentimentDataset(val_encodings, y_val)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [20]:
optimizer = AdamW(finetuning_model.parameters(), lr=2e-5, weight_decay=0.01)

In [21]:
num_epochs = 3
best_val_acc = 0
model_file_name = "best_finetuned_model_v2.pt"
finetuning_model.train()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [22]:
# Full fine-tuning loop: one training pass and one validation pass per epoch.
# The state dict is written to `model_file_name` whenever validation accuracy
# improves on the best value seen so far.
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Training: switch back to train mode each epoch because the validation
    # phase below leaves the model in eval mode.
    total_loss = 0
    finetuning_model.train()
    train_loop = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")
    
    for batch in train_loop:
        # Move every tensor of the batch (inputs and labels) to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # The batch contains "labels", so the model output carries the loss.
        outputs = finetuning_model(**batch)
        loss = outputs.loss
        
        # Backprop, update weights, then clear gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
        train_loop.set_postfix(loss=loss.item())
    
    # Mean of the per-batch losses over the epoch.
    avg_train_loss = total_loss / len(train_loader)
    
    # Validation: eval mode and no gradient tracking.
    finetuning_model.eval()
    val_preds = []
    val_true = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = finetuning_model(**batch)
            
            # Predicted class is the index of the largest logit.
            predictions = torch.argmax(outputs.logits, dim=-1)
            val_preds.extend(predictions.cpu().numpy())
            val_true.extend(batch['labels'].cpu().numpy())
    
    # Metrics here are fractions in [0, 1] (not multiplied by 100).
    val_acc = accuracy_score(val_true, val_preds)
    val_f1 = f1_score(val_true, val_preds)
    
    print(f"Epoch {epoch + 1} - Train Loss: {avg_train_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")
    
    # Checkpoint selection uses validation accuracy (strict improvement only).
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # Save best model
        torch.save(finetuning_model.state_dict(), model_file_name)


Epoch 1/3


Training Epoch 1: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [02:04<00:00,  2.01it/s, loss=0.0654]
Validating: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.37it/s]


Epoch 1 - Train Loss: 0.3391, Val Acc: 0.9050, Val F1: 0.9086

Epoch 2/3


Training Epoch 2: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [02:04<00:00,  2.00it/s, loss=0.0962]
Validating: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.37it/s]


Epoch 2 - Train Loss: 0.1655, Val Acc: 0.8660, Val F1: 0.8577

Epoch 3/3


Training Epoch 3: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [02:05<00:00,  2.00it/s, loss=0.00574]
Validating: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.35it/s]


Epoch 3 - Train Loss: 0.0866, Val Acc: 0.9090, Val F1: 0.9157


In [23]:
finetuning_model.load_state_dict(torch.load(model_file_name))

<All keys matched successfully>

In [24]:
finetuned_base_model = finetuning_model.distilbert  

In [25]:
X_train_finetune_embed = get_embeddings(X_train, finetuned_base_model, tokenizer, device, "finetune-distilbert")
X_val_finetune_embed = get_embeddings(X_val, finetuned_base_model, tokenizer, device, "finetune-distilbert")
X_test_finetune_embed = get_embeddings(X_test, finetuned_base_model, tokenizer, device, "finetune-distilbert")

Extracting finetune-distilbert embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:37<00:00, 13.24it/s]
Extracting finetune-distilbert embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:09<00:00, 13.47it/s]
Extracting finetune-distilbert embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:47<00:00, 13.27it/s]


In [26]:
xgb_finetune = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric="logloss"
)
xgb_finetune.fit(X_train_finetune_embed, y_train)

In [27]:
y_val_pred_ft = xgb_finetune.predict(X_val_finetune_embed)
y_test_pred_ft = xgb_finetune.predict(X_test_finetune_embed)

In [28]:
ft_val_acc = accuracy_score(y_val, y_val_pred_ft) * 100
ft_val_f1 = f1_score(y_val, y_val_pred_ft) * 100
ft_test_acc = accuracy_score(y_test, y_test_pred_ft) * 100
ft_test_f1 = f1_score(y_test, y_test_pred_ft) * 100

In [29]:
# Final results comparison
print("\n" + "="*60)
print("FINAL RESULTS COMPARISON (DISTILBERT)")
print("="*60)
print("BASELINE (Pre-trained DistilBert + XGBoost):")
print(f"  Validation - Accuracy: {baseline_val_acc:.4f}, F1: {baseline_val_f1:.4f}")
print(f"  Test       - Accuracy: {baseline_test_acc:.4f}, F1: {baseline_test_f1:.4f}")
print()
print("FINE-TUNED (Fine-tuned DistilBert + XGBoost):")
print(f"  Validation - Accuracy: {ft_val_acc:.4f}, F1: {ft_val_f1:.4f}")
print(f"  Test       - Accuracy: {ft_test_acc:.4f}, F1: {ft_test_f1:.4f}")
print()
# print("IMPROVEMENT:")
# print(f"  Validation - Accuracy: {ft_val_acc - baseline_val_acc:+.4f}, F1: {ft_val_f1 - baseline_val_f1:+.4f}")
# print(f"  Test       - Accuracy: {ft_test_acc - baseline_test_acc:+.4f}, F1: {ft_test_f1 - baseline_test_f1:+.4f}")
print("="*60)


FINAL RESULTS COMPARISON (DISTILBERT)
BASELINE (Pre-trained DistilBert + XGBoost):
  Validation - Accuracy: 82.3000, F1: 82.4926
  Test       - Accuracy: 82.6200, F1: 82.4125

FINE-TUNED (Fine-tuned DistilBert + XGBoost):
  Validation - Accuracy: 91.2000, F1: 91.5709
  Test       - Accuracy: 89.8400, F1: 89.9366



#### BASELINE LLM 

In [None]:
cleanup_gpu_memory()

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from tqdm import tqdm
import numpy as np
import pandas as pd
import requests
import tarfile

In [None]:
# MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# MODEL_NAME = "microsoft/DialoGPT-large"
MODEL_NAME = "Qwen/Qwen3-4B"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

In [None]:
class SentimentDataset(Dataset):
    """Dataset that wraps each review in an instruction prompt and tokenizes it on access.

    This definition replaces the earlier ``SentimentDataset`` declared for the
    DistilBERT section of the notebook (same name, different constructor).

    Args:
        texts: Indexable collection of review texts; its length defines the
            dataset length.
        labels: Indexable collection of integer class labels.
        tokenizer: Callable tokenizer invoked once per item.
        max_length: Length every prompt is truncated / padded to.

    NOTE(review): truncation is applied to the whole prompt, so for long
    reviews the trailing "Sentiment:" cue may be cut off (assuming the
    tokenizer truncates on the right) -- confirm.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` (both 1-D) and a long ``labels`` tensor."""
        review = str(self.texts[idx])
        target = self.labels[idx]

        # Instruction-style prompt wrapped around the raw review text.
        prompt = (
            "Analyze the sentiment of the following movie review. "
            "Classify as positive (1) or negative (0).\n\n"
            f"Review: {review}\n\nSentiment:"
        )

        tokens = self.tokenizer(
            prompt,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns a leading batch axis; flatten it away.
        sample = {name: tokens[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Quantization config for memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

print("Loading base base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

In [None]:
from embedding_utils import get_llm_embeddings

In [None]:
print("Extracting baseline embeddings...")
X_train_base_embed = get_llm_embeddings(X_train, base_model, tokenizer, DEVICE,"baseline-llm")
X_val_base_embed = get_llm_embeddings(X_val, base_model, tokenizer, DEVICE,"baseline-llm")
X_test_base_embed = get_llm_embeddings(X_test, base_model, tokenizer, DEVICE,"baseline-llm")

In [None]:
print("Training baseline XGBoost...")
xgb_base = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric="logloss"
)
xgb_base.fit(X_train_base_embed, y_train)

In [None]:
y_val_pred_base = xgb_base.predict(X_val_base_embed)
y_test_pred_base = xgb_base.predict(X_test_base_embed)

In [None]:
baseline_val_acc = accuracy_score(y_val, y_val_pred_base) * 100
baseline_val_f1 = f1_score(y_val, y_val_pred_base) * 100
baseline_test_acc = accuracy_score(y_test, y_test_pred_base) * 100
baseline_test_f1 = f1_score(y_test, y_test_pred_base) * 100

In [None]:
print(f"Baseline Validation - Accuracy: {baseline_val_acc:.4f}, F1: {baseline_val_f1:.4f}")
print(f"Baseline Test - Accuracy: {baseline_test_acc:.4f}, F1: {baseline_test_f1:.4f}")

#### FINETUNE LLM 

In [None]:
cleanup_gpu_memory()

In [None]:
def check_if_model_has_peft(model):
    """
    Check if model already has PEFT/LoRA adapters.

    Prints a short status report and returns the verdict.

    Args:
        model: Any object; only attribute presence and the class name are inspected.

    Returns:
        bool: True if the model exposes ``peft_config`` or ``peft_modules``, or
        if its class name contains ``"Peft"``; False otherwise.
    """
    # Check for PEFT attributes
    has_peft_config = hasattr(model, 'peft_config')
    has_peft_modules = hasattr(model, 'peft_modules')
    # Reported for information only: every Transformers PreTrainedModel exposes
    # a `base_model` property, so its presence says nothing about PEFT and it
    # must not influence the verdict (it previously made every model look wrapped).
    has_base_model = hasattr(model, 'base_model')

    # Check model class name
    is_peft_model = 'Peft' in model.__class__.__name__

    print(f"🔍 Model PEFT Status Check:")
    print(f"   Model class: {model.__class__.__name__}")
    print(f"   Has peft_config: {has_peft_config}")
    print(f"   Has peft_modules: {has_peft_modules}")
    print(f"   Has base_model: {has_base_model}")
    print(f"   Is PEFT model: {is_peft_model}")

    already_has_peft = any([has_peft_config, has_peft_modules, is_peft_model])

    if already_has_peft:
        print("   ⚠️  Model ALREADY has PEFT/LoRA adapters!")
        return True

    print("   ✅ Model is clean, ready for PEFT application")
    return False

In [None]:
def load_clean_model_for_lora(num_labels=2):
    """
    Load a 4-bit quantized causal-LM backbone without any PEFT adapters, plus its tokenizer.

    The checkpoint named by the notebook-level ``MODEL_NAME`` is loaded with
    ``AutoModelForCausalLM`` (not a sequence-classification head) and passed
    through ``prepare_model_for_kbit_training``.

    Args:
        num_labels: Number of target classes (default: 2 for sentiment). Kept
            for backward compatibility and logging only; the loader does not
            build a classification head from it.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    print(f"🔄 Loading clean model for classification task...")

    # Quantization config
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    # Load tokenizer; fall back to EOS as the pad token when none is defined.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # The log line now states what is actually loaded: a causal LM backbone.
    print(f"   📊 Loading causal LM backbone (num_labels={num_labels} is not applied by the loader)...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    # Prepare the quantized model for training
    model = prepare_model_for_kbit_training(model)

    # Report adapter status (the return value is informational here)
    check_if_model_has_peft(model)

    return model, tokenizer

In [None]:
finetuning_model, tokenizer = load_clean_model_for_lora()

In [None]:
# LoRA adapter configuration for the causal-LM backbone: rank 32 with
# lora_alpha 64 and 10% adapter dropout. Adapters are attached to the
# q/v/o attention projections and the MLP down projection; bias terms are
# left untouched.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "o_proj", "down_proj"],
    bias="none",
    fan_in_fan_out=False,
)
# NOTE(review): this rebinds `finetuning_model` to the wrapped model, so
# re-running the cell would wrap an already-wrapped model -- reload the clean
# model first when re-executing.
finetuning_model = get_peft_model(finetuning_model, lora_config)
finetuning_model.print_trainable_parameters()

In [None]:
# Create datasets
train_dataset = SentimentDataset(X_train, y_train, tokenizer)
val_dataset = SentimentDataset(X_val, y_val, tokenizer)

In [None]:
# Custom trainer for sentiment classification
class SentimentTrainer(Trainer):
    """Trainer that turns a causal LM into a 2-class classifier via a linear head.

    The head maps the language-model logits of the last real (non-padding)
    token to two classes and is created lazily on the first forward pass,
    once the logit width is known.

    The head lives on the trainer, not on ``model``, so the optimizer built by
    ``Trainer`` from the model's parameters does not contain it. It is
    therefore added to the optimizer explicitly as soon as both exist;
    otherwise it would stay at its random initialisation for the whole run.

    NOTE(review): because the head is not part of ``model``, it is presumably
    not written by ``save_model`` -- persist it separately if it is needed
    after training.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize classifier layer - will be created on first forward pass
        self.classifier = None

    def _attach_classifier_to_optimizer(self):
        """Add the head's parameters to the active optimizer exactly once."""
        optimizer = getattr(self, "optimizer", None)
        if self.classifier is None or optimizer is None:
            # Nothing to attach yet (e.g. evaluation before training started).
            return
        head_weight = self.classifier.weight
        already_tracked = any(
            param is head_weight
            for group in optimizer.param_groups
            for param in group["params"]
        )
        if not already_tracked:
            optimizer.add_param_group(
                {"params": list(self.classifier.parameters()), "lr": self.args.learning_rate}
            )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute cross-entropy over two classes from the last real token's logits.

        Handles both old and new Transformers versions: **kwargs absorbs any
        extra arguments such as 'num_items_in_batch'.

        Args:
            model: The model being trained / evaluated.
            inputs: Batch dict; must contain "labels" and "input_ids".
                "labels" is removed from the dict before the forward pass.
            return_outputs: When True, return ``(loss, outputs)``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")

        # Forward pass through the model
        outputs = model(**inputs)
        logits = outputs.logits

        # Locate the last attended position per row. Using the highest index
        # whose mask is 1 gives the same result as `mask.sum() - 1` for
        # right-padded batches and stays correct for left-padded ones.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            attention_mask = torch.ones_like(inputs["input_ids"])
        positions = torch.arange(attention_mask.shape[1], device=attention_mask.device)
        is_real_token = (attention_mask == 1).to(positions.dtype)
        last_token_index = (positions * is_real_token).argmax(dim=1).to(logits.device)

        batch_size = logits.shape[0]
        row_index = torch.arange(batch_size, device=logits.device)
        last_token_logits = logits[row_index, last_token_index]

        # Create classifier if it doesn't exist
        if self.classifier is None:
            self.classifier = nn.Linear(last_token_logits.shape[-1], 2).to(logits.device)
            print(f"Created classifier: {last_token_logits.shape[-1]} -> 2")

        # Make sure the head is actually updated by the optimizer.
        self._attach_classifier_to_optimizer()

        # Project to 2 classes (positive/negative)
        classification_logits = self.classifier(last_token_logits)

        # Compute loss
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(classification_logits, labels.to(classification_logits.device))

        return (loss, outputs) if return_outputs else loss

In [None]:
# Trainer hyper-parameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./qwen-models/",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 samples per optimizer step per device
    warmup_steps=100,
    learning_rate=2e-4,
    logging_steps=50,
    # Evaluate and checkpoint once per epoch so the best checkpoint
    # (lowest eval loss) can be restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # The string "none" disables experiment trackers. Passing None is treated
    # as "all installed integrations" by Transformers 4.x.
    report_to="none",
    dataloader_pin_memory=False,
    fp16=True,
)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
trainer = SentimentTrainer(
    model=finetuning_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [None]:
print("Starting fine-tuning...")
trainer.train()

In [None]:
print("Saving fine-tuned model...")
trainer.save_model("./finetuned")

In [None]:
cleanup_gpu_memory()

In [None]:
def load_finetuned_model_memory_efficient(model_path="./finetuned", base_model_name="Qwen/Qwen3-4B"):
    """
    Load model and re-apply quantization for memory efficiency.

    The base checkpoint is loaded 4-bit quantized and the LoRA adapters stored
    at ``model_path`` are attached on top of it.

    Args:
        model_path: Directory containing the saved adapters (and, if present,
            the tokenizer files).
        base_model_name: Checkpoint used for the base model, and for the
            tokenizer when none can be loaded from ``model_path``.

    Returns:
        tuple: ``(finetuned_model, tokenizer)`` with the model in eval mode.
    """
    print("🔄 Loading fine-tuned model with memory optimization...")

    # Load tokenizer; fall back to the base checkpoint if the fine-tuned
    # directory has no usable tokenizer. Catch Exception (not a bare except)
    # so KeyboardInterrupt / SystemExit still propagate, and say why we fell back.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    except Exception as exc:
        print(f"⚠️  Could not load tokenizer from {model_path} ({exc}); using {base_model_name} instead")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load base model with quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    print("📦 Loading base model with quantization...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=quantization_config,  # Re-apply quantization
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    # Load LoRA adapters
    print("🔧 Loading LoRA adapters...")
    from peft import PeftModel
    finetuned_model = PeftModel.from_pretrained(base_model, model_path)
    # The model is used for embedding extraction only; eval mode keeps dropout
    # (the adapters were trained with lora_dropout=0.1) from perturbing the features.
    finetuned_model.eval()

    print("✅ Model loaded with quantization preserved!")

    # Check memory
    if torch.cuda.is_available():
        memory_gb = torch.cuda.memory_allocated() / 1024**3
        print(f"📊 GPU Memory: {memory_gb:.2f}GB")

    return finetuned_model, tokenizer

In [None]:
finetuned_base_model, tokenizer = load_finetuned_model_memory_efficient("./finetuned")

In [None]:
X_train_ft_embed = get_llm_embeddings(X_train, finetuned_base_model, tokenizer, DEVICE,"llm-fine-tuned")
X_val_ft_embed = get_llm_embeddings(X_val, finetuned_base_model, tokenizer, DEVICE,"llm-fine-tuned")
X_test_ft_embed = get_llm_embeddings(X_test, finetuned_base_model, tokenizer, DEVICE,"llm-fine-tuned")

In [None]:
print("Training XGBoost on fine-tuned embeddings...")
xgb_ft = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric="logloss"
)
xgb_ft.fit(X_train_ft_embed, y_train)

In [None]:
# Evaluate fine-tuned model
y_val_pred_ft = xgb_ft.predict(X_val_ft_embed)
y_test_pred_ft = xgb_ft.predict(X_test_ft_embed)

In [None]:
ft_val_acc = accuracy_score(y_val, y_val_pred_ft) * 100
ft_val_f1 = f1_score(y_val, y_val_pred_ft) * 100
ft_test_acc = accuracy_score(y_test, y_test_pred_ft) * 100
ft_test_f1 = f1_score(y_test, y_test_pred_ft) * 100

In [None]:
# Final results comparison
print("\n" + "="*60)
print("FINAL RESULTS COMPARISON (Qwen)")
print("="*60)
# print("BASELINE (Pre-trained Qwen + XGBoost):")
# print(f"  Validation - Accuracy: {baseline_val_acc:.4f}, F1: {baseline_val_f1:.4f}")
# print(f"  Test       - Accuracy: {baseline_test_acc:.4f}, F1: {baseline_test_f1:.4f}")
# print()
print("FINE-TUNED (Fine-tuned Qwen + XGBoost):")
print(f"  Validation - Accuracy: {ft_val_acc:.4f}, F1: {ft_val_f1:.4f}")
print(f"  Test       - Accuracy: {ft_test_acc:.4f}, F1: {ft_test_f1:.4f}")
print()
# print("IMPROVEMENT:")
# print(f"  Validation - Accuracy: {ft_val_acc - baseline_val_acc:+.4f}, F1: {ft_val_f1 - baseline_val_f1:+.4f}")
# print(f"  Test       - Accuracy: {ft_test_acc - baseline_test_acc:+.4f}, F1: {ft_test_f1 - baseline_test_f1:+.4f}")
# print("="*60)