## Step 1: Import necessary modules and load configurations

# BEAUTY, CHATGPT DATA

In [1]:
import sys
import json
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForSeq2Seq, 
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training
from datasets import Dataset

# Set environment variable for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load configuration settings
from config import TOKENIZER_PATH, MODEL_PATH, PIPELINE_PARAMS, QLORA_PARAMS, ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS

# Verification
print("Configuration Loaded:")
print("Tokenizer Path:", TOKENIZER_PATH)
print("Model Path:", MODEL_PATH)
print("Pipeline Parameters:", PIPELINE_PARAMS)
print("QLoRA Parameters:", QLORA_PARAMS)
print("Prompt Template:", ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS)

  from .autonotebook import tqdm as notebook_tqdm


Configuration Loaded:
Tokenizer Path: models/hf-frompretrained-download/meta-llama/Meta-Llama-3-8B-Instruct
Model Path: models/hf-frompretrained-downloadmeta-llama/Meta-Llama-3-8B-Instruct
Pipeline Parameters: {'max_length': 2048, 'num_return_sequences': 1, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.95, 'repetition_penalty': 1.2}
QLoRA Parameters: {'lora_r': 8, 'lora_alpha': 8, 'lora_dropout': 0.01, 'lora_target_modules': ['q_proj', 'v_proj'], 'gradient_accumulation_steps': 2, 'lora_num_epochs': 2, 'lora_val_iterations': 100, 'lora_early_stopping_patience': 10, 'lora_lr': 0.0001, 'lora_micro_batch_size': 1}
Prompt Template: {'instruction': "### Instruction:\n You are a recommender system specialized. Based on the following user profile text, generate a list of general candidate item categories that align with the user's preferences and interests. Approach this task by treating these categories as a cohesive set, ensuring that they collectively reflect the user’s overall profile and ma

## Step 2: Load and verify training data

In [2]:

# Load the training data
data_path = "QLoRa_finetuning/matching_ids_chatGPT.json"
with open(data_path, "r") as file:
    training_data = json.load(file)
    
#torch.cuda.empty_cache()

# Verify data structure
print("Data Structure Verification:")
for i, sample in enumerate(training_data[:2]):
    assert "User_ID" in sample, f"User_ID missing in sample {i}"
    assert "User_Profile" in sample, f"User_Profile missing in sample {i}"
    assert "Candidate_Items" in sample, f"Candidate_Items missing in sample {i}"
print("Data verification successful!")

Data Structure Verification:
Data verification successful!


## Step 3: Initialize the Tokenizer and Model with Quantization

In [3]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS if not already set

# Set 4-bit quantization configuration for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',  # Use NormalFloat4 for better memory efficiency
    bnb_4bit_use_double_quant=True  # Double quantization for more memory saving
)

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto"  # Automatically maps layers to available GPU memory
)


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.08s/it]


## Step 4: Preprocessing Function to Match Reviews with Profiles

In [4]:
# Preprocess function to format the data for candidate item generation
def preprocess_function(profile_sample):
    # Use the user profile as input
    input_text = ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['input'].replace(
        "{user_profile}", profile_sample["User_Profile"]
    )
    
    # Set up the output text (Candidate Items) as the expected response
    output_text = "\n".join(
        [f"{i + 1}. {item}" for i, item in enumerate(profile_sample["Candidate_Items"].values())]
    )
    
    # Format the complete prompt for training
    full_text = f"### Instruction:\n{ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['instruction']}\n\n{input_text}\n\n### Response:\nCandidate Item Categories:\n{output_text}"
    return full_text

# Verify preprocessing
print("Preprocessed Sample:", preprocess_function(training_data[10]))

Preprocessed Sample: ### Instruction:
### Instruction:
 You are a recommender system specialized. Based on the following user profile text, generate a list of general candidate item categories that align with the user's preferences and interests. Approach this task by treating these categories as a cohesive set, ensuring that they collectively reflect the user’s overall profile and maximize satisfaction. 

### Input 
 User Profile: 
 "Short-Term Interests": The user has recently engaged with beauty products focusing on eye makeup and nail care. Specifically, they have reviewed mascaras that enhance lash length and volume, gel nail polish sets with unique colors, nail lamps, and hair styling tools.
"Long-Term Preferences": An analysis of the user's reviews reveals consistent themes:
* Interest in makeup products that enhance appearance, particularly eyes and nails
* Preference for unique and high-quality nail polishes, including indie-style and seasonal colors
* Appreciation for effecti

## Step 5: Tokenize and Prepare Data

In [5]:
# Tokenization function
def tokenize_function(sample):
    processed_text = preprocess_function(sample)
    tokenized = tokenizer(
        processed_text,
        truncation=True,
        max_length=PIPELINE_PARAMS['max_length'],
        padding="max_length",
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()  # Set labels identical to input_ids
    return tokenized

# LoRA Configuration
lora_config = LoraConfig(
    r=QLORA_PARAMS['lora_r'],
    lora_alpha=QLORA_PARAMS['lora_alpha'],
    lora_dropout=QLORA_PARAMS['lora_dropout'],
    target_modules=QLORA_PARAMS['lora_target_modules'],
    bias="none",
    task_type="CAUSAL_LM"
)

## Step 6: Configure LoRA and Training Parameters

In [6]:
# Apply LoRA configuration to the model
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
model = get_peft_model(model, lora_config)
# Training sizes
training_sizes = [16,32,64]

# Loop through different training sizes
for train_size in training_sizes:
    # Split the dataset
    train_data = training_data[:train_size]
    eval_data = training_data[train_size:train_size + int(0.2 * train_size)]  # 20% of training data for evaluation

    # Tokenize datasets
    train_tokenized_data = [tokenize_function(sample) for sample in train_data]
    eval_tokenized_data = [tokenize_function(sample) for sample in eval_data]

    # Convert tokenized data to Dataset format
    train_dataset = Dataset.from_dict({
        "input_ids": [x["input_ids"][0] for x in train_tokenized_data],
        "attention_mask": [x["attention_mask"][0] for x in train_tokenized_data],
        "labels": [x["labels"][0] for x in train_tokenized_data]
    })
    eval_dataset = Dataset.from_dict({
        "input_ids": [x["input_ids"][0] for x in eval_tokenized_data],
        "attention_mask": [x["attention_mask"][0] for x in eval_tokenized_data],
        "labels": [x["labels"][0] for x in eval_tokenized_data]
    })

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"outputs/adapter_test_candidate_items_epoch_{QLORA_PARAMS['lora_num_epochs']}_{train_size}_chatGPT_data",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=QLORA_PARAMS['gradient_accumulation_steps'],
        num_train_epochs=QLORA_PARAMS['lora_num_epochs'],
        evaluation_strategy="steps",
        eval_steps=QLORA_PARAMS['lora_val_iterations'],
        save_steps=QLORA_PARAMS['lora_val_iterations'],
        logging_steps=10,
        learning_rate=QLORA_PARAMS['lora_lr'],
        save_total_limit=2,
        load_best_model_at_end=False,
        dataloader_pin_memory=False,
        report_to="none",
        fp16=True
    )

    # Data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, padding=True)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator
    )

    # Clear GPU cache before training
    torch.cuda.empty_cache()

    # Start training
    print(f"Starting training with {train_size} samples.")
    trainer.train()
    adapter_name = f"adapter_test_candidate_items_epoch_{QLORA_PARAMS['lora_num_epochs']}_{train_size}_chatGPT_data"
    # Save the model and tokenizer in separate directories for each training size
    model.save_pretrained(f"outputs/{adapter_name}")
    tokenizer.save_pretrained(f"outputs/{adapter_name}")
    print(f"Model trained with {train_size} samples saved to outputs/{adapter_name}")



Starting training with 16 samples.


  0%|          | 0/16 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
 62%|██████▎   | 10/16 [00:47<00:28,  4.76s/it]

{'loss': 9.9628, 'grad_norm': 9.679896354675293, 'learning_rate': 5.6250000000000005e-05, 'epoch': 1.25}


100%|██████████| 16/16 [01:16<00:00,  4.79s/it]


{'train_runtime': 76.719, 'train_samples_per_second': 0.417, 'train_steps_per_second': 0.209, 'train_loss': 9.305109024047852, 'epoch': 2.0}
Model trained with 16 samples saved to outputs/adapter_test_candidate_items_epoch_2_16_chatGPT_data
Starting training with 32 samples.


 31%|███▏      | 10/32 [00:47<01:44,  4.75s/it]

{'loss': 6.3056, 'grad_norm': 15.756006240844727, 'learning_rate': 7.8125e-05, 'epoch': 0.62}


 62%|██████▎   | 20/32 [01:35<00:57,  4.75s/it]

{'loss': 1.8001, 'grad_norm': 4.142382621765137, 'learning_rate': 4.6875e-05, 'epoch': 1.25}


 94%|█████████▍| 30/32 [02:22<00:09,  4.75s/it]

{'loss': 0.4623, 'grad_norm': 0.32388541102409363, 'learning_rate': 1.5625e-05, 'epoch': 1.88}


100%|██████████| 32/32 [02:32<00:00,  4.75s/it]


{'train_runtime': 152.1008, 'train_samples_per_second': 0.421, 'train_steps_per_second': 0.21, 'train_loss': 2.70556197501719, 'epoch': 2.0}
Model trained with 32 samples saved to outputs/adapter_test_candidate_items_epoch_2_32_chatGPT_data




Starting training with 64 samples.


 16%|█▌        | 10/64 [00:47<04:15,  4.74s/it]

{'loss': 0.4928, 'grad_norm': 0.2193705141544342, 'learning_rate': 8.4375e-05, 'epoch': 0.31}


 31%|███▏      | 20/64 [01:34<03:28,  4.73s/it]

{'loss': 0.4447, 'grad_norm': 0.1848074346780777, 'learning_rate': 6.875e-05, 'epoch': 0.62}


 47%|████▋     | 30/64 [02:22<02:41,  4.74s/it]

{'loss': 0.4363, 'grad_norm': 0.27242138981819153, 'learning_rate': 5.3125000000000004e-05, 'epoch': 0.94}


 62%|██████▎   | 40/64 [03:09<01:53,  4.75s/it]

{'loss': 0.3588, 'grad_norm': 0.2413753867149353, 'learning_rate': 3.7500000000000003e-05, 'epoch': 1.25}


 78%|███████▊  | 50/64 [03:57<01:06,  4.73s/it]

{'loss': 0.3682, 'grad_norm': 0.293459951877594, 'learning_rate': 2.1875e-05, 'epoch': 1.56}


 94%|█████████▍| 60/64 [04:44<00:18,  4.72s/it]

{'loss': 0.3227, 'grad_norm': 0.2635168731212616, 'learning_rate': 6.25e-06, 'epoch': 1.88}


100%|██████████| 64/64 [05:03<00:00,  4.74s/it]


{'train_runtime': 303.3239, 'train_samples_per_second': 0.422, 'train_steps_per_second': 0.211, 'train_loss': 0.40048264153301716, 'epoch': 2.0}
Model trained with 64 samples saved to outputs/adapter_test_candidate_items_epoch_2_64_chatGPT_data


# BEAUTY PIPELINE

In [1]:
#STEP 1
import sys
import json
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForSeq2Seq, 
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training
from datasets import Dataset

# Set environment variable for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load configuration settings
from config import TOKENIZER_PATH, MODEL_PATH, PIPELINE_PARAMS, QLORA_PARAMS, ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS

# Verification
print("Configuration Loaded:")
print("Tokenizer Path:", TOKENIZER_PATH)
print("Model Path:", MODEL_PATH)
print("Pipeline Parameters:", PIPELINE_PARAMS)
print("QLoRA Parameters:", QLORA_PARAMS)
print("Prompt Template:", ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS)

# STEP 2

# Load the training data
data_path = "QLoRa_finetuning/updated_matching_ids_pipeline.json"
with open(data_path, "r") as file:
    training_data = json.load(file)
    
torch.cuda.empty_cache()

# Verify data structure
print("Data Structure Verification:")
for i, sample in enumerate(training_data[:2]):
    assert "User_ID" in sample, f"User_ID missing in sample {i}"
    assert "User_Profile" in sample, f"User_Profile missing in sample {i}"
    assert "Candidate_Items" in sample, f"Candidate_Items missing in sample {i}"
print("Data verification successful!")

# STEP 3
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS if not already set

# Set 4-bit quantization configuration for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',  # Use NormalFloat4 for better memory efficiency
    bnb_4bit_use_double_quant=True  # Double quantization for more memory saving
)

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto"  # Automatically maps layers to available GPU memory
)

# Step 4
# Preprocess function to format the data for candidate item generation
def preprocess_function(profile_sample):
    # Use the user profile as input
    input_text = ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['input'].replace(
        "{user_profile}", profile_sample["User_Profile"]
    )
    
    # Set up the output text (Candidate Items) as the expected response
    output_text = "\n".join(
        [f"{i + 1}. {item}" for i, item in enumerate(profile_sample["Candidate_Items"].values())]
    )
    
    # Format the complete prompt for training
    full_text = f"### Instruction:\n{ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['instruction']}\n\n{input_text}\n\n### Response:\nCandidate Item Categories:\n{output_text}"
    return full_text

# Verify preprocessing
print("Preprocessed Sample:", preprocess_function(training_data[10]))
# Step 5
# Tokenization function
def tokenize_function(sample):
    processed_text = preprocess_function(sample)
    tokenized = tokenizer(
        processed_text,
        truncation=True,
        max_length=PIPELINE_PARAMS['max_length'],
        padding="max_length",
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()  # Set labels identical to input_ids
    return tokenized

# LoRA Configuration
lora_config = LoraConfig(
    r=QLORA_PARAMS['lora_r'],
    lora_alpha=QLORA_PARAMS['lora_alpha'],
    lora_dropout=QLORA_PARAMS['lora_dropout'],
    target_modules=QLORA_PARAMS['lora_target_modules'],
    bias="none",
    task_type="CAUSAL_LM"
)

# Step 6

# Apply LoRA configuration to the model
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
model = get_peft_model(model, lora_config)
# Training sizes
training_sizes = [64]

# Loop through different training sizes
for train_size in training_sizes:
    # Split the dataset
    train_data = training_data[:train_size]
    eval_data = training_data[train_size:train_size + int(0.2 * train_size)]  # 20% of training data for evaluation

    # Tokenize datasets
    train_tokenized_data = [tokenize_function(sample) for sample in train_data]
    eval_tokenized_data = [tokenize_function(sample) for sample in eval_data]

    # Convert tokenized data to Dataset format
    train_dataset = Dataset.from_dict({
        "input_ids": [x["input_ids"][0] for x in train_tokenized_data],
        "attention_mask": [x["attention_mask"][0] for x in train_tokenized_data],
        "labels": [x["labels"][0] for x in train_tokenized_data]
    })
    eval_dataset = Dataset.from_dict({
        "input_ids": [x["input_ids"][0] for x in eval_tokenized_data],
        "attention_mask": [x["attention_mask"][0] for x in eval_tokenized_data],
        "labels": [x["labels"][0] for x in eval_tokenized_data]
    })

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"outputs/adapter_test_candidate_items_epoch_{QLORA_PARAMS['lora_num_epochs']}_{train_size}_pipeline_data",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=QLORA_PARAMS['gradient_accumulation_steps'],
        num_train_epochs=QLORA_PARAMS['lora_num_epochs'],
        evaluation_strategy="steps",
        eval_steps=QLORA_PARAMS['lora_val_iterations'],
        save_steps=QLORA_PARAMS['lora_val_iterations'],
        logging_steps=10,
        learning_rate=QLORA_PARAMS['lora_lr'],
        save_total_limit=2,
        load_best_model_at_end=False,
        dataloader_pin_memory=False,
        report_to="none",
        fp16=True
    )

    # Data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, padding=True)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator
    )

    # Clear GPU cache before training
    torch.cuda.empty_cache()

    # Start training
    print(f"Starting training with {train_size} samples.")
    trainer.train()
    adapter_name = f"adapter_test_candidate_items_epoch_{QLORA_PARAMS['lora_num_epochs']}_{train_size}_pipeline_data"
    # Save the model and tokenizer in separate directories for each training size
    model.save_pretrained(f"outputs/{adapter_name}")
    tokenizer.save_pretrained(f"outputs/{adapter_name}")
    print(f"Model trained with {train_size} samples saved to outputs/{adapter_name}")

  from .autonotebook import tqdm as notebook_tqdm


Configuration Loaded:
Tokenizer Path: models/hf-frompretrained-download/meta-llama/Meta-Llama-3-8B-Instruct
Model Path: models/hf-frompretrained-downloadmeta-llama/Meta-Llama-3-8B-Instruct
Pipeline Parameters: {'max_length': 2048, 'num_return_sequences': 1, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.95, 'repetition_penalty': 1.2}
QLoRA Parameters: {'lora_r': 8, 'lora_alpha': 8, 'lora_dropout': 0.01, 'lora_target_modules': ['q_proj', 'v_proj'], 'gradient_accumulation_steps': 2, 'lora_num_epochs': 2, 'lora_val_iterations': 100, 'lora_early_stopping_patience': 10, 'lora_lr': 0.0001, 'lora_micro_batch_size': 1}
Prompt Template: {'instruction': "### Instruction:\n You are a recommender system specialized. Based on the following user profile text, generate a list of general candidate item categories that align with the user's preferences and interests. Approach this task by treating these categories as a cohesive set, ensuring that they collectively reflect the user’s overall profile and ma

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]


Preprocessed Sample: ### Instruction:
### Instruction:
 You are a recommender system specialized. Based on the following user profile text, generate a list of general candidate item categories that align with the user's preferences and interests. Approach this task by treating these categories as a cohesive set, ensuring that they collectively reflect the user’s overall profile and maximize satisfaction. 

### Input 
 User Profile: 
 "Short-Term Interests": Based on the provided review sequence, we can identify some immediate trends and preferences:
* Interest in hydrating products, particularly those focused on addressing dull, tired, dry, and sensitive skin types.
* Preference for serums and oils that offer moisturizing benefits without leaving behind residue.
* Appreciation for gentle, non-drying textures in cleansing products.
* Desire for exfoliating and purifying properties in daily skincare routines.
"Long-Term Preferences": Analyzing the user's entire history reveals deeper, mo



Starting training with 64 samples.


  0%|          | 0/64 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
 16%|█▌        | 10/64 [00:47<04:16,  4.75s/it]

{'loss': 8.4427, 'grad_norm': nan, 'learning_rate': 8.90625e-05, 'epoch': 0.31}


 31%|███▏      | 20/64 [01:35<03:28,  4.74s/it]

{'loss': 5.6316, 'grad_norm': 17.751296997070312, 'learning_rate': 7.34375e-05, 'epoch': 0.62}


 47%|████▋     | 30/64 [02:22<02:41,  4.74s/it]

{'loss': 2.0807, 'grad_norm': 5.407251358032227, 'learning_rate': 5.9375e-05, 'epoch': 0.94}


 62%|██████▎   | 40/64 [03:09<01:53,  4.73s/it]

{'loss': 0.5756, 'grad_norm': 0.3465035557746887, 'learning_rate': 4.375e-05, 'epoch': 1.25}


 78%|███████▊  | 50/64 [03:57<01:06,  4.74s/it]

{'loss': 0.5287, 'grad_norm': 0.24732546508312225, 'learning_rate': 2.8125000000000003e-05, 'epoch': 1.56}


 94%|█████████▍| 60/64 [04:44<00:18,  4.74s/it]

{'loss': 0.5075, 'grad_norm': 0.2507157325744629, 'learning_rate': 1.25e-05, 'epoch': 1.88}


100%|██████████| 64/64 [05:03<00:00,  4.74s/it]


{'train_runtime': 303.5524, 'train_samples_per_second': 0.422, 'train_steps_per_second': 0.211, 'train_loss': 2.807116910815239, 'epoch': 2.0}
Model trained with 64 samples saved to outputs/adapter_test_candidate_items_epoch_2_64_pipeline_data


# VIDEO GAMES CHATGPT

In [1]:
#STEP 1
import sys
import json
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForSeq2Seq, 
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training
from datasets import Dataset

# Set environment variable for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load configuration settings
from config import TOKENIZER_PATH, MODEL_PATH, PIPELINE_PARAMS, QLORA_PARAMS, ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS

# Verification
print("Configuration Loaded:")
print("Tokenizer Path:", TOKENIZER_PATH)
print("Model Path:", MODEL_PATH)
print("Pipeline Parameters:", PIPELINE_PARAMS)
print("QLoRA Parameters:", QLORA_PARAMS)
print("Prompt Template:", ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS)

# STEP 2

# Load the training data
data_path = "QLoRa_finetuning/chatGPT_UP_output_video_games.json"
with open(data_path, "r") as file:
    training_data = json.load(file)
    
torch.cuda.empty_cache()

# Verify data structure
print("Data Structure Verification:")
for i, sample in enumerate(training_data[:2]):
    assert "User_ID" in sample, f"User_ID missing in sample {i}"
    assert "User_Profile" in sample, f"User_Profile missing in sample {i}"
    assert "Candidate_Items" in sample, f"Candidate_Items missing in sample {i}"
print("Data verification successful!")

# STEP 3
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS if not already set

# Set 4-bit quantization configuration for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',  # Use NormalFloat4 for better memory efficiency
    bnb_4bit_use_double_quant=True  # Double quantization for more memory saving
)

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto"  # Automatically maps layers to available GPU memory
)

# Step 4
# Preprocess function to format the data for candidate item generation
def preprocess_function(profile_sample):
    # Use the user profile as input
    input_text = ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['input'].replace(
        "{user_profile}", profile_sample["User_Profile"]
    )
    
    # Set up the output text (Candidate Items) as the expected response
    output_text = "\n".join(
        [f"{i + 1}. {item}" for i, item in enumerate(profile_sample["Candidate_Items"].values())]
    )
    
    # Format the complete prompt for training
    full_text = f"### Instruction:\n{ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['instruction']}\n\n{input_text}\n\n### Response:\nCandidate Item Categories:\n{output_text}"
    return full_text

# Verify preprocessing
print("Preprocessed Sample:", preprocess_function(training_data[10]))
# Step 5
# Tokenization function
def tokenize_function(sample):
    processed_text = preprocess_function(sample)
    tokenized = tokenizer(
        processed_text,
        truncation=True,
        max_length=PIPELINE_PARAMS['max_length'],
        padding="max_length",
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()  # Set labels identical to input_ids
    return tokenized

# LoRA Configuration
lora_config = LoraConfig(
    r=QLORA_PARAMS['lora_r'],
    lora_alpha=QLORA_PARAMS['lora_alpha'],
    lora_dropout=QLORA_PARAMS['lora_dropout'],
    target_modules=QLORA_PARAMS['lora_target_modules'],
    bias="none",
    task_type="CAUSAL_LM"
)

# Step 6

# Apply LoRA configuration to the model
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
model = get_peft_model(model, lora_config)
# Training sizes
training_sizes = [16,32,64]

# Loop through different training sizes
for train_size in training_sizes:
    # Split the dataset
    train_data = training_data[:train_size]
    eval_data = training_data[train_size:train_size + int(0.2 * train_size)]  # 20% of training data for evaluation

    # Tokenize datasets
    train_tokenized_data = [tokenize_function(sample) for sample in train_data]
    eval_tokenized_data = [tokenize_function(sample) for sample in eval_data]

    # Convert tokenized data to Dataset format
    train_dataset = Dataset.from_dict({
        "input_ids": [x["input_ids"][0] for x in train_tokenized_data],
        "attention_mask": [x["attention_mask"][0] for x in train_tokenized_data],
        "labels": [x["labels"][0] for x in train_tokenized_data]
    })
    eval_dataset = Dataset.from_dict({
        "input_ids": [x["input_ids"][0] for x in eval_tokenized_data],
        "attention_mask": [x["attention_mask"][0] for x in eval_tokenized_data],
        "labels": [x["labels"][0] for x in eval_tokenized_data]
    })

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"outputs/adapter_test_candidate_items_epoch_{QLORA_PARAMS['lora_num_epochs']}_{train_size}_chatgpt_data_video_games",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=QLORA_PARAMS['gradient_accumulation_steps'],
        num_train_epochs=QLORA_PARAMS['lora_num_epochs'],
        evaluation_strategy="steps",
        eval_steps=QLORA_PARAMS['lora_val_iterations'],
        save_steps=QLORA_PARAMS['lora_val_iterations'],
        logging_steps=10,
        learning_rate=QLORA_PARAMS['lora_lr'],
        save_total_limit=2,
        load_best_model_at_end=False,
        dataloader_pin_memory=False,
        report_to="none",
        fp16=True
    )

    # Data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, padding=True)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator
    )

    # Clear GPU cache before training
    torch.cuda.empty_cache()

    # Start training
    print(f"Starting training with {train_size} samples.")
    trainer.train()
    adapter_name = f"adapter_test_candidate_items_epoch_{QLORA_PARAMS['lora_num_epochs']}_{train_size}_chatgpt_data_video_games"
    # Save the model and tokenizer in separate directories for each training size
    model.save_pretrained(f"outputs/{adapter_name}")
    tokenizer.save_pretrained(f"outputs/{adapter_name}")
    print(f"Model trained with {train_size} samples saved to outputs/{adapter_name}")

  from .autonotebook import tqdm as notebook_tqdm


Configuration Loaded:
Tokenizer Path: models/hf-frompretrained-download/meta-llama/Meta-Llama-3-8B-Instruct
Model Path: models/hf-frompretrained-downloadmeta-llama/Meta-Llama-3-8B-Instruct
Pipeline Parameters: {'max_length': 2048, 'num_return_sequences': 1, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.95, 'repetition_penalty': 1.2}
QLoRA Parameters: {'lora_r': 8, 'lora_alpha': 8, 'lora_dropout': 0.01, 'lora_target_modules': ['q_proj', 'v_proj'], 'gradient_accumulation_steps': 2, 'lora_num_epochs': 2, 'lora_val_iterations': 100, 'lora_early_stopping_patience': 10, 'lora_lr': 0.0001, 'lora_micro_batch_size': 1}
Prompt Template: {'instruction': "### Instruction:\n You are a recommender system specialized. Based on the following user profile text, generate a list of general candidate item categories that align with the user's preferences and interests. Approach this task by treating these categories as a cohesive set, ensuring that they collectively reflect the user’s overall profile and ma

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]


Preprocessed Sample: ### Instruction:
### Instruction:
 You are a recommender system specialized. Based on the following user profile text, generate a list of general candidate item categories that align with the user's preferences and interests. Approach this task by treating these categories as a cohesive set, ensuring that they collectively reflect the user’s overall profile and maximize satisfaction. 

### Input 
 User Profile: 
 "Short-Term Interests": The user has recently engaged with a variety of gaming peripherals—most notably keyboards, clicky switches, headsets, and word-based board games. They pay close attention to how products ‘feel’ (key switches, comfort levels) and whether instructions or compatibility details are clear.
"Long-Term Preferences": An analysis of the user’s reviews reveals consistent themes:
* Highly values how a keyboard or headset feels and performs during longer sessions
* Looks for clarity in instructions and easy-to-use function keys or hotkeys
* App



Starting training with 16 samples.


  0%|          | 0/16 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
 62%|██████▎   | 10/16 [00:47<00:28,  4.75s/it]

{'loss': 10.0916, 'grad_norm': 9.3617525100708, 'learning_rate': 5.6250000000000005e-05, 'epoch': 1.25}


100%|██████████| 16/16 [01:16<00:00,  4.76s/it]


{'train_runtime': 76.2063, 'train_samples_per_second': 0.42, 'train_steps_per_second': 0.21, 'train_loss': 9.397164583206177, 'epoch': 2.0}
Model trained with 16 samples saved to outputs/adapter_test_candidate_items_epoch_2_16_chatgpt_data_video_games




Starting training with 32 samples.


 31%|███▏      | 10/32 [00:47<01:44,  4.75s/it]

{'loss': 6.0998, 'grad_norm': 15.630152702331543, 'learning_rate': 7.8125e-05, 'epoch': 0.62}


 62%|██████▎   | 20/32 [01:35<00:57,  4.76s/it]

{'loss': 1.8654, 'grad_norm': 4.487438201904297, 'learning_rate': 4.6875e-05, 'epoch': 1.25}


 94%|█████████▍| 30/32 [02:22<00:09,  4.75s/it]

{'loss': 0.5734, 'grad_norm': 0.3956744372844696, 'learning_rate': 1.5625e-05, 'epoch': 1.88}


100%|██████████| 32/32 [02:32<00:00,  4.75s/it]


{'train_runtime': 152.1393, 'train_samples_per_second': 0.421, 'train_steps_per_second': 0.21, 'train_loss': 2.7007004395127296, 'epoch': 2.0}
Model trained with 32 samples saved to outputs/adapter_test_candidate_items_epoch_2_32_chatgpt_data_video_games




Starting training with 64 samples.


 16%|█▌        | 10/64 [00:47<04:16,  4.75s/it]

{'loss': 0.5299, 'grad_norm': 0.20251959562301636, 'learning_rate': 8.4375e-05, 'epoch': 0.31}


 31%|███▏      | 20/64 [01:35<03:29,  4.75s/it]

{'loss': 0.481, 'grad_norm': 0.1976896971464157, 'learning_rate': 6.875e-05, 'epoch': 0.62}


 47%|████▋     | 30/64 [02:22<02:41,  4.75s/it]

{'loss': 0.4299, 'grad_norm': 0.24011029303073883, 'learning_rate': 5.3125000000000004e-05, 'epoch': 0.94}


 62%|██████▎   | 40/64 [03:09<01:53,  4.73s/it]

{'loss': 0.389, 'grad_norm': 0.27251380681991577, 'learning_rate': 3.7500000000000003e-05, 'epoch': 1.25}


 78%|███████▊  | 50/64 [03:57<01:06,  4.73s/it]

{'loss': 0.3725, 'grad_norm': 0.33729416131973267, 'learning_rate': 2.1875e-05, 'epoch': 1.56}


 94%|█████████▍| 60/64 [04:44<00:19,  4.78s/it]

{'loss': 0.3438, 'grad_norm': 0.30028724670410156, 'learning_rate': 6.25e-06, 'epoch': 1.88}


100%|██████████| 64/64 [05:04<00:00,  4.75s/it]


{'train_runtime': 304.2866, 'train_samples_per_second': 0.421, 'train_steps_per_second': 0.21, 'train_loss': 0.4190325606614351, 'epoch': 2.0}
Model trained with 64 samples saved to outputs/adapter_test_candidate_items_epoch_2_64_chatgpt_data_video_games


: 

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
from transformers import BitsAndBytesConfig
from config import *
torch.cuda.empty_cache()


TOKENIZER_PATH = "models/hf-frompretrained-download/meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_PATH = "models/hf-frompretrained-downloadmeta-llama/Meta-Llama-3-8B-Instruct"
# Load the base model and tokenizer
base_model_path ="models/hf-frompretrained-downloadmeta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    #quantization_config=bnb_config,
    device_map="auto",
    #torch_dtype=torch.float16,
)

# Load the adapter
adapter_path = f"outputs/adapter_test_candidate_items_epoch_{QLORA_PARAMS['lora_num_epochs']}_{train_size}_chatGPT_data"
adapter_name = "candidate_items"
model = PeftModel.from_pretrained(model, adapter_path, adapter_name=adapter_name)

# Set the active adapter
model.set_adapter(adapter_name)
model.eval()

# Define the text generation function
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=PIPELINE_PARAMS['max_length'],
        do_sample=True,
        temperature=PIPELINE_PARAMS['temperature'],
        top_k=PIPELINE_PARAMS['top_k'],
        top_p=PIPELINE_PARAMS['top_p'],
        repetition_penalty=PIPELINE_PARAMS['repetition_penalty'],
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Define a sample user profile input for testing
user_profile = """
- Short-term Intentions: Looking for high-quality tech accessories.
- Long-term Preferences: Prefers durable, high-performance gadgets.
- User Profile: The users likes technical stuff with preferences for windows laptops
"""

# Format the prompt
prompt = (
    ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['instruction'] + "\n\n" +
    ALPACA_LORA_PROMPTS_CANDIDATE_ITEMS['input'].replace("{user_profile}", user_profile)+ "\n" + "### Response"
)

# Generate candidate items using the model
generated_text = generate_text(prompt)

# Display the output
print("Generated Candidate Items:")
print(generated_text)


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
  adapters_weights = torch.load(filename, map_location=torch.device(device))


Generated Candidate Items:
### Instruction:
 You are a recommender system specialized. Based on the following user profile text, generate a list of general candidate item categories that align with the user's preferences and interests. Approach this task by treating these categories as a cohesive set, ensuring that they collectively reflect the user’s overall profile and maximize satisfaction. 

### Input 
 User Profile: 
 
- Short-term Intentions: Looking for high-quality tech accessories.
- Long-term Preferences: Prefers durable, high-performance gadgets.
- User Profile: The users likes technical stuff with preferences for windows laptops

### Response:
Technical Accessories (Headphones, Tablet Stylus)
