## Step 1: Load Configuration and Model Paths

In [4]:
# Step 1: Import necessary modules and load configurations
import sys
import json
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

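# NOTE: no sys.path adjustment is performed in this cell; `config` must already be importable (e.g. from the kernel's working directory)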

# Now import the config settings
from config import TOKENIZER_PATH, MODEL_PATH, PIPELINE_PARAMS, QLORA_PARAMS, ALPACA_LORA_PROMPTS_USER_PROFILE

# Verification: echo every imported setting, one item per output line
print(
    "Configuration Loaded:",
    f"Tokenizer Path: {TOKENIZER_PATH}",
    f"Model Path: {MODEL_PATH}",
    f"Pipeline Parameters: {PIPELINE_PARAMS}",
    f"QLoRA Parameters: {QLORA_PARAMS}",
    f"Prompt Template: {ALPACA_LORA_PROMPTS_USER_PROFILE}",
    sep="\n",
)


Configuration Loaded:
Tokenizer Path: models/hf-frompretrained-download/meta-llama/Meta-Llama-3-8B-Instruct
Model Path: models/hf-frompretrained-downloadmeta-llama/Meta-Llama-3-8B-Instruct
Pipeline Parameters: {'max_length': 2048, 'num_return_sequences': 1, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.95, 'repetition_penalty': 1.2, 'load_in_4bit': True}
QLoRA Parameters: {'lora_r': 4, 'lora_alpha': 16, 'lora_dropout': 0.05, 'lora_target_modules': ['q_proj', 'v_proj'], 'lora_num_epochs': 1, 'lora_val_iterations': 100, 'lora_early_stopping_patience': 10, 'lora_lr': 0.0001, 'lora_micro_batch_size': 8}
Prompt Template: {'instruction': 'Based on the following user reviews, generate a user profile that includes: Short-term Intentions, Long-term Preferences, Item Descriptions, Preferences and Dislikes, Explicit and Implicit Signals, and Contextual Features. Think step by step', 'input': '{user_review}', 'output': '{user_profile}'}


## Step 2: Load and Verify Training Data

In [5]:
# Step 2: Load and verify training data
# Build the path from its components instead of embedding a backslash in the
# literal: "\c" is an invalid escape sequence, and a backslash is only a path
# separator on Windows.
data_path = os.path.join("QLoRa_finetuning", "chatGPT_UP_output.json")

# Load the training data. Decode explicitly as UTF-8 so the result does not
# depend on the platform's default locale encoding.
with open(data_path, "r", encoding="utf-8") as file:
    training_data = json.load(file)

# Sample a couple of data points to verify format
print("Training Data Sample:")
print(training_data[:2])  # Display first two entries

# Verify the data structure.
# Only the first two entries are checked; later entries are not validated here.
print("Data Structure Verification:")
required_keys = ("User_ID", "User_Profile", "Candidate_Items")
for i, sample in enumerate(training_data[:2]):
    for key in required_keys:
        assert key in sample, f"{key} missing in sample {i}"

print("Data verification successful!")


Training Data Sample:
[{'User_ID': 'AE3LUVAAITFJIUTWBMRPHDQOCOFQ', 'User_Profile': 'User Profile: Short-term Intentions: Exploring skincare products that promote glowing skin, such as lotions and creams. Interested in makeup accessories like dramatic false eyelashes for special occasions. Long-term Preferences: Consistent interest in high-quality beauty tools and accessories (e.g., nail files). Regularly uses beauty products that enhance appearance and have attractive packaging. Item Descriptions: High-End Nail Files: Thick, durable, and exceed expectations in quality. Glowing Skin Lotion: Leaves skin radiant; no scent; comes in pretty packaging. Dramatic False Eyelashes: Soft, reusable, stay on all night; require trimming to fit. Preferences and Dislikes: Preferences: High-quality beauty products that deliver excellent results. Products that enhance appearance (e.g., glowing skin, dramatic lashes). Attractive and functional packaging. Dislikes: Products that make skin excessively oily

## Step 3: Initialize the Tokenizer and Model

In [6]:
# Step 3: Initialize the Tokenizer and Model
# The tokenizer is loaded from TOKENIZER_PATH, then its EOS token is reused as
# the padding token. The assignment is unconditional: it replaces whatever
# pad token the tokenizer had, if any.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
tokenizer.pad_token = tokenizer.eos_token

# The causal LM is loaded from MODEL_PATH with no extra keyword arguments.
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# Verification: confirmation line followed by the tokenizer's repr
print(
    "Tokenizer and Model Loaded Successfully.",
    f"Tokenizer: {tokenizer}",
    sep="\n",
)
#print("Model Parameters Count:", model.num_parameters())


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]

Tokenizer and Model Loaded Successfully.
Tokenizer: PreTrainedTokenizerFast(name_or_path='models/hf-frompretrained-download/meta-llama/Meta-Llama-3-8B-Instruct', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|eot_id|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|reserved_special




## Step 4: Define Preprocessing Function with Matched User Reviews


In [7]:
# Load the user reviews from new_train_output.json
reviews_path = "new_data/new_train_output.json"

# Decode explicitly as UTF-8: review text is free-form, and without an
# explicit encoding open() falls back to the platform's locale codec.
with open(reviews_path, "r", encoding="utf-8") as file:
    reviews_data = json.load(file)

# Index reviews by user_id for easy matching.
# If the same user_id occurs in several entries, the last entry wins.
reviews_by_user = {entry["user_id"]: entry["reviews"] for entry in reviews_data}

# Verification: Check indexed reviews for a specific user_id
sample_user_id = "AFSKPY37N3C43SOI5IEXEK5JSIYA"
print(f"Sample Reviews for User ID '{sample_user_id}':")
print(reviews_by_user.get(sample_user_id, "No reviews found for this user ID."))

# Step 4: Updated Preprocessing Function to Match Reviews with Profiles
def preprocess_function(profile_sample):
    """Assemble the full training prompt for one profile record.

    The record's ``User_ID`` is looked up in the module-level
    ``reviews_by_user`` index. Each review found is rendered as a
    Product / Rating / Title / Review stanza, and the stanzas are embedded,
    together with the instruction text from
    ``ALPACA_LORA_PROMPTS_USER_PROFILE`` and the record's ``User_Profile``,
    into an "### Instruction / ### Input / ### Response" prompt.

    Args:
        profile_sample: Mapping that provides the "User_ID" and
            "User_Profile" keys.

    Returns:
        str: The assembled prompt. When the user has no reviews, the input
        section holds a fixed "No reviews available" sentence instead.

    Raises:
        KeyError: If ``profile_sample`` lacks "User_ID" or "User_Profile", or
            a review lacks "product_name", "rating", "title" or "text".
    """
    # One text stanza per review; unknown users yield no stanzas.
    stanzas = []
    for review in reviews_by_user.get(profile_sample["User_ID"], []):
        stanzas.append(
            f"Product: {review['product_name']}\n"
            f"Rating: {review['rating']}\n"
            f"Title: {review['title']}\n"
            f"Review: {review['text']}\n"
        )
    formatted_reviews = "\n".join(stanzas)

    # Input section: the joined reviews, or a placeholder when there are none.
    if formatted_reviews:
        input_text = f"User reviews:\n{formatted_reviews}"
    else:
        input_text = "No reviews available for this user."

    # The three prompt sections are separated by a blank line.
    prompt_sections = (
        f"### Instruction:\n{ALPACA_LORA_PROMPTS_USER_PROFILE['instruction']}",
        f"### Input:\n{input_text}",
        f"### Response:\n{profile_sample['User_Profile']}",
    )
    return "\n\n".join(prompt_sections)

# Verification: render the prompt for the first entry of training_data
print("Preprocessed Sample:")
first_prompt = preprocess_function(training_data[0])
print(first_prompt)



Sample Reviews for User ID 'AFSKPY37N3C43SOI5IEXEK5JSIYA':
[{'product_name': 'Keratin Secrets Do It Yourself Home Keratin System', 'parent_asin': 'B07SLFWZKN', 'rating': 3.0, 'title': 'Just ok', 'text': "I try to get Keratin treatments every 3 months, but honestly it has been getting costly. I found it difficult to use and almost impossible to get to saturate the back of my hair and straight iron it the way they do in the salon. Then I saw the ingredients after the first time and saw it contained formaldehyde and that was the last time I used the actual treatment. I wish they sold the S&C separate because I really did like it and I am always in the market for a good hair wash which won't strip my hair between treatments. I will resume my regular treatments at my salon.", 'timestamp': 1619737501209}, {'product_name': 'GAINWELL', 'parent_asin': 'B08JTNQFZY', 'rating': 5.0, 'title': 'Good quality hair brush!', 'text': 'Really nice small brush. Made well, nice wood made with boar bristle, 

## Step 5: Tokenize and Prepare Data for Training

In [8]:
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq, Trainer

# Step 5: Tokenize and Prepare Data
def tokenize_function(sample):
    """Tokenize one profile record into causal-LM training tensors.

    The record is first turned into a prompt by ``preprocess_function``, then
    tokenized with the module-level ``tokenizer``, truncated and padded to
    ``PIPELINE_PARAMS['max_length']``.

    Args:
        sample: A profile record accepted by ``preprocess_function``.

    Returns:
        The tokenizer's output with an added "labels" entry. Labels are a copy
        of ``input_ids`` in which padding positions are set to -100.
        Because ``return_tensors="pt"`` is used on a single string, each
        tensor presumably carries a leading batch dimension of 1 (TODO
        confirm against the downstream collator).
    """
    # Process the sample using the preprocessing function
    processed_text = preprocess_function(sample)

    # Tokenize the processed text
    tokenized = tokenizer(
        processed_text,
        truncation=True,
        max_length=PIPELINE_PARAMS['max_length'],
        padding="max_length",
        return_tensors="pt"
    )

    # Labels start as a copy of input_ids (for causal LM training).
    labels = tokenized["input_ids"].clone()

    # Every sample is padded to max_length and the pad token is the EOS token,
    # so unmasked labels would make the loss include all padding positions.
    # -100 is the label value ignored by the loss; the attention mask marks
    # padding with 0, which keeps real tokens untouched.
    if "attention_mask" in tokenized:
        labels[tokenized["attention_mask"] == 0] = -100

    tokenized["labels"] = labels
    return tokenized

# Run tokenize_function over every record of training_data, keeping order
tokenized_data = list(map(tokenize_function, training_data))

## Step 6: Define Dataset, LoRA Configuration, and Training Parameters

In [18]:
import torch
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# Step 6: Define Training Sample Limit
num_training_samples = 16  # Set the number of training samples here
# NOTE(review): tokenized_data (Step 5) is built from training_data before
# this slice is applied, so it is not affected by the limit below. Verify
# which of the two the trainer actually consumes.
training_data = training_data[:num_training_samples]

# LoRA Configuration (all values come from QLORA_PARAMS)
lora_config = LoraConfig(
    r=QLORA_PARAMS['lora_r'],
    lora_alpha=QLORA_PARAMS['lora_alpha'],
    lora_dropout=QLORA_PARAMS['lora_dropout'],
    target_modules=QLORA_PARAMS['lora_target_modules'],
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA configuration to the model.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell passes the already-wrapped model to get_peft_model again. Reload the
# base model (Step 3) before re-running.
model = get_peft_model(model, lora_config)

# Set training parameters
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=QLORA_PARAMS['lora_micro_batch_size'],
    # NOTE(review): this reuses the micro-batch-size value as the number of
    # accumulation steps, which looks like a copy-paste. Confirm the intended
    # effective batch size.
    gradient_accumulation_steps=QLORA_PARAMS['lora_micro_batch_size'],
    num_train_epochs=QLORA_PARAMS['lora_num_epochs'],
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` keyword.
    eval_strategy="steps",
    eval_steps=QLORA_PARAMS['lora_val_iterations'],
    save_steps=QLORA_PARAMS['lora_val_iterations'],
    logging_steps=10,
    learning_rate=QLORA_PARAMS['lora_lr'],
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"  # Set this to "tensorboard" if needed
)

# Verification of configurations
print("Training Parameters:")
print("Training Samples:", len(training_data))
print("LoRA Configuration:", lora_config)
print("Training Arguments:", training_args)


Training Parameters:
Training Samples: 16
LoRA Configuration: LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=4, target_modules=['q_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
Training Arguments: TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_p

## Step 7: Prepare Trainer and Data Collator