In [1]:
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
!pip install -q -U -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U peft

In [2]:
import warnings
warnings.filterwarnings("ignore")

import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments

In [3]:
df = pd.read_csv('/kaggle/input/dataset/Dejavu_data.csv')

In [4]:
def create_input(row):
    # Create user context for helping recall forgotten search queries
    context = (
        f"User Context:\n"
        f"- Recent Activity: {row['recent_activity']}\n"
        f"- People Interacted With: {row['people_interacted_with']}\n"
        f"- Ongoing Task: {row['ongoing_task']}\n"
        f"- Past Searches: {row['past_searches']}\n"
        f"- Current Environment: {row['current_environment']}\n\n"
        f"Question:\n"
        f"Based on my recent activities and ongoing tasks, can you help me recall what I might have wanted to search for?"
    )

    return context

def create_output(row):
    # Construct an output suggesting possible search queries
    output = (
        f"Considering your contextual cues, you might be looking for information on {row['potential_search']}. Does this sound right?"
    )
    return output

# Create 'input' column in the DataFrame using the updated 'create_input'
df['input'] = df.apply(create_input, axis=1)

# Create 'output' column using the updated 'create_output'
df['output'] = df.apply(create_output, axis=1)

# Combine 'input' and 'output' into a 'text' column for model training
df['text'] = df.apply(lambda row: f"input: {row['input']}\noutput: {row['output']}", axis=1)

In [5]:
# Remove rows with missing values
df.dropna(subset=['input', 'output'], inplace=True)

# Shuffle the DataFrame
df = df.sample(frac=1).reset_index(drop=True)

In [6]:
# Prepare the text column for training
df['text'] = df.apply(lambda row: f"input: {row['input']}\noutput: {row['output']}", axis=1)

In [7]:
# Display an example input-output pair
example_row = df.iloc[0]
print("Input:")
print(example_row['input'])
print("\nOutput:")
print(example_row['output'])

Input:
User Context:
- Recent Activity: Looking for workout routines
- People Interacted With: Gym trainer
- Ongoing Task: Trying out a new fitness regime
- Past Searches: Beginner workout plans, best fitness apps
- Current Environment: Gym

Question:
Based on my recent activities and ongoing tasks, can you help me recall what I might have wanted to search for?

Output:
Considering your contextual cues, you might be looking for information on Full Body Workout Routine. Does this sound right?


In [8]:
from datasets import Dataset

# Remove any rows with missing values in 'text'
df.dropna(subset=['text'], inplace=True)

# Create Dataset from pandas DataFrame
train_data = Dataset.from_pandas(df[['text']])

# Verify the Dataset
print(train_data)

Dataset({
    features: ['text'],
    num_rows: 60
})


In [9]:
# Initialize Model
model_name = "/kaggle/input/gemma/transformers/2b-it/3"
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    low_cpu_mem_usage=True,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

model.config.hidden_activation = 'gelu_pytorch_tanh'

max_seq_length = 1024
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Fine-Tune the Model
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

In [11]:
# Training arguments with reduced batch size and sequence length
training_arguments = TrainingArguments(
    output_dir="dejavu_model",
    num_train_epochs=5,  # Allow more time for contextual learning
    per_device_train_batch_size=1,  # Start with 1, adjust based on memory
    gradient_accumulation_steps=8,  # Adjust based on batch size
    optim="paged_adamw_32bit",
    save_steps=0,  # Save periodically
    logging_steps=25,
    learning_rate=1e-5,  # Lower learning rate for fine-tuning
    weight_decay=0.01,  # Slightly higher to regularize better
    fp16=True,
    max_grad_norm=1.0,  # Increase to allow for more flexibility
    warmup_ratio=0.1,  # Slightly longer warmup for stability
    lr_scheduler_type="cosine",
    report_to="none",
)

In [12]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

# Start training
trainer.train()

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Step,Training Loss
25,4.1135


TrainOutput(global_step=35, training_loss=3.94153071812221, metrics={'train_runtime': 138.2422, 'train_samples_per_second': 2.17, 'train_steps_per_second': 0.253, 'total_flos': 365234877788160.0, 'train_loss': 3.94153071812221, 'epoch': 4.666666666666667})

In [13]:
# Save and Merge Model
trainer.save_model()
tokenizer.save_pretrained("dejavu_model")

from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
     "dejavu_model",
     torch_dtype=compute_dtype,
     device_map="auto",
)

merged_model = model.merge_and_unload()
merged_model.save_pretrained("./dejavu_pretrained", safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained("./dejavu_pretrained")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('./dejavu_pretrained/tokenizer_config.json',
 './dejavu_pretrained/special_tokens_map.json',
 './dejavu_pretrained/tokenizer.model',
 './dejavu_pretrained/added_tokens.json',
 './dejavu_pretrained/tokenizer.json')

In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Specify the path to your pretrained model directory
model_name = "./dejavu_pretrained"

# Load the model without device_map
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a function to generate search recall
def get_search_recall(user_context, model=model, tokenizer=tokenizer):
    prompt = f"input: {user_context}\noutput:"
    # Tokenize the input and move tensors to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split('output:')[-1].strip()

# Example usage
example_input = df.iloc[0]['input']
print("User Context and Question:")
print(example_input)
print("\nGenerated Search Suggestion:")
print(get_search_recall(example_input))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


User Context and Question:
User Context:
- Recent Activity: Looking for workout routines
- People Interacted With: Gym trainer
- Ongoing Task: Trying out a new fitness regime
- Past Searches: Beginner workout plans, best fitness apps
- Current Environment: Gym

Question:
Based on my recent activities and ongoing tasks, can you help me recall what I might have wanted to search for?

Generated Search Suggestion:
Based on your recent context, you might be looking for beginner workout plan, or trying out new gym equipment.
