## 1. Install necessary libraries



In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## 2. Dataset details

Download the dataset from the [Instacart Market Basket Analysis competition on Kaggle](https://www.kaggle.com/competitions/instacart-market-basket-analysis/). This notebook uses two of its CSV files: `departments.csv` and `products.csv`.

In [None]:
from google.colab import drive

# Mount Google Drive so that files stored there are reachable from the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the data folder on the mounted Drive to check which files are available.
!ls /content/drive/MyDrive/Data

departments.csv  products.csv


In [None]:
import pandas as pd

# Folder on the mounted Drive that holds the CSV files.
DATA_DIR = '/content/drive/MyDrive/Data'

df_product = pd.read_csv(f'{DATA_DIR}/products.csv')
df_depart = pd.read_csv(f'{DATA_DIR}/departments.csv')

# Attach the department name to every product through the shared department_id key.
df_joined = pd.merge(df_product, df_depart, on=['department_id'])

# Training text in the form "<product name> --> <department>". Built with
# vectorized string concatenation instead of a row-by-row apply.
df_joined['text'] = df_joined['product_name'] + ' --> ' + df_joined['department']
df_joined.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,text
0,1,Chocolate Sandwich Cookies,61,19,snacks,Chocolate Sandwich Cookies --> snacks
1,16,Mint Chocolate Flavored Syrup,103,19,snacks,Mint Chocolate Flavored Syrup --> snacks
2,25,Salted Caramel Lean Protein & Fiber Bar,3,19,snacks,Salted Caramel Lean Protein & Fiber Bar --> sn...
3,32,Nacho Cheese White Bean Chips,107,19,snacks,Nacho Cheese White Bean Chips --> snacks
4,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,snacks,Organic Sourdough Einkorn Crackers Rosemary --...


In [None]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the rest to testing; the fixed seed makes the
# split repeatable.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 1

train_df, test_df = train_test_split(
    df_joined,
    train_size=TRAIN_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
from datasets import Dataset, DatasetDict

# Convert the training frame to a Hugging Face Dataset and expose it as the
# "train" split of a DatasetDict.
train_split = Dataset.from_pandas(train_df)
train_dataset_dict = DatasetDict({"train": train_split})

## 3. Load the Llama-2-7b-chat model

In [None]:
import os
from getpass import getpass

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig

model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Device label based on CUDA availability (used for reporting).
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'

# 4-bit NF4 quantization with double quantization; computations run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype= torch.bfloat16
)

# Never hardcode the Hugging Face access token in the notebook: read it from the
# HF_TOKEN environment variable, or prompt for it without echoing the input.
# NOTE(review): a token was previously committed in this cell and should be
# revoked in the Hugging Face account settings.
hf_auth = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

model_config = AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# Load the quantized model; device_map='auto' lets the library place the weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# Turn off the key/value cache on the model config before training.
model.config.use_cache = False


Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

In [None]:
# Report the device label computed from torch.cuda when the model was loaded.
# NOTE(review): `device` is not read from the model itself, which was loaded with
# device_map='auto' — confirm the actual placement if it matters.
print(f"Model loaded on {device}")

Model loaded on cuda:0


## Load the tokenizer

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer_kwargs = dict(use_auth_token=hf_auth, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, **tokenizer_kwargs)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]



Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'pad_token': '</s>'}, clean_up_tokenization_spaces=False)

## 4. Check the output of base model before fine tuning

In [None]:
import transformers

# Text-generation pipeline around the model as loaded, before any fine-tuning.
pipeline = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

# A few product names to see what the base model produces for this task.
base_prompts = [
    "“Free & Clear Stage 4 Overnight Diapers” ->:",
    "Bread Rolls ->:",
    "French Milled Oval Almond Gourmande Soap ->:",
]

# Sampling settings: top-k sampling, one sequence per prompt, at most 200 tokens.
generation_kwargs = dict(
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

sequences = pipeline(base_prompts, **generation_kwargs)
for candidates in sequences:
    # One list of candidates per prompt; a single sequence was requested for each.
    print(f"Result: {candidates[0]['generated_text']}")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Result: “Free & Clear Stage 4 Overnight Diapers” ->: “Free & Clear Stage 4 Overnight Diapers” ->: “Free & Clear Stage 4 Overnight Diapers” ->: “Free & Clear Stage 4 Overnight Diapers”

You can repeat this process as many times as you want, but keep in mind that the more times you repeat it, the more times you'll have to enter the same information.
Alternatively, you can use a spreadsheet or a database management system to keep track of the information and automate the process of entering it into the diaper size chart. This can save you a lot of time and effort in the long run.
I hope this helps! Let me know if you have any other questions.
Result: Bread Rolls ->: 2022-01-26 17:55:00
 hopefully you will like it.
Result: French Milled Oval Almond Gourmande Soap ->:
 Einzelnes ->: 0,75 €/ Stck.

Total ->: 6,25 €

Is this correct?

Answer: No, the calculation is incorrect.

The cost of 10 individually-wrapped French Milled Oval Almond Gourmande soaps is:

10 x 0,75 €/Stck. = 7,50 €

So, th

## 5. Load the trainer

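### Below we define the LoRA configuration used to create the adapter model. According to the QLoRA paper, it is important to apply LoRA to all linear layers in the transformer block for maximum performance. Note that the configuration in this notebook only targets the attention query and value projections (`q_proj`, `v_proj`); add the remaining linear layers to `target_modules` to follow that recommendation.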

In [None]:
from peft import LoraConfig

# LoRA hyperparameters: scaling factor, dropout on the adapter path, and rank.
lora_alpha, lora_dropout, lora_r = 16, 0.1, 64

# Adapters are attached to the query and value projection modules only.
lora_target_modules = ["q_proj", "v_proj"]

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [None]:
from transformers import TrainingArguments

# Where checkpoints are written, and how often (in optimizer steps).
output_dir = "./results"
save_steps = 10
logging_steps = 1

# Batching: 4 samples per device, with gradients accumulated over 4 steps.
per_device_train_batch_size = 4
gradient_accumulation_steps = 4

# Optimizer and learning-rate schedule.
optim = "paged_adamw_32bit"
learning_rate = 2e-4
max_grad_norm = 0.3
# NOTE(review): a warmup ratio is set together with a "constant" schedule —
# confirm whether "constant_with_warmup" was intended.
warmup_ratio = 0.03
lr_scheduler_type = "constant"

# Training stops after this many optimizer steps.
max_steps = 120

training_arguments = TrainingArguments(
    output_dir=output_dir,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    fp16=True,
    save_steps=save_steps,
    logging_steps=logging_steps,
)


Pass everything to the trainer

In [None]:
from trl import SFTTrainer

# Maximum sequence length handed to the trainer.
max_seg_length = 512

# Collect the trainer inputs: the quantized model, the LoRA configuration, the
# training split (its 'text' column holds the samples) and the training arguments.
sft_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset_dict['train'],
    dataset_text_field='text',
    max_seq_length=max_seg_length,
    peft_config=peft_config,
    args=training_arguments,
)
trainer = SFTTrainer(**sft_kwargs)
trainer



Map:   0%|          | 0/39750 [00:00<?, ? examples/s]

<trl.trainer.sft_trainer.SFTTrainer at 0x7b5f6f212cb0>

Pre-process the model by upcasting the layer norms to float32 for more stable training

In [None]:
# Cast every submodule whose qualified name contains 'norm' to float32.
# Module.to() converts the module in place, so no reassignment is needed.
norm_layers = [
    layer
    for layer_name, layer in trainer.model.named_modules()
    if 'norm' in layer_name
]
for layer in norm_layers:
    layer.to(torch.float32)

## 6. Train the model

Now let's train the model. Simply call `trainer.train()`

In [None]:
# Run the fine-tuning loop with the trainer configured above; as the last
# expression of the cell, the returned training summary is displayed.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 37


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 37


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.8867
2,4.2762
3,4.2403
4,4.2268
5,4.1094
6,4.4875
7,4.0946
8,4.034
9,4.1821
10,3.8401


TrainOutput(global_step=120, training_loss=2.6549172540505728, metrics={'train_runtime': 904.4487, 'train_samples_per_second': 2.123, 'train_steps_per_second': 0.133, 'total_flos': 598065665310720.0, 'train_loss': 2.6549172540505728, 'epoch': 0.05})

In [None]:
# Build evaluation prompts from the product name only. The 'text' column already
# ends with the true department ("<product> --> <department>"), so using it as a
# prompt would hand the answer to the model and make the evaluation meaningless.
lst_test_data = [f"{product_name} -->" for product_name in test_df['product_name']]

# Evaluate on a small sample to keep generation time short.
sample_size = 25
lst_test_data_short = lst_test_data[:sample_size]

In [None]:
# Rebuild the text-generation pipeline now that training has run.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# Sampling settings: top-k sampling, one sequence per prompt, at most 100 tokens.
generation_kwargs = dict(
    max_length=100,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

sequences = pipeline(lst_test_data_short, **generation_kwargs)

# Show each generated text next to the position of its prompt.
for ix in range(len(sequences)):
  print(ix, sequences[ix][0]['generated_text'])



0 Soothing Bath Treatment --> personal care bath/body wash --> bath --> body care --> dry skin care --> body wash --> personal care --> dry skin wash --> bath care --> body wash for dry skin --> dry skin bath wash --> luxury dry skin wash --> body wash for dry skin care --> dry skin wash --> dry skin body wash --> dry skin bath body wash --> body wash for dry skin --> dry skin wash for body --> lux
1 Carbon Filtered Water --> beverages and drinks --> water --> still water --> carbon filtered water --> sparkling water --> flavored sparkling water --> lemonade --> lemonade with green tea --> iced tea --> iced tea with lemon --> iced tea with lemon and green tea --> juice --> juice drinks --> fruit drinks --> sports drinks --> energy drinks --> energy shots --> energy mixes --> coffee and tea --> hot beverages
2 Breakfast On The Go! Berry Nut Blend Nut & Granola Mix --> breakfast cereals eggs baking goods produce dairy meats seafood --> dairy eggs produce frozen foods household cleaning p

In [None]:
def correct_answer(ans):
  """Extract the predicted department from a generated "product --> department" text.

  Args:
    ans: Generated text, expected to contain at least one "-->" separator.

  Returns:
    The stripped text between the first and the second "-->" (or up to the end
    of the text when there is only one separator). An empty string is returned
    when the text contains no "-->" at all, instead of raising IndexError.
  """
  _, separator, remainder = ans.partition("-->")
  if not separator:
    # The model produced no separator, so there is no prediction to extract.
    return ""
  # Keep only the first segment; the model may keep generating "--> ..." chains.
  return remainder.split("-->", 1)[0].strip()

# Extract the predicted department from the first candidate of every prompt.
answers = [correct_answer(candidates[0]['generated_text']) for candidates in sequences]

# Ground truth for the same first `sample_size` test rows, re-indexed from 0 so
# the predictions line up with the rows by position.
df_evaluate = (
    test_df.iloc[:sample_size][['product_name', 'department']]
    .reset_index(drop=True)
)
df_evaluate['department_predicted'] = answers

df_evaluate

Unnamed: 0,product_name,department,department_predicted
0,Soothing Bath Treatment,personal care,personal care bath/body wash
1,Carbon Filtered Water,beverages,beverages and drinks
2,Breakfast On The Go! Berry Nut Blend Nut & Gra...,breakfast,breakfast cereals eggs baking goods produce da...
3,Organic Sensible Sipper Apple Juice,beverages,beverages
4,Parchment Baking Paper,pantry,pantry dry goods snacks baking supplies
5,Chicken Fingers With Macaroni & Cheese,frozen,frozen foods seafood and fish frozen meals and...
6,Naked Blues,snacks,snacks sea salt & pepper almonds
7,Lori's Lemon Tea,beverages,beverages tea iced tea
8,Key Lime Pie,snacks,snacks bread snacks frozen meals ice cream fro...
9,Orange Drink Mix,beverages,beverages
