### Distilling step-by-step fine-tuning approach – trying an enhanced rationale with specific reasoning for date conversion

In [1]:
!sudo pip install -q transformers --upgrade
!sudo pip install -q peft

In [2]:
# Display the installed transformers version so the recorded output documents
# which release this run used.
import transformers
transformers.__version__

'4.35.0'

In [3]:
import os
import torch
from datasets import load_dataset
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
import pandas as pd
import torch

In [4]:
# Base model identifier on the Hugging Face hub. In the cells shown here it is
# only passed to AutoTokenizer.from_pretrained; no training happens in this notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [5]:
# Entity catalogue that is substituted verbatim into the prompt as `{context}`.
# It is a JSON-style string (not a parsed dict) grouping entities under
# MEASURE, DIMENSION, FILTER, DERIVED MEASURE and DATE VARIABLE; every entity
# lists its "other names", and FILTER entries additionally name a "parent" dimension.
context = """{
    "MEASURE": [{"ENTITY": "Discount", "other names": ["discount", "discount rate", "discount value", "deduction"]},
                {"ENTITY": "Purchase Vol", "other names": ["purchase", "purchase value", "purchase model"]},
                {"ENTITY": "Quantity", "other names": ["quantity", "volume"]},
                {"ENTITY": "Sales", "other names": ["sales", "sale"]}],
    "DIMENSION": [{"ENTITY": "Sub-Category", "other names": ["sub-category", "sub category", "categories", "section"]},
                  {"ENTITY": "Segment", "other names": ["segment", "segments", "units", "divisions"]},
                  {"ENTITY": "Parts", "other names": ["parts", "part", "section", "divisions"]},
                  {"ENTITY": "Country", "other names": ["country", "countries"]}],
    "FILTER": [{"ENTITY": "Consumer", "other names": ["consumers", "consumer"], "parent": "Segment"},
               {"ENTITY": "Phone", "other names": ["phone", "phones", "mobile phones"], "parent": "Sub-Category"},
               {"ENTITY": "Binder", "other names": ["binders", "binder"], "parent": "Sub-Category"},
               {"ENTITY": "Corporate", "other names": ["corporates", "corporate"], "parent": "Segment"},
               {"ENTITY": "India", "other names": ["india"], "parent": "Country"},
               {"ENTITY": "Dubai", "other names": ["dubai"], "parent": "Country"}],
    "DERIVED MEASURE": [{"ENTITY": "Ratio",
             "other names": ["ratio", "share", "contribution", "percentage", "proportion", "contributing"]},
            {"ENTITY": "Why", "other names": ["why", "cause of", "reason for", "diagnose"]},
            {"ENTITY": "contribution_to_growth", "other names": ["contribution to growth", "growth", "grown"]},
            {"ENTITY": "kda_transactional", "other names": ["kda", "key drivers", "key driver", "drivers", "driver"]},
            {"ENTITY": "Growth Rate", "other names": ["growth rate", "growth", "grown"]},
            {"ENTITY": "correlation",
             "other names": ["associate", "associated", "association", "associations", "correlate", "correlated",
                             "correlation", "correlations", "relate", "related", "relation", "relations",
                             "relationship",
                             "relationships"]}
            ],
    "DATE VARIABLE": [{"ENTITY": "Order Date", "other names": ["order date", "date", "trend", "time", "when", "mom", "yoy"]}]
    }"""

In [6]:
# Date range substituted into the prompt as the "date reference". The dict is
# formatted with str.format, so the prompt contains its Python repr.
# The values look like DD/MM/YYYY (the end date has day 15) — confirm against the training data.
date_input = {
    "start_date": "01/01/2020",
    "end_date": "15/09/2023"
}

In [7]:
# Sanity check: confirm that PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [8]:
# Load the tokenizer of the base model named in `model_name` (a Mistral model,
# not LLaMA); use_fast=False selects the slow Python tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,
                                          # add_eos_token=True,
                                          use_fast=False)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [9]:
from peft import PeftModel, PeftConfig

In [10]:
# Filesystem path of the fine-tuned checkpoint that is loaded for inference
# (despite the variable name this is a local directory, not a hub model id).
new_model_name = "/data/mistral/query-to-mql/exp-9/nov-01/checkpoint-4000"

In [11]:
from peft import AutoPeftModelForCausalLM

# Load the PEFT checkpoint in bfloat16; device_map="auto" lets the library decide
# where to place the weights.
model = AutoPeftModelForCausalLM.from_pretrained(new_model_name, device_map="auto", torch_dtype=torch.bfloat16)
# Merge the adapter weights into the base model and rebind `model` to the merged
# result, which is what the generation code below calls.
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Prompt template filled with str.format: `{context}`, `{date_input}` and `{user_query}`.
# It ends with an opening "[MQL]" line; the prediction code later splits the decoded
# text on '[MQL]\n' and '\n[/MQL]' to extract the structured output.
query_template_v1 = """Given the context : {context} and date reference: {date_input}, the query: {user_query}, is converted into below shown structured output.
[MQL]
"""

In [13]:
def predict_template_query_v1(user_query, max_length=1700):
    """Generate the structured MQL text for a natural-language query.

    The prompt is built from ``query_template_v1`` using the module-level
    ``context`` and ``date_input`` plus ``user_query``, then passed to
    ``model.generate``.

    Args:
        user_query: The natural-language query inserted into the prompt.
        max_length: Value forwarded to ``generate`` as ``max_length``
            (in transformers this bounds prompt plus generated tokens).

    Returns:
        A ``(mql_text, decoded)`` tuple. ``decoded`` is the full decoded
        sequence returned by ``generate``; ``mql_text`` is the part of it that
        follows the first ``'[MQL]\\n'`` marker, cut at the next ``'\\n[/MQL]'``.

    Raises:
        ValueError: If the decoded text contains no ``'[MQL]\\n'`` marker.
    """
    prompt = query_template_v1.format(context=context,
                                      user_query=user_query,
                                      date_input=date_input)
    encoded = tokenizer(prompt, return_tensors="pt")
    # Send the inputs to wherever the model actually lives instead of assuming 'cuda'.
    device = model.device
    outputs = model.generate(
        input_ids=encoded["input_ids"].to(device),
        # Pass the mask explicitly: pad and eos share one token id, so it
        # cannot be inferred reliably from the input ids.
        attention_mask=encoded["attention_mask"].to(device),
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(outputs[0])
    segments = decoded.split('[MQL]\n')
    if len(segments) < 2:
        raise ValueError("generated text does not contain the '[MQL]' marker: %r" % decoded)
    return segments[1].split('\n[/MQL]')[0], decoded

In [14]:
def inference(user_query):
    """Run the model on ``user_query`` and return the parsed MQL and the reasoning text.

    Returns:
        A ``(mql, steps)`` tuple: ``mql`` is the object obtained by evaluating
        the MQL text returned by ``predict_template_query_v1``; ``steps`` is
        ``'Step 1:'`` followed by the text after the first ``'\\nStep 1:'``
        marker in the full decoded output (up to the next such marker, if any).

    Raises:
        IndexError: If the decoded output contains no ``'\\nStep 1:'`` marker.
    """
    mql_text, full_text = predict_template_query_v1(user_query=user_query)
    # NOTE(review): eval() executes whatever text the model generated. Keep this
    # to trusted, offline use; consider a literal-only parser if the MQL is
    # always a plain Python literal.
    parsed_mql = eval(mql_text)
    reasoning_tail = full_text.split('\nStep 1:')[1]
    return parsed_mql, 'Step 1:' + reasoning_tail

In [None]:
user_query_list = ["sales in jan 2020 versus year ago" ,"list of sales in jan 2019 compared to last 2 months" ,"sales in last 2 months vs year ago" ,"sales in last 2 months compared to prior year" ,"sales in 2020 compared to last 2 months" ,"sales in Q2 2020 compared to prior year" ,"sales in this week" ,"sales in present week" ,"sales in current week" ,"sales in this week vs pp" ,"sales in this week compared to prior year" ,"sales in previous year" ,"sales in this week vs 2020" ,"sales in 2020 vs this week" ,"sales in 2nd week of 2021" ,"sales in last week" ,"sales in  L4W of 2021" ,"sales in last 4 weeks of 2021" ,"sales in present week vs previous week" ,"sales in L2w vs last year" ,"sales in next week" ,"sales in next 2 week" ,"sales in weekly forecast of 2021" ,"sales in past 2 weeks" ,"sales in p2w" ,"sales in jan 2020 versus year ago" ,"sales in jan 2019 compared 2018" ,"sales in jan 2019 compared year ago" ,"sales in jan 2018 vs year ago" ,"sales in jan 2020 vs 2018" ,"sales in 2017 vs year ago" ,"sales in 2018 vs year ago" ,"sales in 2018 vs 2019" ,"sales in 2018 compared to 2020" ,"sales in 2019 compared to year ago" ,"sales in 2020 compared year ago" ,"sales in 2020 vs 2021" ,"sales in sales 2020 vs 2018" ,"sales in last 2 months vs year ago" ,"sales in jan 2019 compared to last 2 months" ,"sales in last 2 months compared to prior year" ,"sales in 2020 compared to last 2 months" ,"sales in Q2 2020 compared to prior year" ,"sales in this week" ,"sales in current week" ,"sales in present week" ,"sales in this week vs ya" ,"sales in ya vs this week" ,"sales in this week vs pp" ,"sales in this week compared to prior year" ,"sales in previous year" ,"sales in this week vs 2020" ,"sales in 2020 vs this week" ,"sales in this week vs L4W" ,"sales in this week vs L2W of 2020" ,"sales in L2W of 2020 vs this week" ,"sales in this week vs last week" ,"sales in this week vs l1w" ,"sales in this week vs last 3 week" ,"sales in 2nd week of 2021 vs this week" ,"sales in 
L2W" ,"sales in last week" ,"sales in this week vs L4W of 2021" ,"sales in present week vs Previous week" ,"sales in this week" ,"sales in L2w vs last year" ,"sales in across week" ,"sales in last 2 week" ,"sales in last 3 weeks" ,"sales in L2w vs last year" ,"sales in across week" ,"sales in last 2 week" ,"sales in last 3 weeks" ,"sales in P3w" ,"sales in l2w" ,"sales in L10W" ,"sales in L5W" ,"sales in L20D" ,"sales in L20D in 2020" ,"sales in L20D in this year" ,"sales in L20D vs pp" ,"sales in L7D vs p15d" ,"sales in L10D vs L2W" ,"sales in P10D" ,"sales in L10D" ,"sales in p9d" ,"sales in l9d: True" ,"sales in L20D vs prior year" ,"sales in L20D vs last year" ,"sales in L20D in this year vs previous year" ,"sales in L19d" ,"sales in L19d in this year" ,"sales in L19d in last year" ,"sales in l10d vs ya" ,"sales in dec 2021 vs month ago" ,"sales in last month vs ma" ,"sales in last month of 2021 vs MA" ,"sales in 2021 vs ma" ,"sales in q1 2021 vs ma" ,"sales in last week vs ma" ,"sales in last month of fy2021 vs ma" ,"sales in june 2021 vs ma" ,"sales in last 2 months vs ma" ,"sales in this month vs month ago" ,"sales in month ago" ,"sales in ma" ,"sales in growth rate in this month vs ma" ,"sales in ya" ,"sales in mtd vs ma" ,"sales in 2 ma" ,"sales in last month vs 2 ma" ,"sales in this month vs 3 Ma" ,"sales in this month vs Month Ago" ,"sales in this month vs 6 Month ago" ,"sales in mtd vs pp" ,"sales in first half of 2020" ,"sales in second half of 2021" ,"sales in first half of this year" ,"sales in first half of last 2 year" ,"sales in first half of 2021" ,"sales in first half of 2022 vs second half of 2021" ,"sales in second half of 2022" ,"sales in first half of 2023" ,"sales in first half of 2021 vs second half of 2021" ,"sales in first half in this year" ,"sales in 1st half of 2021" ,"sales in first half of 21" ,"sales in q1 in 2019, 2020" ,"sales in aug , sep in 20, 21" ,"sales in l2w of 2020 ,2021" ,"sales in q1 in this year , last year" ,"sales in 
Aug to sep in 2019,2020" ,"sales in aug to dec in 2019 ,2020" ,"sales in last month in 2019 , 2020" ,"sales in compare 2019 vs ya" ,"sales in Jan 2020 to Aug 2020 vs 2 years ago" ,"sales in Jan 20 to feb 2020 vs Ya" ,"sales in dec 20 to jan 21 vs 2 YA" ,"sales in 31st dec 20 to 10th jan 21 vs 2 YA" ,"sales in q1 2020 to q2 2020 vs 2 ya" ,"sales in YTD from jan 2021" ,"sales in ytd from feb 2020" ,"sales in Aug to sep in 41" ,"sales in Aug to sep  41" ,"sales in Aug to 41 sep" ,"sales in Aug to sep 41" ,"sales in previous year" ,"sales in 2020 vs this week" ,"sales in 2nd week of 2021" ,"sales in L4W of 2021" ,"sales in last 4 weeks of 2021" ,"sales in present week vs previous week" ,"sales in past 2 weeks" ,"sales in p2w" ,"sales in jan 2018 vs year ago" ,"sales in 2018 vs year ago" ,"sales in 2020 compared year ago" ,"sales in 2020 vs 2021" ,"sales in sales 2020 vs 2018" ,"sales in ya vs this week" ,"sales in L2w vs last year" ,"sales in across week" ,"sales in last 2 week" ,"sales in last 3 weeks" ,"sales in P3w" ,"sales in l2w" ,"sales in L10W" ,"sales in L5W" ,"sales in L20D" ,"sales in L20D in 2020" ,"sales in L20D in this year" ,"sales in L20D vs pp" ,"sales in L7D vs p15d" ,"sales in L10D vs L2W" ,"sales in P10D" ,"sales in L10D" ,"sales in p9d" ,"sales in l9d" ,"sales in L20D vs prior year" ,"sales in L20D vs last year" ,"sales in L20D in this year vs previous year" ,"sales in L19d" ,"sales in L19d in this year" ,"sales in L19d in last year" ,"sales in l10d vs ya" ,"sales in dec 2021 vs month ago" ,"sales in last month vs ma" ,"sales in last month of 2021 vs MA" ,"sales in 2021 vs ma" ,"sales in q1 2021 vs ma" ,"sales in last week vs ma" ,"sales in last month of fy2021 vs ma" ,"sales in june 2021 vs ma" ,"sales in last 2 months vs ma" ,"sales in this month vs month ago" ,"sales in month ago" ,"sales in ma" ,"sales in growth rate in this month vs ma" ,"sales in ya" ,"sales in mtd vs ma" ,"sales in 2 ma" ,"sales in last month vs 2 ma" ,"sales in this month vs 
3 Ma" ,"sales in this month vs Month Ago" ,"sales in this month vs 6 Month ago" ,"sales in mtd vs pp" ,"sales in first half of 2020" ,"sales in second half of 2021" ,"sales in first half of this year" ,"sales in first half of last 2 year" ,"sales in first half of 2021" ,"sales in first half of 2022 vs second half of 2021" ,"sales in second half of 2022" ,"sales in first half of 2023" ,"sales in first half of 2021 vs second half of 2021" ,"sales in first half in this year" ,"sales in 1st half of 2021" ,"sales in first half of 21" ,"sales in q1 in 2019, 2020" ,"sales in aug , sep in 20, 21" ,"sales in l2w of 2020 ,2021" ,"sales in q1 in this year , last year" ,"sales in Aug to sep in 2019,2020" ,"sales in aug to dec in 2019 ,2020" ,"sales in last month in 2019 , 2020" ,"sales in compare 2019 vs ya" ,"sales in Jan 2020 to Aug 2020 vs 2 years ago" ,"sales in Jan 20 to feb 2020 vs Ya" ,"sales in dec 20 to jan 21 vs 2 YA" ,"sales in 31st dec 20 to 10th jan 21 vs 2 YA" ,"sales in q1 2020 to q2 2020 vs 2 ya" ,"sales in YTD from jan 2021" ,"sales in ytd from feb 2020" ,"sales in 1 january 2021" ,"sales in 1 jan 2021" ,"sales in 12 january 2021" ,"sales in 12 jan 2021" ,"sales in 1st january 2021" ,"sales in 1st jan 2021" ,"sales in 12th january 2021" ,"sales in 12th jan 2021" ,"sales in 1 january 20" ,"sales in 1 jan 20" ,"sales in 12 january 20" ,"sales in 12 jan 20" ,"sales in 1st january 20" ,"sales in 1st jan 20" ,"sales in 12th january 20" ,"sales in 12th jan 20" ,"sales in 2021 july 1" ,"sales in 2021 jul 1" ,"sales in 2021 june 1st" ,"sales in 2021 jun 2nd" ,"sales in 2021 october 22" ,"sales in 2021 oct 21" ,"sales in 2021 oct 22nd" ,"sales in 2021 october 22nd" ,"sales in 21 october 22" ,"sales in 21 oct 21" ,"sales in 2021 21 january" ,"sales in 2021 21 jan" ,"sales in 2021 22nd january" ,"sales in 2021 22nd jun" ,"sales in 2021 2 jan" ,"sales in 2021 1 january" ,"sales in 2021 3rd january" ,"sales in 2021 2nd jan" ,"sales in 20 21 january" ,"sales in 20 21 jan" 
,"sales in 20 22nd january" ,"sales in 20 22nd jun" ,"sales in 20 2 jan" ,"sales in 20 1 january" ,"sales in 20 3rd january" ,"sales in 21 2nd jan" ,"sales in 20 july 1" ,"sales in 21 jul 1" ,"sales in 21 june 1st" ,"sales in 21 jun 2nd" ,"sales in 20 oct 22nd" ,"sales in 21 october 22nd" ,"sales in 2021,21 january" ,"sales in 2021,21 jan" ,"sales in 2021,22nd january" ,"sales in 2021,22nd jun" ,"sales in 2021,2 jan" ,"sales in 2021,1 january" ,"sales in 2021,3rd january" ,"sales in 2021,2nd jan" ,"sales in 20,21 january" ,"sales in 20,21 jan" ,"sales in 20,22nd january" ,"sales in 20,22nd jun" ,"sales in 20,2 jan" ,"sales in 20,1 january" ,"sales in 20,3rd january" ,"sales in 21,2nd jan"]
# Smoke-test subset: the first 33 queries of user_query_list (identical to the
# previously hand-copied literal), derived by slicing so the two cannot drift apart.
sample_list = user_query_list[:33]

In [None]:
%%time 
data_fin = []
for user_query in sample_list:
    print('user query: ', user_query)
    print('-'*100)
    output, raw = predict_template_query_v1(user_query=user_query)
    print(eval(output))
    print('-'*100)
    steps = 'Step 1:' +raw.split('\nStep 1:')[1]
    print('Step 1:' +raw.split('\nStep 1:')[1])
    print('-'*100)
    data_fin.append([user_query,eval(output), steps])
import csv
with open('/data/data.csv', 'a', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["Query", "Intermediate MQL", "Reasoning"])

# Write data iteratively
    for row in data_fin[0:]:
        csvwriter.writerow(row)