In [1]:
!sudo pip install -q transformers --upgrade

In [2]:
import transformers
transformers.__version__

'4.34.1'

In [3]:
#!sudo pip install -q accelerate peft==0.4.0 bitsandbytes trl==0.4.7

In [4]:
import os
import torch
from datasets import load_dataset
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd
import torch

In [5]:
df = pd.read_csv('/data/mistral/query-to-mql/Live_Usage_queries_with_mql_formatted.csv')

In [6]:
df.columns

Index(['query', 'mql', 'account_id', 'metadata', 'measure', 'dimension',
       'derived_measure', 'date', 'measure_mql', 'dimension_mql', 'action_mql',
       'date_mql', 'measure_new'],
      dtype='object')

In [7]:
df.shape

(17334, 13)

In [8]:
g = torch.Generator().manual_seed(1234)

In [9]:
train_size = 1000

In [10]:
rows = torch.randint(0,df.shape[0],(train_size,))

In [11]:
train_df = df[['query','measure_new','measure_mql']].iloc[rows.tolist()]

In [12]:
val_df = df[['query','measure_new','measure_mql']].drop(rows.tolist())
val_df = val_df[:30]

In [13]:
val_df.shape, train_df.shape

((30, 3), (1000, 3))

In [14]:
val_df.to_csv('val_df_query_to_mql.csv', index=True)
train_df.to_csv('train_df_query_to_mql.csv', index=True)

In [6]:
train_df = pd.read_csv('train_df_query_to_mql.csv')

In [16]:
# The model that you want to train from the Hugging Face hub
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# The instruction dataset to use
#dataset_name = ""

# Fine-tuned model name
#new_model = "mistral-ft-peft-on-template_and_user_query-data"

In [17]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [18]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

In [51]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "/data/mistral/query-to-mql/0ct-25"

# Number of training epochs
num_train_epochs = 15

# Enable fp16/bf16 training (set bf16 to True with an A100)
# fp16 = False
fp16 = True # not using quantisation
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True


# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = 1000

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 50

# Log every X updates steps
logging_steps = 50

In [52]:
################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [21]:
promt_template_measure = """<s>[INST]<<SYS>>
Given the CONTEXT:{context}, convert the 'user query' into JSON format which captures basic measures asked by user and maps it to CONTEXT.

<</SYS>>
User query : {user_query}

Converted JSON is as shown below: 
[/INST]
[MEASUREMQL]
{measure_mql}
[/MEASUREMQL]</s>
"""

In [62]:
promt_template_v1 = """<s>[INST]<<SYS>>
You are an assistant that helps to map the user question to the a particular JSON format which contains info asked by user and also maps it the below CONTEXT. You might also need to act as a time tagger expert to convert the date elements present in the question to a standard format and to find possible date ranges for the same.

CONTEXT:{context}

Step 1: Identify the n-grams match between question and context

        Map the n-gram or their lemma or their inflections from the question with the values in the passed context.
        Always consider the longest n-gram match, not the sub-string.
        If there are multiple matches for an n-gram with context, return all such ENTITY in response.
        If you are returning any match which is not exactly present with the context, make sure that it is a noun phrase and there is a high similarity between the match and the matched value in context. 


Step 2: Applying time tagger rules only if time elements are present in question

        Identify the TIME ELEMENTS in the input question and convert it to a standard format (if not already) by applying the general time tagging rules. If the TIME ELEMENT is already in a standard format, then no need to convert it.
        TIME ELEMENT can be either a temporal interval (across months, yoy, mom, qoq, wow, quarterly etc.) or a temporal expression (time points such as specific dates, relative expressions etc.).
        Calculate date range for each time points based on the following conditions:
        1. For relative time expressions, calculate the date range based on a reference date - By default the reference date is the end_date in date input: {date_input}
        2. To calculate the date range for "last X years", strictly follow below conditions:
                For "last 1 year", consider exactly one year before the reference year and set start date as January 1 and end date as Decemebr 31 of that year.
                For "last X years", where X is greater than 1, consider starting year = (reference year - X+1) and set start date as January 1 of starting year and end date as the reference date.
        3. To calculate the date range for "last X months", strictly follow below conditions:
                Consider reference month as the month in reference date.
                For "last 1 month", consider exactly one month before the reference month and set start date as first day and end date as last day of that month.
                For "last X months", where X is greater than 1, consider starting month = (reference month - X+1) and set start date as first day of starting month and end date as the reference date. (Example: if reference date is 14/09/2022, then last 3 months = 01/07/2022 - 14/09/2022)
        4. To calculate the date range for "last X quarters", strictly follow below conditions:
                For "last 1 quarter", consider exactly one quarter before the reference quarter and set start date as first day and end date as last day of that quarter .
                For "last X quarter", where X is greater than 1, consider starting quarter = (reference quarter - X+1) and set start date as first day of starting quarter and end date as the reference date.
        5. To calculate the date range for "last X weeks", strictly follow the below conditions:
                Consider reference week as the week in reference week.
                For "last 1 week", set start date as Monday and end date as Sunday of the previous week of reference week. (Example: if reference date is 14/09/2022, then last week = 05/09/2022 - 11/09/2022)
                For "last X weeks", set start date as Monday of reference week and set start date as the Monday of that week and end date as reference date. 
        6. Provide the date range of each time point in "start date - end date" format always.

<</SYS>>
User question is : {user_query}

Converted JSON is as shown below: 
[/INST]
[MQL]
{mql}
[/MQL]</s>"""

In [22]:
train_df.columns

Index(['Unnamed: 0', 'query', 'measure_new', 'measure_mql'], dtype='object')

In [23]:
def create_fine_tuning_dataset(row):
    mql = eval(row['mql'])[0]['mql']
    user_query = row['query']
    date_input = {"start_date": "01/01/2020", "end_date": "15/09/2023"}
    context = row['metadata_none_removed']
    formated = promt_template_v1.format(context=context,
                                        date_input = date_input,
                                        user_query=user_query,
                                        mql=mql)
    return formated

In [24]:
def create_fine_tuning_dataset_measure(row):
    measure_mql = row['measure_mql']
    user_query = row['query']
    context = row['measure_new']
    formated = promt_template_measure.format(context=context,
                                             user_query=user_query,
                                             measure_mql=measure_mql)
    return formated

In [25]:
train_df['fine_tuning_dataset']=train_df.apply(create_fine_tuning_dataset_measure, axis=1)

In [26]:
# train_df['fine_tuning_dataset']=train_df.apply(create_fine_tuning_dataset, axis=1)

In [27]:
#val_df['fine_tuning_dataset']=val_df.apply(create_fine_tuning_dataset, axis=1)

In [28]:
val_df['fine_tuning_dataset']=val_df.apply(create_fine_tuning_dataset_measure, axis=1)

In [29]:
train_df.shape

(1000, 5)

In [30]:
train_df = train_df[['fine_tuning_dataset']]

In [31]:
train_df.shape

(1000, 1)

In [32]:
val_df = val_df[['fine_tuning_dataset']]
val_df.shape

(30, 1)

In [33]:
val_df.reset_index(inplace=True)
train_df.reset_index(inplace=True)

In [34]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [35]:
train_dataset

Dataset({
    features: ['index', 'fine_tuning_dataset'],
    num_rows: 1000
})

In [36]:
val_dataset

Dataset({
    features: ['index', 'fine_tuning_dataset'],
    num_rows: 30
})

In [37]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [38]:
bnb_4bit_quant_type

'nf4'

In [39]:
compute_dtype

torch.float16

In [40]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [12]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [8]:
!sudo pip install -q pynvml

In [9]:
from pynvml.smi import nvidia_smi
nvsmi = nvidia_smi.getInstance()
nvsmi.DeviceQuery('memory.free, memory.total')

{'gpu': [{'fb_memory_usage': {'total': 16384.0,
    'free': 15972.9375,
    'unit': 'MiB'}}]}

In [44]:
#!df -H

In [10]:
torch.cuda.is_available()

True

In [46]:
# del model
# torch.cuda.empty_cache()

In [47]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
#     torch_dtype=torch.bfloat16,
    device_map="auto"
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,
                                          # add_eos_token=True,
                                          use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [49]:
max([len(tokenizer.encode(train_df['fine_tuning_dataset'][i])) for i in range(train_size)])

242

In [53]:
# LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules = ["q_proj", "v_proj"],
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [54]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    eval_steps=50, # requires when eval_dataset is defined
    per_device_eval_batch_size=1, # Batch size for evaluation
    evaluation_strategy="steps", # requires when eval_dataset is defined
    logging_strategy="steps",
    logging_steps=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=1000,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [55]:
## Getting FLOPs of model

model_flops = (
  model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, 512)
      )
    }
  )
  * training_arguments.gradient_accumulation_steps
)

#print(model)
print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

Memory footprint 4.551360512 GB
Flops 21843.947814912 GFLOPs


In [56]:
train_dataset

Dataset({
    features: ['index', 'fine_tuning_dataset'],
    num_rows: 1000
})

In [57]:
#print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [57]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field="fine_tuning_dataset",
    max_seq_length=256,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [58]:
# Train model
trainer.train()

Step,Training Loss,Validation Loss
50,0.4454,0.551327
100,0.355,0.46749
150,0.2866,0.373558
200,0.1925,0.278459
250,0.1936,0.239976
300,0.1806,0.254446
350,0.2318,0.221894
400,0.1887,0.22762
450,0.1713,0.228516
500,0.1807,0.230014


TrainOutput(global_step=1000, training_loss=0.268141952611506, metrics={'train_runtime': 3434.8712, 'train_samples_per_second': 1.165, 'train_steps_per_second': 0.291, 'total_flos': 2.7503331250569216e+16, 'train_loss': 0.268141952611506, 'epoch': 4.0})

In [None]:
# Fine-tuned model name
new_model_name = "mistral-ft-peft-v1-lr-64-with-more-data"

In [None]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [59]:
trainer.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (v_proj)

In [12]:
from peft import PeftModel, PeftConfig

In [5]:
new_model_name = "/data/mistral/query-to-mql/0ct-25/checkpoint-800"

In [6]:
#del model
torch.cuda.empty_cache()

In [7]:
#del model
torch.cuda.empty_cache()

from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(new_model_name, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

In [83]:
# output_merged_dir = os.path.join(new_model_name, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

In [8]:
query_template_v1 = """<s>[INST]<<SYS>>
Given the CONTEXT:{context}, convert the 'user query' into JSON format which captures basic measures asked by user and maps it to CONTEXT.

<</SYS>>
User query : {user_query}

Converted JSON is as shown below: 
[/INST]
[MEASUREMQL]
"""

In [9]:
#model_to_merge.to('cuda')

In [22]:
df['query'][0],df['measure'][0],df['measure_mql'][0]

('trend of profit',
 "{'measure': 'PROFIT', 'measure_label': 'Profit in Dollars'}",
 "{'measure': {'profit': {'PROFIT': {'label': 'Profit in Dollars', 'order': 'desc', 'operator': 'sum'}}}}")

In [10]:
def predict_template_query_v1(user_query, context):
    inp = query_template_v1.format(context=context,
                                   user_query=user_query)
    _inputs = tokenizer.encode(inp, return_tensors="pt")
    outputs = model.generate(input_ids=_inputs.to('cuda'), max_length= 256, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(outputs[0])
    output_new = output.split('[MEASUREMQL]\n')[1]
    return output_new.split('\n[/MEASUREMQL]')[0]

### Testing on data used for query to intermediate using openai prompt base approach

In [15]:
%%time
i=0
user_query = 'show me the 2 top segments basis sales'
print('user query: ', user_query)
context = "{'measure': ['SCHEDULEDHOURS', 'Sales', 'PROFIT'], 'measure_label': ['Number Requests For Shift','PROFIT']}"
output = predict_template_query_v1(user_query=user_query,context=context)
output

user query:  show me the 2 top segments basis sales
CPU times: user 6.41 s, sys: 218 ms, total: 6.63 s
Wall time: 6.63 s


"{'measure': None}"

In [17]:
%%time
i=0
user_query = 'top 2 and bottom 3 segments by sales'
print('user query: ', user_query)
context = "{'measure': ['SCHEDULEDHOURS', 'Sales', 'PROFIT'], 'measure_label': ['Number Requests For Shift','Sales']}"
output = predict_template_query_v1(user_query=user_query,context=context)
output

user query:  top 2 and bottom 3 segments by sales
CPU times: user 6.46 s, sys: 174 ms, total: 6.64 s
Wall time: 6.64 s


"{'measure': None}"

In [18]:
%%time
i=0
user_query = 'top 2 segments and bottom 3 sub-category basis quantity'
print('user query: ', user_query)
context = "{'measure': ['SCHEDULEDHOURS', 'Quantity',], 'measure_label': ['Quantity in millions']}"
output = predict_template_query_v1(user_query=user_query,context=context)
output

user query:  top 2 segments and bottom 3 sub-category basis quantity
CPU times: user 8.19 s, sys: 191 ms, total: 8.38 s
Wall time: 8.38 s


"{'measure': None}"

In [None]:
%%time
i=0
user_query = 'top 2 segments and bottom 3 sub-category basis quantity'
print('user query: ', user_query)
context = "{'measure': ['SCHEDULEDHOURS', 'Quantity',], 'measure_label': ['Quantity in millions']}"
output = predict_template_query_v1(user_query=user_query,context=context)
output

In [25]:
%%time
user_query = 'why did profit changed in may 2023'
print('user query: ', user_query)
context = "{'measure': ['PROFIT'], 'measure_label': ['Number Requests For Shift','PROFIT']}"
output = predict_template_query_v1(user_query=user_query,context=context)
output

user query:  why did profit changed in may 2023
CPU times: user 8.5 s, sys: 184 ms, total: 8.68 s
Wall time: 8.69 s


"{'measure': None}"

In [24]:
%%time
i=0
user_query = df['query'][i]
print('user query: ', user_query)
context = df['measure_new'][i]
output = predict_template_query_v1(user_query=user_query,context=context)
output, df['measure_mql'][i]

user query:  trend of profit


  next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)


CPU times: user 7.88 s, sys: 220 ms, total: 8.1 s
Wall time: 8.11 s


("{'measure': {'profit': {'PROFIT': {'label': 'Profit in Dollars', 'order': 'desc', 'operator': 'sum'}}}}",
 "{'measure': {'profit': {'PROFIT': {'label': 'Profit in Dollars', 'order': 'desc', 'operator': 'sum'}}}}")

In [25]:
%%time
i=2
user_query = df['query'][i]
print('user query: ', user_query)
context = df['measure_new'][i]
output = predict_template_query_v1(user_query=user_query,context=context)
output, df['measure_mql'][i]

user query:  why profit change in july 2022
CPU times: user 7.12 s, sys: 261 ms, total: 7.38 s
Wall time: 7.38 s


("{'measure': {'profit': {'PROFIT': {'label': 'Profit in Dollars', 'order': 'desc', 'operator': 'sum'}}}}",
 "{'measure': {'profit': {'PROFIT': {'label': 'Profit in Dollars', 'order': 'desc', 'operator': 'sum'}}}}")

In [26]:
train_df.columns

Index(['Unnamed: 0', 'query', 'measure_new', 'measure_mql'], dtype='object')

In [27]:
train_df.rename(columns={'Unnamed: 0':'index'}, inplace=True)

In [28]:
train_df.columns

Index(['index', 'query', 'measure_new', 'measure_mql'], dtype='object')

In [29]:
df.shape[0]

17334

In [30]:
untrained_index = set(range(df.shape[0]))-set(train_df['index'].tolist())

In [31]:
%%time
i=2000
user_query = df['query'][i]
print('user query: ', user_query)
context = df['measure_new'][i]
output = predict_template_query_v1(user_query=user_query,context=context)
output, df['measure_mql'][i]

user query:  what is growth of number of providers across arizona
CPU times: user 5.44 s, sys: 204 ms, total: 5.64 s
Wall time: 5.64 s


("{'measure': None}", "{'measure': None}")

In [32]:
import random
from tqdm import tqdm

In [33]:
untrained_500_random = random.choices(list(untrained_index), k=500)

In [44]:
prediction = []
for i in tqdm(untrained_500_random):
    user_query = df['query'][i]
    context = df['measure_new'][i]
    output = predict_template_query_v1(user_query=user_query,context=context)
    prediction.append([df['measure_mql'][i],output])

100%|██████████| 500/500 [58:00<00:00,  6.96s/it] 


In [47]:
prediction[0][1]

"{'measure': None}"

In [48]:
pred_actual = []
pred_pred = []

for p in prediction:
    pred_actual.append(p[0])
    pred_pred.append(p[1])

In [55]:
df.columns

Index(['query', 'mql', 'account_id', 'metadata', 'measure', 'dimension',
       'derived_measure', 'date', 'measure_mql', 'dimension_mql', 'action_mql',
       'date_mql', 'measure_new'],
      dtype='object')

In [59]:
type(untrained_500_random)

list

In [62]:
pred_query = list(df['query'][untrained_500_random])
pred_measure = list(df['measure_new'][untrained_500_random])

In [63]:
pred_df = pd.DataFrame()

In [65]:
pred_df['query'] = pred_query
pred_df['measure_metadata'] = pred_measure
pred_df['actual_mql'] = pred_actual
pred_df['pred_mql'] = pred_pred

In [82]:
pred_df.to_csv('/data/mistral/query-to-mql/0ct-25/pred_df.csv', index=False)

In [74]:
correct = []
for i,row in pred_df.iterrows():
    if row['actual_mql']==row['pred_mql']:
        correct.append(1)
    else:
        correct.append(0)

In [75]:
pred_df['correct']=correct

In [84]:
pred_df

Unnamed: 0,query,measure_metadata,actual_mql,pred_mql,correct
0,what are fill rate in aug 2022 for cna,"{'measure': ['FSP_LONGITUDE', 'FILL_RATE_ASSIG...",{'measure': None},{'measure': None},1
1,what is trend of number of requested shifts,"{'measure': ['AFS_NUMBER_REQUESTS_FOR_SHIFT', ...",{'measure': None},{'measure': None},1
2,what is worked hours in quarter2 2022 vs quart...,"{'measure': ['SRD_RATE', 'FPP_LATITUDE', 'AFS_...",{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,1
3,what is monthly growth rate of number of facil...,"{'measure': ['FILL_RATE_OPENED', 'Active Facil...",{'measure': None},{'measure': None},1
4,what is daily trend of number of facility in 2023,"{'measure': ['FFP_LONGITUDE', 'FOP_LATITUDE'],...",{'measure': None},{'measure': None},1
...,...,...,...,...,...
495,what will be worked hours in next 6 months,"{'measure': ['FFP_NUM_TEAM_MEMBERS', 'AFS_DEFA...",{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,0
496,what is trend of profit,"{'measure': ['OPENED_HOURS', 'AFS_CONTRACT_RAT...",{'measure': {'profit': {'PROFIT': {'label': 'P...,{'measure': {'profit': {'PROFIT': {'label': 'P...,1
497,top 5 state contributing to number of provider...,"{'measure': ['FFP_LATITUDE', 'PROFIT'], 'measu...",{'measure': None},{'measure': None},1
498,what is monthly trend of filled hours in texas...,"{'measure': ['FPP_LATITUDE', 'COUNT(DISTINCT F...",{'measure': None},{'measure': None},1


In [81]:
for i, k in pred_df[pred_df['correct']==0]

Unnamed: 0,actual_mql,actual_mql.1
20,{'measure': {'profit': {'PROFIT': {'label': 'P...,{'measure': {'profit': {'PROFIT': {'label': 'P...
28,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...
32,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...
35,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...
36,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...
...,...,...
459,{'measure': {'profit': {'PROFIT': {'label': 'P...,{'measure': {'profit': {'PROFIT': {'label': 'P...
462,{'measure': {'profit': {'PROFIT': {'label': 'P...,{'measure': {'profit': {'PROFIT': {'label': 'P...
478,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...
480,{'measure': {'worked hours': {'AFS_WORKED_HOUR...,{'measure': {'worked hours': {'AFS_WORKED_HOUR...


In [17]:
prompt = query_template_v2.format(user_query='brands least profitable in 2021')

In [18]:
tokens = tokenizer.encode(prompt, return_tensors="pt")

In [22]:
%%time
outputs = model.generate(input_ids=tokens.to('cuda'), max_length= 180, )

CPU times: user 6.61 s, sys: 135 ms, total: 6.75 s
Wall time: 6.75 s


In [23]:
tokenizer.decode(outputs[0])

'<s>[INST]<<SYS>>\nYou are an advanced template converter that converts user question to a specific template which answers the user question.\n\n<</SYS>>\n\nbrands least profitable in 2021\n[/INST]\n[LUMINTEMPLATE]\nList of brands with lowest profit in 2021\n[/LUMINTEMPLATE]\n\nWhich are the top 5 brands based on profit share in 2021\n[/LUMINTEMPLATE]\n\nWhich are the top 5 brands based on market share in 2021\n[/LUMINTEMPLATE]\n\nWhich are the top 5 brands based on sales share in 2021\n[/LUMINTEMPLATE]\n\nWhich are the top 5'