In [1]:
!sudo pip install -q transformers==4.35.0

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
import torch

In [4]:
# Hub identifier used to load the tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [5]:
# Load the slow (non-fast) tokenizer published under `model_name`.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [6]:
# Load the merged model checkpoint from local disk in bfloat16, with every weight mapped to the CPU.
model = AutoModelForCausalLM.from_pretrained(
    "/data/mistral/query-to-mql/exp-10/nov-20/merged-model",
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Entity catalogue substituted verbatim into every prompt as {context}.
# It is a JSON-style string grouping entities by type (MEASURE, DIMENSION, FILTER,
# DERIVED MEASURE, DATE VARIABLE); each entity lists the "other names" it may be
# referred to by, and FILTER entries also name their "parent" dimension.
# NOTE(review): a few synonyms appear under more than one entity ("section",
# "divisions", "growth", "grown") — confirm that this ambiguity is intended.
context = """{
    "MEASURE": [{"ENTITY": "Discount", "other names": ["discount", "discount rate", "discount value", "deduction"]},
                {"ENTITY": "Purchase Vol", "other names": ["purchase", "purchase value", "purchase model"]},
                {"ENTITY": "Quantity", "other names": ["quantity", "volume"]},
                {"ENTITY": "Sales", "other names": ["sales", "sale"]}],
    "DIMENSION": [{"ENTITY": "Sub-Category", "other names": ["sub-category", "sub category", "categories", "section"]},
                  {"ENTITY": "Segment", "other names": ["segment", "segments", "units", "divisions"]},
                  {"ENTITY": "Parts", "other names": ["parts", "part", "section", "divisions"]},
                  {"ENTITY": "Country", "other names": ["country", "countries"]}],
    "FILTER": [{"ENTITY": "Consumer", "other names": ["consumers", "consumer"], "parent": "Segment"},
               {"ENTITY": "Phone", "other names": ["phone", "phones", "mobile phones"], "parent": "Sub-Category"},
               {"ENTITY": "Binder", "other names": ["binders", "binder"], "parent": "Sub-Category"},
               {"ENTITY": "Corporate", "other names": ["corporates", "corporate"], "parent": "Segment"},
               {"ENTITY": "India", "other names": ["india"], "parent": "Country"},
               {"ENTITY": "Dubai", "other names": ["dubai"], "parent": "Country"}],
    "DERIVED MEASURE": [{"ENTITY": "Ratio",
             "other names": ["ratio", "share", "contribution", "percentage", "proportion", "contributing"]},
            {"ENTITY": "Why", "other names": ["why", "cause of", "reason for", "diagnose"]},
            {"ENTITY": "contribution_to_growth", "other names": ["contribution to growth", "growth", "grown"]},
            {"ENTITY": "kda_transactional", "other names": ["kda", "key drivers", "key driver", "drivers", "driver"]},
            {"ENTITY": "Growth Rate", "other names": ["growth rate", "growth", "grown"]},
            {"ENTITY": "correlation",
             "other names": ["associate", "associated", "association", "associations", "correlate", "correlated",
                             "correlation", "correlations", "relate", "related", "relation", "relations",
                             "relationship",
                             "relationships"]}
            ],
    "DATE VARIABLE": [{"ENTITY": "Order Date", "other names": ["order date", "date", "trend", "time", "when", "mom", "yoy"]}]
    }"""

# Date window substituted into every prompt as {date_input}; values are day-first date strings.
date_input = dict(start_date="01/01/2020", end_date="15/09/2023")

In [8]:
# Prompt template: the text ends with an opening "[MQL]" line, after which the model
# is expected to continue with the structured output.
query_template_v1 = (
    "Given the context : {context} and date reference: {date_input}, "
    "the query: {user_query}, is converted into below shown structured output.\n"
    "[MQL]\n"
)

In [10]:
import pandas as pd
# Read the full CSV, then keep only its first 100 rows for this evaluation run.
df = pd.read_csv('/data/mistral/query-to-mql/exp-10/training_data.csv').head(100)

In [11]:
# Sanity check on the frame size after the head(100) truncation.
df.shape

(100, 3)

In [12]:
def predict_template_query_v1(user_query, max_new_tokens=200):
    """Generate the MQL text for a single natural-language query.

    The prompt is built from ``query_template_v1`` with the module-level
    ``context`` and ``date_input``, encoded with ``tokenizer`` and passed to
    ``model.generate``. The MQL is then cut out of the decoded text using the
    ``[MQL]`` / ``[/MQL]`` marker lines.

    Args:
        user_query: Query text substituted into the prompt template.
        max_new_tokens: Upper bound on the number of tokens to generate
            (defaults to 200, the value previously hard-coded here).

    Returns:
        A ``(mql, full_output)`` tuple. ``mql`` is the text following the first
        opening ``[MQL]`` line, truncated at the first closing ``[/MQL]`` line
        when one is present; ``full_output`` is the complete decoded text of
        the first returned sequence.

    Raises:
        IndexError: If the decoded text contains no opening ``[MQL]`` line.
    """
    prompt = query_template_v1.format(context=context,
                                      user_query=user_query,
                                      date_input=date_input)
    # Place the prompt ids on the model's device so the call also works when
    # the model is not loaded on the CPU.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(input_ids=input_ids,
                               max_new_tokens=max_new_tokens,
                               pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(generated[0])
    # The decoded text is split on the opening marker; index 1 is what follows it.
    after_open = output.split('[MQL]\n')[1]
    # If the closing marker is absent, everything after the opening marker is returned.
    return after_open.split('\n[/MQL]')[0], output

In [14]:
from tqdm import tqdm

In [15]:
# Run the model on every query and keep the extracted MQL, keyed by query text.
# NOTE(review): because the dict is keyed by the query string, repeated queries
# collapse into a single entry, so the result can hold fewer entries than `df`
# has rows — confirm the queries are unique before comparing row-by-row.
predicted_mql = {}

# Iterate the column itself and pass its length so tqdm can render a real
# progress bar with an ETA (a bare iterrows() generator has no length).
for user_query in tqdm(df['Query'], total=len(df)):
    predicted_mql[user_query] = predict_template_query_v1(user_query)[0]

100it [2:43:43, 98.23s/it]


In [16]:
# Turn the query -> MQL mapping into a two-column frame and persist it to CSV.
new_df = pd.DataFrame(
    {
        'query': list(predicted_mql.keys()),
        'predicted_mql': list(predicted_mql.values()),
    }
)
new_df.to_csv("predicted_mql_unquantized_model.csv", index=False)

In [17]:
# Display the predictions frame for a visual check.
new_df

Unnamed: 0,query,predicted_mql
0,what is purchase across segments,{'DIMENSION': {'segments': [{'ENTITY': 'Segmen...
1,discount rate of phone and binders,"{'DATE': {'': []}, 'DERIVED MEASURE': {'discou..."
2,discount rate of overall sub-category in corpo...,{'DIMENSION': {'sub-category': [{'ENTITY': 'Su...
3,maximum sales of phone for consumer segment,{'DIMENSION': {'segment': [{'ENTITY': 'Segment...
4,forecast of sales,"{'MEASURE': {'sales': [{'ENTITY': 'Sales', 'ME..."
...,...,...
95,Sales in 2nd last year,{'DATE VARIABLE': {'2nd last year': [{'CONVERT...
96,sales in last to last year,{'DATE VARIABLE': {'last to last year': [{'CON...
97,sales in last to last month,{'DATE VARIABLE': {'last to last month': [{'CO...
98,sales in second last month,{'DATE VARIABLE': {' second last month': [{'CO...


In [19]:
# Load the predictions to compare against.
# NOTE(review): presumably produced by a separate llama-cpp run over the same queries in the same order — confirm.
q_df = pd.read_csv("predicted_mql_llama-cpp.csv")

In [21]:
# Attach the predictions from `new_df` next to the ones already in `q_df`.
# Column assignment aligns on the index, so a row-count mismatch would silently
# leave NaN in the unmatched rows and skew the match rate; fail loudly instead.
assert len(q_df) == len(new_df), (
    f"row count mismatch: q_df has {len(q_df)} rows, new_df has {len(new_df)}"
)
q_df['unqunt_mql']=new_df['predicted_mql']

In [25]:
# Row-wise exact string equality between the two prediction columns.
a = q_df['predicted_mql'].eq(q_df['unqunt_mql'])

In [29]:
# Count the rows where the two prediction columns match (True) and differ (False).
a.value_counts()

True     88
False    12
Name: count, dtype: int64

In [30]:
# Exact-match rate: the mean of the boolean match mask is the fraction of True rows.
# Computed from `a` rather than typed in by hand so it stays correct on re-runs.
score = float(a.mean())

In [31]:
# Display the match rate.
score

0.88