In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# from unsloth import FastLanguageModel
# import torch
# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "unsloth/Phi-3.5-mini-instruct",
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit,
#     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
# )

==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Vishal-Sagar/phi-text-to-sql-basic",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/179M [00:00<?, ?B/s]

Unsloth 2024.12.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 8,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

TypeError: Unsloth: Your model already has LoRA adapters. Your new parameters are different.

In [None]:
# !unzip lora_model.zip -d lora_model/

In [None]:
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit,
# )
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference


<a name="Data"></a>
### Data Prep
We now use the `Phi-3` format for conversation style finetunes. We use [Open Assistant conversations](https://huggingface.co/datasets/philschmid/guanaco-sharegpt-style) in ShareGPT style. Phi-3 renders multi turn conversations like below:

```
<|user|>
Hi!<|end|>
<|assistant|>
Hello! How are you?<|end|>
<|user|>
I'm doing great! And you?<|end|>

```

**[NOTE]** To train only on completions (ignoring the user's input) read Unsloth's docs [here](https://github.com/unslothai/unsloth/wiki#train-on-completions--responses-only-do-not-train-on-inputs).

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old` and our own optimized `unsloth` template.

Note ShareGPT uses `{"from": "human", "value" : "Hi"}` and not `{"role": "user", "content" : "Hi"}`, so we use `mapping` to map it.

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [None]:
# from datasets import load_dataset
# import json
# from datasets import Dataset

# def convert_to_sharegpt(text):
#     parts = text.split("###")
#     if len(parts) < 3:
#       return None
#     input_text = "".join(parts[:-2]) + parts[-2]
#     output_text = parts[-1]

#     # Remove \n from human response
#     input_text = input_text.replace('\n', ' ')

#     # Remove "Response:" from gpt response and keep only the SQL query
#     output_text = output_text.replace("Response: ", "")

#     # return [
#     #     {"from": "human", "value": input_text},
#     #     {"from": "gpt", "value": output_text},
#     # ]
#     return [
#         {"role": "user", "content": input_text},
#         {"role": "assistant", "content": output_text},
#     ]

# # Example usage
# dataset = load_dataset("Clinton/Text-to-sql-v1", split="train")

# # Convert each text to ShareGPT format and save as JSON
# sharegpt_data = []
# for item in dataset:
#     converted_data = convert_to_sharegpt(item["text"])
#     if converted_data:
#         sharegpt_data.append({"conversations": converted_data}) # Changed here

# dataset = Dataset.from_list(sharegpt_data)

In [4]:
from datasets import load_dataset
import json
from datasets import Dataset

def convert_to_sharegpt(example):

    instruction = f"""Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables.  Instruction: {example['question']}. Input: {example['context']}"""

    return [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": example['answer']}

    ]

# Example usage
dataset = load_dataset("b-mc2/sql-create-context", split="train")

# Convert each text to ShareGPT format and save as JSON
sharegpt_data = []
for item in dataset:
    converted_data = convert_to_sharegpt(item)
    if converted_data:
        sharegpt_data.append({"conversations": converted_data}) # Changed here

dataset = Dataset.from_list(sharegpt_data)

README.md:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

sql_create_context_v4.json:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

In [5]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass


dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/78577 [00:00<?, ? examples/s]

Let's see how the `Phi-3` format works by printing the 5th element

In [6]:
dataset[0]

{'conversations': [{'content': 'Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables.  Instruction: How many heads of the departments are older than 56 ?. Input: CREATE TABLE head (age INTEGER)',
   'role': 'user'},
  {'content': 'SELECT COUNT(*) FROM head WHERE age > 56',
   'role': 'assistant'}],
 'text': '<|user|>\nBelow are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables.  Instruction: How many heads of the departments are older than 56 ?. Input: CREATE TABLE head (age INTEGER)<|end|>\n<|assistant|>\nSELECT COUNT(*) FROM head WHERE age > 56<|end|>\n'}

In [7]:
dataset[5]["conversations"]

[{'content': 'Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables.  Instruction: What are the names of the heads who are born outside the California state?. Input: CREATE TABLE head (name VARCHAR, born_state VARCHAR)',
  'role': 'user'},
 {'content': "SELECT name FROM head WHERE born_state <> 'California'",
  'role': 'assistant'}]

In [8]:
print(dataset[5]["text"])

<|user|>
Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables.  Instruction: What are the names of the heads who are born outside the California state?. Input: CREATE TABLE head (name VARCHAR, born_state VARCHAR)<|end|>
<|assistant|>
SELECT name FROM head WHERE born_state <> 'California'<|end|>



If you're looking to make your own chat template, that also is possible! You must use the Jinja templating regime. We provide our own stripped down version of the `Unsloth template` which we find to be more efficient, and leverages ChatML, Zephyr and Alpaca styles.

More info on chat templates on [our wiki page!](https://github.com/unslothai/unsloth/wiki#chat-templates)

In [None]:
unsloth_template = \
    "{{ bos_token }}"\
    "{{ 'You are a helpful assistant to the user\n' }}"\
    "{% for message in messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: ' }}"\
    "{% endif %}"
unsloth_eos_token = "eos_token"

if False:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
        map_eos_token = True, # Maps <|im_end|> to </s> instead
    )

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 5,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        # num_train_epochs = 2,
        max_steps=1500,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! Since we're using `Phi-3`, use `apply_chat_template` with `add_generation_prompt` set to `True` for inference.

In [9]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

instruction = "Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables. "
qs = "List the states where both the secretary of 'Treasury' department and the secretary of 'Homeland Security' were born."
table_schema = "CREATE TABLE management (department_id VARCHAR, head_id VARCHAR); CREATE TABLE head (born_state VARCHAR, head_id VARCHAR); CREATE TABLE department (department_id VARCHAR, name VARCHAR)"

messages = [
    {"from": "human", "value": f"{instruction} Instruction: {qs} Input: {table_schema}"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["<|user|> Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables.  Instruction: List the states where both the secretary of 'Treasury' department and the secretary of 'Homeland Security' were born. Input: CREATE TABLE management (department_id VARCHAR, head_id VARCHAR); CREATE TABLE head (born_state VARCHAR, head_id VARCHAR); CREATE TABLE department (department_id VARCHAR, name VARCHAR)<|end|><|assistant|> To find the states where both the secretary of 'Treasury' department and the secretary of 'Homeland Security' were born, you can use the following SQL query:\n\n```sql\nSELECT t1.born_state\nFROM head AS t1\nJOIN management AS t2 ON t1."]

In [10]:
FastLanguageModel.for_inference(model)

# Function to perform inference on a single row
def infer_row(row):
    """
    Performs inference on a single row of the dataset.

    Args:
        row: A dictionary containing the 'conversations' and 'text' fields.

    Returns:
        A tuple containing the input, predicted output, and actual output.
    """
    instruction = row['conversations'][0]['content']


    messages = [
        {"from": "human", "value": instruction}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=256, use_cache=True)
    predicted_output = tokenizer.batch_decode(outputs)[0].split("<|end|>")[1]

    actual_output = row['conversations'][1]['content']

    return instruction, predicted_output, actual_output


results = []

# Iterate through the first 100 rows of the dataset
a = 0
for row in dataset:
    if a>100: break
    print(a)

    input_text, predicted_output, actual_output = infer_row(row)
    results.append({'input': input_text, 'predicted_output': predicted_output, 'actual_output': actual_output})

    a+=1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [11]:
cleaned_results = []
for result in results:
  predicted_output = result['predicted_output'].replace("<|assistant|> ", "")
  cleaned_result = {
      'input': result['input'],
      'predicted_output': predicted_output,
      'actual_output': result['actual_output']
  }
  cleaned_results.append(cleaned_result)

In [41]:
import re

def extract_sql_query(text):
    """
    Extracts SQL queries from text, handling code blocks and inline queries.

    Args:
        text: The input text.

    Returns:
        A list of extracted SQL queries, or an empty list if none are found.
    """
    sql_queries = []

    # Regex to find SQL code blocks (```sql ... ```)
    code_block_regex = r"```sql\s*(.*?)\s*```"
    code_block_matches = re.findall(code_block_regex, text, re.DOTALL | re.IGNORECASE)
    sql_queries.extend(code_block_matches)

    # Process extracted queries (remove newlines, add semicolon if needed)
    processed_queries = []
    for query in sql_queries:
        query = query.strip()
        query = query.replace('\n', ' ')
        query = query.replace('\r', ' ')
        # if query and not query.endswith(';'):  # Check for empty strings
        #     query += ';'
        if query: # only add if not empty string
          processed_queries.append(query)

    if not len(processed_queries):
        return text + ";"
    return processed_queries[0]


In [None]:
# extract_sql_query()

In [42]:
correct = 0
wrong = 0

for row in cleaned_results[:5]:
    print("raaaaaaaw : ", row['predicted_output'])
    processed_predicted = extract_sql_query(row['predicted_output'])
    print("predicted : ", processed_predicted)
    print("actual sql: ", row['actual_output'])
    print()
    # if row['predicted_output'] == row['actual_output']+";" or row['predicted_output'] == row['actual_output']:
    #     correct+=1
    # else:
    #   wrong += 1
    #   print(row)
    #   print()

raaaaaaaw :  To answer the question, we need to write an SQL query that calculates the number of heads in the "head" table whose age is greater than 56. Here is the SQL query:

```sql
SELECT COUNT(*) FROM head WHERE age > 56;
```

This query uses the COUNT(*) function to count the number of rows that satisfy the condition where the age is greater than 56. The result will be a single number representing the count of heads older than 56.
predicted :  SELECT COUNT(*) FROM head WHERE age > 56;
actual sql:  SELECT COUNT(*) FROM head WHERE age > 56

raaaaaaaw :  To fulfill the given task, we can write an SQL query that selects the name, born state, and age of the heads of departments, and then orders the results by age. Here is the SQL query:

```sql
SELECT name, born_state, age
FROM head
ORDER BY age;
```

This query will return the name, born state, and age of the heads of departments, sorted by age in ascending order.
predicted :  SELECT name, born_state, age FROM head ORDER BY age;
actua

In [19]:
wrong

97

Input: Some text before 'SELECT * FROM users;' and some after.
Output: None
--------------------
Input: No SQL here.
Output: None
--------------------
Input: Another example with "UPDATE products SET price = 100 WHERE id = 1"; more text.
Output: None
--------------------
Input: A more complex SQL query: 
SELECT * 
FROM orders
WHERE date > '2023-10-26';
Output: SELECT * FROM ordersWHERE date > '2023-10-26';
--------------------
Input: SELECT * FROM mytable -- this is a comment
Output: SELECT * FROM mytable -- this is a comment;
--------------------
Input: /* This is a 
 multi-line 
 comment */ SELECT * from test;
Output: SELECT * from test;
--------------------
Input: `SELECT * FROM test`;
Output: None
--------------------
Input: 
    Some text
    SELECT * FROM users
    WHERE name = 'O'Reilly';
    More text
    
Output: SELECT * FROM users    WHERE name = 'O'Reilly';
--------------------
Input: SELECT * FROM users WHERE name = "O'Reilly";
Output: SELECT * FROM users WHERE name = "O'R

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
from google.colab import userdata

In [None]:
# model.save_pretrained("lora_model") # Local saving
# tokenizer.save_pretrained("lora_model")
model.push_to_hub("Vishal-Sagar/phi-text-to-sql-basic", token = userdata.get('HF_TOKEN')) # Online saving
tokenizer.push_to_hub("Vishal-Sagar/phi-text-to-sql-basic", token = userdata.get('HF_TOKEN')) # Online saving

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/239M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Vishal-Sagar/phi-text-to-sql-basic


Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
# prompt: make zip folder of lora_model and download it

!zip -r lora_model.zip lora_model
!ls
from google.colab import files
files.download("lora_model.zip")

  adding: lora_model/ (stored 0%)
  adding: lora_model/adapter_config.json (deflated 56%)
  adding: lora_model/tokenizer.json (deflated 85%)
  adding: lora_model/README.md (deflated 66%)
  adding: lora_model/tokenizer_config.json (deflated 84%)
  adding: lora_model/tokenizer.model (deflated 55%)
  adding: lora_model/special_tokens_map.json (deflated 76%)
  adding: lora_model/added_tokens.json (deflated 62%)
  adding: lora_model/adapter_model.safetensors (deflated 8%)
huggingface_tokenizers_cache  lora_model  lora_model.zip  outputs  sample_data


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
!pip install databench-eval

Collecting databench-eval
  Downloading databench_eval-2.1.1-py3-none-any.whl.metadata (7.0 kB)
Downloading databench_eval-2.1.1-py3-none-any.whl (7.8 kB)
Installing collected packages: databench-eval
Successfully installed databench-eval-2.1.1


In [None]:
import pandas as pd
from databench_eval import Evaluator
import ast


# Initialize the evaluator
evaluator = Evaluator()

# Load the CSV files
dev_set = pd.read_csv("data/qwen_output.csv")
correct = 0
error = 0
wrong_queries = []
error_queries = []
correct_queries = []
new_dev = pd.read_csv("data/train_set.csv")


for index, row in dev_set.head(100).iterrows():
    try:
        question = row["question"]
        print("Question:", row['question'])
        dataset = row["dataset"]
        answer = new_dev[new_dev['question'] == question]["answer"].values
        answer = answer[0]

        df = pd.read_csv(f"data/datasets2/{dataset}.csv")

        query = row['queries']
        print("Query:", query)
        predicted_answer = eval(query, {"df": df, "pd": pd})

        print("Our answer:", predicted_answer)
        print("Actual answer:", answer)

        semantic_type = row['type']
        result = evaluator.compare(value=predicted_answer, truth=answer, semantic=semantic_type)
        print("Equal or not:", result, end="\n\n")

        if result == True:
            correct += 1
            correct_queries.append(row.to_dict())

        else:
            wrong_queries.append(row.to_dict())

    except Exception as e:
        print("Error query: ", row['query'])
        error+=1
        error_queries.append(row.to_dict())
        print(e, end="\n\n\n")
        continue

print("Error: ", error)
print("Correct: ", correct)
print("Giving wrong answer: ", len(wrong_queries))
accuracy = correct/100
print(accuracy)


correct_queries_df = pd.DataFrame(correct_queries)
correct_queries_df.to_csv("data/correct_queries_from_wrong.csv", index=False)
