# LIBRARIES AND SETUP



In [None]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [None]:
%%capture
%pip install wandb

In [None]:
from huggingface_hub import login
import wandb


hf_token = "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"

login(token = hf_token)

wb_token = "094590d0aa8813c0cc044d53c48dbf393da80d96"

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3 8B TEXT-TO-SQL',
    job_type="training",
    anonymous="allow"
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mmaatvo[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Lora ADAPTERS

We now add LoRA adapters to reduce the number of parameters to update

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 256, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 256,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Load the Dataset

In [None]:
my_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["sql_prompt"]
    inputs       = examples["sql_context"]
    outputs      = examples["sql"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = my_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("gretelai/synthetic_text_to_sql", split = "train")
dataset_test = load_dataset("gretelai/synthetic_text_to_sql", split = 'test')
dataset = dataset.map(formatting_prompts_func, batched = True,)
dataset_test = dataset_test.map(formatting_prompts_func, batched = True,)

# Filter the dataset for rows where sql_task_type is 'analytics and reporting'
dataset = dataset.filter(lambda example: example["sql_task_type"] == "analytics and reporting")
dataset_test = dataset_test.filter(lambda example: example["sql_task_type"] == "analytics and reporting")

print('Dataset train examples after filtering: ', len(dataset), 'Dataset test examples after filtering: ', len(dataset_test))

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5851 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5851 [00:00<?, ? examples/s]

Dataset train examples after filtering:  88186 Dataset test examples after filtering:  5148


In [None]:
dataset_test

Dataset({
    features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation', 'text'],
    num_rows: 5851
})

# TRAINING SETUP

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from transformers import EarlyStoppingCallback


early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,  # Number of evaluations to wait before stopping
    early_stopping_threshold=0.0,  # Minimum change to qualify as an improvement
)



trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset = dataset_test.select(range(50)),
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=300,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        evaluation_strategy="steps",  # Perform evaluation during training
        eval_steps=10,  # Evaluate every 10 steps
        save_steps=60,  # Save the model every 10 steps
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="wandb",
        load_best_model_at_end=True,  # Load the best model at the end of training
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # Whether higher metric values are better (set False for loss)
    ),
    callbacks=[early_stopping_callback],  # Add the early stopping callback here
)




Map (num_proc=2):   0%|          | 0/88186 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


# Training

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
8.332 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 88,186 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 180
 "-____-"     Number of trainable parameters = 671,088,640


Step,Training Loss,Validation Loss
10,0.6199,0.517685
20,0.6178,0.518018
30,0.4454,0.517114
40,0.5404,0.496605
50,0.5224,0.494544
60,0.552,0.492135
70,0.4978,0.48391
80,0.5088,0.488577
90,0.4802,0.479506
100,0.5025,0.467338


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1689.4747 seconds used for training.
28.16 minutes used for training.
Peak reserved memory = 12.928 GB.
Peak reserved memory for training = 4.596 GB.
Peak reserved memory % of max memory = 87.659 %.
Peak reserved memory for training % of max memory = 31.164 %.


# Inference

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    my_prompt.format(
        , # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

By using TextStreamer we are able to see the generation token by token

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    my_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

# Evaluation

In [None]:
import re
import random
import sqlparse
import difflib
from tqdm import tqdm
from unsloth import FastLanguageModel

range_inference = 300 # Choose any number!
accuracy = 0
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def extract_sql_statement(response_text):
    """
    Extract the SQL statement from the response text.
    """
    # Define the regex pattern to extract the SQL statement
    pattern = r'### Response:\n(.*?);'
    match = re.search(pattern, response_text, re.DOTALL)
    if match:
        # Return the extracted SQL statement
        return match.group(1).strip()
    return ""

def format_sql(sql):
    """
    Format SQL queries to a standard format.
    """
    return sqlparse.format(sql.strip(), reindent=True, keyword_case='upper')

def evaluate_similarity(generated_response, ground_truth_sql):
    """
    Evaluate the similarity between the generated response and ground truth SQL.
    """
    # Extract SQL statement from the generated response
    generated_sql = extract_sql_statement(generated_response)

    # Format both SQL queries for comparison
    formatted_generated_sql = format_sql(generated_sql)
    formatted_ground_truth = format_sql(ground_truth_sql)

    # Calculate similarity score
    similarity = difflib.SequenceMatcher(None, formatted_generated_sql, formatted_ground_truth).ratio()
    return similarity

# Process each example in the test dataset
for example in tqdm(dataset_test.shuffle(seed=154).select(range(range_inference)), desc="Evaluating"):
    # Create the input prompt
    input_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{example['sql_prompt']}

### Input:
{example['sql_context']}

### Response:
"""

    # Tokenize and prepare the input
    inputs = tokenizer([input_prompt], return_tensors="pt").to("cuda")

    # Generate the output SQL query
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    generated_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Evaluate the similarity
    ground_truth_sql = example['sql']
    similarity_score = evaluate_similarity(generated_response, ground_truth_sql)
    print(f"Similarity Score: {similarity_score}")
    # Assuming a similarity threshold of 0.8 for correctness
    if similarity_score > 0.8:
        accuracy += 1
    accuracy_percentage = (accuracy / range_inference) * 100
    print(f"Model accuracy on test set: {accuracy_percentage:.2f}%")

# Calculate and print overall accuracy
accuracy_percentage = (accuracy / range_inference) * 100
print(f"Model accuracy on test set: {accuracy_percentage:.2f}%")


Evaluating:   0%|          | 1/300 [00:03<19:01,  3.82s/it]

Similarity Score: 0.715261958997722
Model accuracy on test set: 0.00%


Evaluating:   1%|          | 2/300 [00:05<12:29,  2.52s/it]

Similarity Score: 0.8652482269503546
Model accuracy on test set: 0.33%


Evaluating:   1%|          | 3/300 [00:06<09:30,  1.92s/it]

Similarity Score: 0.9404761904761905
Model accuracy on test set: 0.67%


Evaluating:   1%|▏         | 4/300 [00:08<09:33,  1.94s/it]

Similarity Score: 0.6640316205533597
Model accuracy on test set: 0.67%


Evaluating:   2%|▏         | 5/300 [00:12<13:41,  2.78s/it]

Similarity Score: 0.5014577259475219
Model accuracy on test set: 0.67%


Evaluating:   2%|▏         | 6/300 [00:14<11:30,  2.35s/it]

Similarity Score: 0.9941520467836257
Model accuracy on test set: 1.00%


Evaluating:   2%|▏         | 7/300 [00:17<12:56,  2.65s/it]

Similarity Score: 0.8466666666666667
Model accuracy on test set: 1.33%


Evaluating:   3%|▎         | 8/300 [00:19<12:05,  2.48s/it]

Similarity Score: 0.8018867924528302
Model accuracy on test set: 1.67%


Evaluating:   3%|▎         | 9/300 [00:21<10:43,  2.21s/it]

Similarity Score: 0.6524064171122995
Model accuracy on test set: 1.67%


Evaluating:   3%|▎         | 10/300 [00:23<10:35,  2.19s/it]

Similarity Score: 0.9949748743718593
Model accuracy on test set: 2.00%


Evaluating:   4%|▎         | 11/300 [00:26<12:08,  2.52s/it]

Similarity Score: 0.547085201793722
Model accuracy on test set: 2.00%


Evaluating:   4%|▍         | 12/300 [00:31<14:38,  3.05s/it]

Similarity Score: 0.9294403892944039
Model accuracy on test set: 2.33%


Evaluating:   4%|▍         | 13/300 [00:32<12:38,  2.64s/it]

Similarity Score: 0.995260663507109
Model accuracy on test set: 2.67%


Evaluating:   5%|▍         | 14/300 [00:34<10:48,  2.27s/it]

Similarity Score: 0.5934065934065934
Model accuracy on test set: 2.67%


Evaluating:   5%|▌         | 15/300 [00:35<09:40,  2.04s/it]

Similarity Score: 0.9947089947089947
Model accuracy on test set: 3.00%


Evaluating:   5%|▌         | 16/300 [00:37<08:43,  1.84s/it]

Similarity Score: 0.98
Model accuracy on test set: 3.33%


Evaluating:   6%|▌         | 17/300 [00:39<08:48,  1.87s/it]

Similarity Score: 0.9938650306748467
Model accuracy on test set: 3.67%


Evaluating:   6%|▌         | 18/300 [00:41<09:52,  2.10s/it]

Similarity Score: 0.8856088560885609
Model accuracy on test set: 4.00%


Evaluating:   6%|▋         | 19/300 [00:44<10:26,  2.23s/it]

Similarity Score: 0.8562691131498471
Model accuracy on test set: 4.33%


Evaluating:   7%|▋         | 20/300 [00:46<10:18,  2.21s/it]

Similarity Score: 0.7983193277310925
Model accuracy on test set: 4.33%


Evaluating:   7%|▋         | 21/300 [00:47<09:00,  1.94s/it]

Similarity Score: 0.9925925925925926
Model accuracy on test set: 4.67%


Evaluating:   7%|▋         | 22/300 [00:50<10:17,  2.22s/it]

Similarity Score: 0.851063829787234
Model accuracy on test set: 5.00%


Evaluating:   8%|▊         | 23/300 [00:53<11:10,  2.42s/it]

Similarity Score: 0.8060453400503779
Model accuracy on test set: 5.33%


Evaluating:   8%|▊         | 24/300 [00:55<10:05,  2.20s/it]

Similarity Score: 0.8692579505300353
Model accuracy on test set: 5.67%


Evaluating:   8%|▊         | 25/300 [00:57<09:59,  2.18s/it]

Similarity Score: 0.6265060240963856
Model accuracy on test set: 5.67%


Evaluating:   9%|▊         | 26/300 [00:59<09:54,  2.17s/it]

Similarity Score: 0.5623003194888179
Model accuracy on test set: 5.67%


Evaluating:   9%|▉         | 27/300 [01:00<09:04,  2.00s/it]

Similarity Score: 0.8095238095238095
Model accuracy on test set: 6.00%


Evaluating:   9%|▉         | 28/300 [01:02<08:53,  1.96s/it]

Similarity Score: 0.9945945945945946
Model accuracy on test set: 6.33%


Evaluating:  10%|▉         | 29/300 [01:04<08:59,  1.99s/it]

Similarity Score: 0.6909090909090909
Model accuracy on test set: 6.33%


Evaluating:  10%|█         | 30/300 [01:07<10:13,  2.27s/it]

Similarity Score: 0.851063829787234
Model accuracy on test set: 6.67%


Evaluating:  10%|█         | 31/300 [01:09<09:01,  2.01s/it]

Similarity Score: 0.7116564417177914
Model accuracy on test set: 6.67%


Evaluating:  11%|█         | 32/300 [01:11<08:49,  1.98s/it]

Similarity Score: 0.8297872340425532
Model accuracy on test set: 7.00%


Evaluating:  11%|█         | 33/300 [01:14<10:02,  2.26s/it]

Similarity Score: 0.7785547785547785
Model accuracy on test set: 7.00%


Evaluating:  11%|█▏        | 34/300 [01:15<08:26,  1.90s/it]

Similarity Score: 0.9885057471264368
Model accuracy on test set: 7.33%


Evaluating:  12%|█▏        | 35/300 [01:17<09:18,  2.11s/it]

Similarity Score: 0.8545454545454545
Model accuracy on test set: 7.67%


Evaluating:  12%|█▏        | 36/300 [01:19<09:23,  2.13s/it]

Similarity Score: 0.9795918367346939
Model accuracy on test set: 8.00%


Evaluating:  12%|█▏        | 37/300 [01:22<09:32,  2.18s/it]

Similarity Score: 0.8896321070234113
Model accuracy on test set: 8.33%


Evaluating:  13%|█▎        | 38/300 [01:24<09:32,  2.19s/it]

Similarity Score: 0.9028213166144201
Model accuracy on test set: 8.67%


Evaluating:  13%|█▎        | 39/300 [01:28<12:21,  2.84s/it]

Similarity Score: 0.631762652705061
Model accuracy on test set: 8.67%


Evaluating:  13%|█▎        | 40/300 [01:32<13:01,  3.01s/it]

Similarity Score: 0.6816479400749064
Model accuracy on test set: 8.67%


Evaluating:  14%|█▎        | 41/300 [01:33<10:29,  2.43s/it]

Similarity Score: 0.990990990990991
Model accuracy on test set: 9.00%


Evaluating:  14%|█▍        | 42/300 [01:34<09:16,  2.16s/it]

Similarity Score: 0.0
Model accuracy on test set: 9.00%


Evaluating:  14%|█▍        | 43/300 [01:36<08:27,  1.97s/it]

Similarity Score: 0.9936305732484076
Model accuracy on test set: 9.33%


Evaluating:  15%|█▍        | 44/300 [01:38<08:35,  2.01s/it]

Similarity Score: 0.88
Model accuracy on test set: 9.67%


Evaluating:  15%|█▌        | 45/300 [01:39<07:52,  1.85s/it]

Similarity Score: 0.6875
Model accuracy on test set: 9.67%


Evaluating:  15%|█▌        | 46/300 [01:44<11:53,  2.81s/it]

Similarity Score: 0.8162162162162162
Model accuracy on test set: 10.00%


Evaluating:  16%|█▌        | 47/300 [01:45<09:35,  2.27s/it]

Similarity Score: 0.5529953917050692
Model accuracy on test set: 10.00%


Evaluating:  16%|█▌        | 48/300 [01:47<09:00,  2.14s/it]

Similarity Score: 0.6796116504854369
Model accuracy on test set: 10.00%


Evaluating:  16%|█▋        | 49/300 [01:50<09:42,  2.32s/it]

Similarity Score: 0.456973293768546
Model accuracy on test set: 10.00%


Evaluating:  17%|█▋        | 50/300 [01:54<11:12,  2.69s/it]

Similarity Score: 0.7728155339805826
Model accuracy on test set: 10.00%


Evaluating:  17%|█▋        | 51/300 [01:57<12:36,  3.04s/it]

Similarity Score: 0.452755905511811
Model accuracy on test set: 10.00%


Evaluating:  17%|█▋        | 52/300 [01:58<10:04,  2.44s/it]

Similarity Score: 0.9914529914529915
Model accuracy on test set: 10.33%


Evaluating:  18%|█▊        | 53/300 [02:01<10:11,  2.48s/it]

Similarity Score: 0.10606060606060606
Model accuracy on test set: 10.33%


Evaluating:  18%|█▊        | 54/300 [02:03<09:05,  2.22s/it]

Similarity Score: 0.7358490566037735
Model accuracy on test set: 10.33%


Evaluating:  18%|█▊        | 55/300 [02:04<08:01,  1.96s/it]

Similarity Score: 0.6617100371747212
Model accuracy on test set: 10.33%


Evaluating:  19%|█▊        | 56/300 [02:05<06:37,  1.63s/it]

Similarity Score: 0.9908256880733946
Model accuracy on test set: 10.67%


Evaluating:  19%|█▉        | 57/300 [02:06<06:20,  1.57s/it]

Similarity Score: 0.9940119760479041
Model accuracy on test set: 11.00%


Evaluating:  19%|█▉        | 58/300 [02:10<08:30,  2.11s/it]

Similarity Score: 0.41297935103244837
Model accuracy on test set: 11.00%


Evaluating:  20%|█▉        | 59/300 [02:11<07:28,  1.86s/it]

Similarity Score: 0.9115646258503401
Model accuracy on test set: 11.33%


Evaluating:  20%|██        | 60/300 [02:15<10:31,  2.63s/it]

Similarity Score: 0.0
Model accuracy on test set: 11.33%


Evaluating:  20%|██        | 61/300 [02:16<08:25,  2.12s/it]

Similarity Score: 0.9873417721518988
Model accuracy on test set: 11.67%


Evaluating:  21%|██        | 62/300 [02:18<08:31,  2.15s/it]

Similarity Score: 0.8961038961038961
Model accuracy on test set: 12.00%


Evaluating:  21%|██        | 63/300 [02:22<09:46,  2.48s/it]

Similarity Score: 0.995475113122172
Model accuracy on test set: 12.33%


Evaluating:  21%|██▏       | 64/300 [02:24<09:08,  2.33s/it]

Similarity Score: 0.993006993006993
Model accuracy on test set: 12.67%


Evaluating:  22%|██▏       | 65/300 [02:26<08:52,  2.27s/it]

Similarity Score: 0.8444444444444444
Model accuracy on test set: 13.00%


Evaluating:  22%|██▏       | 66/300 [02:28<08:41,  2.23s/it]

Similarity Score: 0.7756653992395437
Model accuracy on test set: 13.00%


Evaluating:  22%|██▏       | 67/300 [02:30<08:15,  2.13s/it]

Similarity Score: 0.994535519125683
Model accuracy on test set: 13.33%


Evaluating:  23%|██▎       | 68/300 [02:31<07:25,  1.92s/it]

Similarity Score: 0.8870292887029289
Model accuracy on test set: 13.67%


Evaluating:  23%|██▎       | 69/300 [02:33<06:38,  1.73s/it]

Similarity Score: 0.9904761904761905
Model accuracy on test set: 14.00%


Evaluating:  23%|██▎       | 70/300 [02:36<09:06,  2.38s/it]

Similarity Score: 0.7401129943502824
Model accuracy on test set: 14.00%


Evaluating:  24%|██▎       | 71/300 [02:40<10:16,  2.69s/it]

Similarity Score: 0.7219512195121951
Model accuracy on test set: 14.00%


Evaluating:  24%|██▍       | 72/300 [02:41<08:38,  2.27s/it]

Similarity Score: 0.9922480620155039
Model accuracy on test set: 14.33%


Evaluating:  24%|██▍       | 73/300 [02:43<08:03,  2.13s/it]

Similarity Score: 0.6188340807174888
Model accuracy on test set: 14.33%


Evaluating:  25%|██▍       | 74/300 [02:46<09:31,  2.53s/it]

Similarity Score: 0.9733333333333334
Model accuracy on test set: 14.67%


Evaluating:  25%|██▌       | 75/300 [02:49<08:58,  2.39s/it]

Similarity Score: 0.9939393939393939
Model accuracy on test set: 15.00%


Evaluating:  25%|██▌       | 76/300 [02:51<09:04,  2.43s/it]

Similarity Score: 0.5573122529644269
Model accuracy on test set: 15.00%


Evaluating:  26%|██▌       | 77/300 [02:56<11:18,  3.04s/it]

Similarity Score: 0.0
Model accuracy on test set: 15.00%


Evaluating:  26%|██▌       | 78/300 [02:58<10:20,  2.80s/it]

Similarity Score: 0.6751054852320675
Model accuracy on test set: 15.00%


Evaluating:  26%|██▋       | 79/300 [03:01<10:41,  2.90s/it]

Similarity Score: 0.7854545454545454
Model accuracy on test set: 15.00%


Evaluating:  27%|██▋       | 80/300 [03:03<09:26,  2.57s/it]

Similarity Score: 0.7258064516129032
Model accuracy on test set: 15.00%


Evaluating:  27%|██▋       | 81/300 [03:05<08:49,  2.42s/it]

Similarity Score: 0.8020833333333334
Model accuracy on test set: 15.33%


Evaluating:  27%|██▋       | 82/300 [03:06<07:37,  2.10s/it]

Similarity Score: 0.5735294117647058
Model accuracy on test set: 15.33%


Evaluating:  28%|██▊       | 83/300 [03:08<07:00,  1.94s/it]

Similarity Score: 0.8478260869565217
Model accuracy on test set: 15.67%


Evaluating:  28%|██▊       | 84/300 [03:10<07:34,  2.10s/it]

Similarity Score: 0.7863777089783281
Model accuracy on test set: 15.67%


Evaluating:  28%|██▊       | 85/300 [03:15<10:46,  3.01s/it]

Similarity Score: 0.49865229110512127
Model accuracy on test set: 15.67%


Evaluating:  29%|██▊       | 86/300 [03:18<10:12,  2.86s/it]

Similarity Score: 0.8698630136986302
Model accuracy on test set: 16.00%


Evaluating:  29%|██▉       | 87/300 [03:20<09:36,  2.71s/it]

Similarity Score: 0.5992217898832685
Model accuracy on test set: 16.00%


Evaluating:  29%|██▉       | 88/300 [03:24<10:58,  3.11s/it]

Similarity Score: 0.657258064516129
Model accuracy on test set: 16.00%


Evaluating:  30%|██▉       | 89/300 [03:27<11:02,  3.14s/it]

Similarity Score: 0.46788990825688076
Model accuracy on test set: 16.00%


Evaluating:  30%|███       | 90/300 [03:31<11:03,  3.16s/it]

Similarity Score: 0.7765237020316027
Model accuracy on test set: 16.00%


Evaluating:  30%|███       | 91/300 [03:33<09:49,  2.82s/it]

Similarity Score: 0.759493670886076
Model accuracy on test set: 16.00%


Evaluating:  31%|███       | 92/300 [03:35<09:13,  2.66s/it]

Similarity Score: 0.976
Model accuracy on test set: 16.33%


Evaluating:  31%|███       | 93/300 [03:38<09:27,  2.74s/it]

Similarity Score: 0.7381443298969073
Model accuracy on test set: 16.33%


Evaluating:  31%|███▏      | 94/300 [03:39<07:57,  2.32s/it]

Similarity Score: 0.9894736842105263
Model accuracy on test set: 16.67%


Evaluating:  32%|███▏      | 95/300 [03:41<07:27,  2.18s/it]

Similarity Score: 0.9927007299270073
Model accuracy on test set: 17.00%


Evaluating:  32%|███▏      | 96/300 [03:43<07:07,  2.10s/it]

Similarity Score: 0.7945205479452054
Model accuracy on test set: 17.00%


Evaluating:  32%|███▏      | 97/300 [03:46<07:40,  2.27s/it]

Similarity Score: 0.9963898916967509
Model accuracy on test set: 17.33%


Evaluating:  33%|███▎      | 98/300 [03:47<07:08,  2.12s/it]

Similarity Score: 0.9942196531791907
Model accuracy on test set: 17.67%


Evaluating:  33%|███▎      | 99/300 [03:50<07:08,  2.13s/it]

Similarity Score: 0.994413407821229
Model accuracy on test set: 18.00%


Evaluating:  33%|███▎      | 100/300 [03:53<08:45,  2.63s/it]

Similarity Score: 0.8900523560209425
Model accuracy on test set: 18.33%


Evaluating:  34%|███▎      | 101/300 [03:55<07:58,  2.40s/it]

Similarity Score: 0.8700564971751412
Model accuracy on test set: 18.67%


Evaluating:  34%|███▍      | 102/300 [03:57<07:09,  2.17s/it]

Similarity Score: 0.9927007299270073
Model accuracy on test set: 19.00%


Evaluating:  34%|███▍      | 103/300 [03:58<06:21,  1.93s/it]

Similarity Score: 0.9135802469135802
Model accuracy on test set: 19.33%


Evaluating:  35%|███▍      | 104/300 [04:00<05:58,  1.83s/it]

Similarity Score: 0.9937888198757764
Model accuracy on test set: 19.67%


Evaluating:  35%|███▌      | 105/300 [04:02<05:54,  1.82s/it]

Similarity Score: 0.9953917050691244
Model accuracy on test set: 20.00%


Evaluating:  35%|███▌      | 106/300 [04:03<05:15,  1.63s/it]

Similarity Score: 0.8807339449541285
Model accuracy on test set: 20.33%


Evaluating:  36%|███▌      | 107/300 [04:05<06:12,  1.93s/it]

Similarity Score: 0.8447653429602888
Model accuracy on test set: 20.67%


Evaluating:  36%|███▌      | 108/300 [04:09<07:21,  2.30s/it]

Similarity Score: 0.8086642599277978
Model accuracy on test set: 21.00%


Evaluating:  36%|███▋      | 109/300 [04:10<06:48,  2.14s/it]

Similarity Score: 0.7973856209150327
Model accuracy on test set: 21.00%


Evaluating:  37%|███▋      | 110/300 [04:15<08:53,  2.81s/it]

Similarity Score: 0.2276707530647986
Model accuracy on test set: 21.00%


Evaluating:  37%|███▋      | 111/300 [04:18<09:15,  2.94s/it]

Similarity Score: 0.8035190615835777
Model accuracy on test set: 21.33%


Evaluating:  37%|███▋      | 112/300 [04:21<08:55,  2.85s/it]

Similarity Score: 0.9937106918238994
Model accuracy on test set: 21.67%


Evaluating:  38%|███▊      | 113/300 [04:23<08:27,  2.72s/it]

Similarity Score: 0.6266666666666667
Model accuracy on test set: 21.67%


Evaluating:  38%|███▊      | 114/300 [04:25<08:08,  2.63s/it]

Similarity Score: 0.6011904761904762
Model accuracy on test set: 21.67%


Evaluating:  38%|███▊      | 115/300 [04:27<07:16,  2.36s/it]

Similarity Score: 0.8847926267281107
Model accuracy on test set: 22.00%


Evaluating:  39%|███▊      | 116/300 [04:29<06:51,  2.24s/it]

Similarity Score: 0.8810572687224669
Model accuracy on test set: 22.33%


Evaluating:  39%|███▉      | 117/300 [04:30<05:52,  1.92s/it]

Similarity Score: 0.5755395683453237
Model accuracy on test set: 22.33%


Evaluating:  39%|███▉      | 118/300 [04:33<06:10,  2.03s/it]

Similarity Score: 0.5321100917431193
Model accuracy on test set: 22.33%


Evaluating:  40%|███▉      | 119/300 [04:35<06:04,  2.01s/it]

Similarity Score: 0.6443514644351465
Model accuracy on test set: 22.33%


Evaluating:  40%|████      | 120/300 [04:37<06:23,  2.13s/it]

Similarity Score: 0.6814814814814815
Model accuracy on test set: 22.33%


Evaluating:  40%|████      | 121/300 [04:39<06:28,  2.17s/it]

Similarity Score: 0.9484536082474226
Model accuracy on test set: 22.67%


Evaluating:  41%|████      | 122/300 [04:41<05:40,  1.91s/it]

Similarity Score: 0.9915966386554622
Model accuracy on test set: 23.00%


Evaluating:  41%|████      | 123/300 [04:44<07:17,  2.47s/it]

Similarity Score: 0.27677100494233936
Model accuracy on test set: 23.00%


Evaluating:  41%|████▏     | 124/300 [04:49<08:59,  3.07s/it]

Similarity Score: 0.3062730627306273
Model accuracy on test set: 23.00%


Evaluating:  42%|████▏     | 125/300 [04:51<08:07,  2.79s/it]

Similarity Score: 0.9370629370629371
Model accuracy on test set: 23.33%


Evaluating:  42%|████▏     | 126/300 [04:54<08:01,  2.77s/it]

Similarity Score: 0.9447852760736196
Model accuracy on test set: 23.67%


Evaluating:  42%|████▏     | 127/300 [04:56<08:03,  2.80s/it]

Similarity Score: 0.9724137931034482
Model accuracy on test set: 24.00%


Evaluating:  43%|████▎     | 128/300 [05:00<08:47,  3.06s/it]

Similarity Score: 0.5775401069518716
Model accuracy on test set: 24.00%


Evaluating:  43%|████▎     | 129/300 [05:02<07:57,  2.79s/it]

Similarity Score: 0.8436018957345972
Model accuracy on test set: 24.33%


Evaluating:  43%|████▎     | 130/300 [05:03<06:25,  2.27s/it]

Similarity Score: 0.989247311827957
Model accuracy on test set: 24.67%


Evaluating:  44%|████▎     | 131/300 [05:05<05:58,  2.12s/it]

Similarity Score: 0.9941520467836257
Model accuracy on test set: 25.00%


Evaluating:  44%|████▍     | 132/300 [05:07<05:48,  2.07s/it]

Similarity Score: 0.9948186528497409
Model accuracy on test set: 25.33%


Evaluating:  44%|████▍     | 133/300 [05:10<06:06,  2.19s/it]

Similarity Score: 0.6964285714285714
Model accuracy on test set: 25.33%


Evaluating:  45%|████▍     | 134/300 [05:12<05:51,  2.12s/it]

Similarity Score: 0.6354166666666666
Model accuracy on test set: 25.33%


Evaluating:  45%|████▌     | 135/300 [05:14<05:50,  2.12s/it]

Similarity Score: 0.41358024691358025
Model accuracy on test set: 25.33%


Evaluating:  45%|████▌     | 136/300 [05:16<06:17,  2.30s/it]

Similarity Score: 0.7050847457627119
Model accuracy on test set: 25.33%


Evaluating:  46%|████▌     | 137/300 [05:19<06:15,  2.31s/it]

Similarity Score: 0.9105058365758755
Model accuracy on test set: 25.67%


Evaluating:  46%|████▌     | 138/300 [05:20<05:14,  1.94s/it]

Similarity Score: 0.7692307692307693
Model accuracy on test set: 25.67%


Evaluating:  46%|████▋     | 139/300 [05:22<05:37,  2.10s/it]

Similarity Score: 0.9575757575757575
Model accuracy on test set: 26.00%


Evaluating:  47%|████▋     | 140/300 [05:23<04:40,  1.75s/it]

Similarity Score: 0.9896907216494846
Model accuracy on test set: 26.33%


Evaluating:  47%|████▋     | 141/300 [05:27<05:55,  2.23s/it]

Similarity Score: 0.6211180124223602
Model accuracy on test set: 26.33%


Evaluating:  47%|████▋     | 142/300 [05:31<07:44,  2.94s/it]

Similarity Score: 0.0
Model accuracy on test set: 26.33%


Evaluating:  48%|████▊     | 143/300 [05:34<07:39,  2.92s/it]

Similarity Score: 0.805668016194332
Model accuracy on test set: 26.67%


Evaluating:  48%|████▊     | 144/300 [05:38<08:13,  3.16s/it]

Similarity Score: 0.8467532467532467
Model accuracy on test set: 27.00%


Evaluating:  48%|████▊     | 145/300 [05:40<07:37,  2.95s/it]

Similarity Score: 0.7452471482889734
Model accuracy on test set: 27.00%


Evaluating:  49%|████▊     | 146/300 [05:43<07:27,  2.91s/it]

Similarity Score: 0.9090909090909091
Model accuracy on test set: 27.33%


Evaluating:  49%|████▉     | 147/300 [05:46<07:30,  2.94s/it]

Similarity Score: 0.7076023391812866
Model accuracy on test set: 27.33%


Evaluating:  49%|████▉     | 148/300 [05:47<06:04,  2.40s/it]

Similarity Score: 0.9824561403508771
Model accuracy on test set: 27.67%


Evaluating:  50%|████▉     | 149/300 [05:50<06:27,  2.57s/it]

Similarity Score: 0.5305039787798409
Model accuracy on test set: 27.67%


Evaluating:  50%|█████     | 150/300 [05:52<06:10,  2.47s/it]

Similarity Score: 0.40771349862258954
Model accuracy on test set: 27.67%


Evaluating:  50%|█████     | 151/300 [05:54<05:27,  2.20s/it]

Similarity Score: 0.9289617486338798
Model accuracy on test set: 28.00%


Evaluating:  51%|█████     | 152/300 [05:56<05:34,  2.26s/it]

Similarity Score: 0.9592476489028213
Model accuracy on test set: 28.33%


Evaluating:  51%|█████     | 153/300 [05:58<05:13,  2.13s/it]

Similarity Score: 0.83
Model accuracy on test set: 28.67%


Evaluating:  51%|█████▏    | 154/300 [06:01<05:20,  2.20s/it]

Similarity Score: 0.6267605633802817
Model accuracy on test set: 28.67%


Evaluating:  52%|█████▏    | 155/300 [06:04<06:08,  2.54s/it]

Similarity Score: 0.8034188034188035
Model accuracy on test set: 29.00%


Evaluating:  52%|█████▏    | 156/300 [06:09<07:50,  3.27s/it]

Similarity Score: 0.7932203389830509
Model accuracy on test set: 29.00%


Evaluating:  52%|█████▏    | 157/300 [06:12<07:27,  3.13s/it]

Similarity Score: 0.8275862068965517
Model accuracy on test set: 29.33%


Evaluating:  53%|█████▎    | 158/300 [06:13<06:12,  2.62s/it]

Similarity Score: 0.9945945945945946
Model accuracy on test set: 29.67%


Evaluating:  53%|█████▎    | 159/300 [06:16<06:02,  2.57s/it]

Similarity Score: 0.8398576512455516
Model accuracy on test set: 30.00%


Evaluating:  53%|█████▎    | 160/300 [06:21<07:42,  3.30s/it]

Similarity Score: 0.9182608695652174
Model accuracy on test set: 30.33%


Evaluating:  54%|█████▎    | 161/300 [06:22<06:37,  2.86s/it]

Similarity Score: 0.9951690821256038
Model accuracy on test set: 30.67%


Evaluating:  54%|█████▍    | 162/300 [06:24<05:23,  2.34s/it]

Similarity Score: 0.9876543209876543
Model accuracy on test set: 31.00%


Evaluating:  54%|█████▍    | 163/300 [06:25<04:57,  2.17s/it]

Similarity Score: 0.9937888198757764
Model accuracy on test set: 31.33%


Evaluating:  55%|█████▍    | 164/300 [06:27<04:25,  1.95s/it]

Similarity Score: 0.5789473684210527
Model accuracy on test set: 31.33%


Evaluating:  55%|█████▌    | 165/300 [06:28<04:12,  1.87s/it]

Similarity Score: 0.9940119760479041
Model accuracy on test set: 31.67%


Evaluating:  55%|█████▌    | 166/300 [06:34<06:30,  2.91s/it]

Similarity Score: 0.484375
Model accuracy on test set: 31.67%


Evaluating:  56%|█████▌    | 167/300 [06:35<05:33,  2.50s/it]

Similarity Score: 0.9942857142857143
Model accuracy on test set: 32.00%


Evaluating:  56%|█████▌    | 168/300 [06:37<04:52,  2.22s/it]

Similarity Score: 0.9922480620155039
Model accuracy on test set: 32.33%


Evaluating:  56%|█████▋    | 169/300 [06:39<04:48,  2.20s/it]

Similarity Score: 0.8584905660377359
Model accuracy on test set: 32.67%


Evaluating:  57%|█████▋    | 170/300 [06:43<05:45,  2.66s/it]

Similarity Score: 0.3942505133470226
Model accuracy on test set: 32.67%


Evaluating:  57%|█████▋    | 171/300 [06:46<05:48,  2.70s/it]

Similarity Score: 0.5898123324396782
Model accuracy on test set: 32.67%


Evaluating:  57%|█████▋    | 172/300 [06:48<05:20,  2.50s/it]

Similarity Score: 0.75
Model accuracy on test set: 32.67%


Evaluating:  58%|█████▊    | 173/300 [06:50<04:58,  2.35s/it]

Similarity Score: 0.6529209621993127
Model accuracy on test set: 32.67%


Evaluating:  58%|█████▊    | 174/300 [06:52<04:58,  2.37s/it]

Similarity Score: 0.995260663507109
Model accuracy on test set: 33.00%


Evaluating:  58%|█████▊    | 175/300 [06:54<04:32,  2.18s/it]

Similarity Score: 0.9128205128205128
Model accuracy on test set: 33.33%


Evaluating:  59%|█████▊    | 176/300 [06:57<05:14,  2.54s/it]

Similarity Score: 0.8446215139442231
Model accuracy on test set: 33.67%


Evaluating:  59%|█████▉    | 177/300 [06:58<04:14,  2.07s/it]

Similarity Score: 0.8333333333333334
Model accuracy on test set: 34.00%


Evaluating:  59%|█████▉    | 178/300 [06:59<03:48,  1.87s/it]

Similarity Score: 0.9937106918238994
Model accuracy on test set: 34.33%


Evaluating:  60%|█████▉    | 179/300 [07:01<03:25,  1.70s/it]

Similarity Score: 0.7532467532467533
Model accuracy on test set: 34.33%


Evaluating:  60%|██████    | 180/300 [07:02<03:22,  1.68s/it]

Similarity Score: 0.9090909090909091
Model accuracy on test set: 34.67%


Evaluating:  60%|██████    | 181/300 [07:04<03:28,  1.75s/it]

Similarity Score: 0.9953488372093023
Model accuracy on test set: 35.00%


Evaluating:  61%|██████    | 182/300 [07:10<05:46,  2.93s/it]

Similarity Score: 0.0
Model accuracy on test set: 35.00%


Evaluating:  61%|██████    | 183/300 [07:12<04:54,  2.52s/it]

Similarity Score: 0.994475138121547
Model accuracy on test set: 35.33%


Evaluating:  61%|██████▏   | 184/300 [07:15<05:21,  2.77s/it]

Similarity Score: 0.6057692307692307
Model accuracy on test set: 35.33%


Evaluating:  62%|██████▏   | 185/300 [07:17<05:01,  2.62s/it]

Similarity Score: 0.9473684210526315
Model accuracy on test set: 35.67%


Evaluating:  62%|██████▏   | 186/300 [07:19<04:19,  2.27s/it]

Similarity Score: 0.9929078014184397
Model accuracy on test set: 36.00%


Evaluating:  62%|██████▏   | 187/300 [07:20<03:50,  2.04s/it]

Similarity Score: 0.9941520467836257
Model accuracy on test set: 36.33%


Evaluating:  63%|██████▎   | 188/300 [07:22<03:48,  2.04s/it]

Similarity Score: 0.3136094674556213
Model accuracy on test set: 36.33%


Evaluating:  63%|██████▎   | 189/300 [07:25<04:07,  2.23s/it]

Similarity Score: 0.9473684210526315
Model accuracy on test set: 36.67%


Evaluating:  63%|██████▎   | 190/300 [07:28<04:24,  2.40s/it]

Similarity Score: 0.8300395256916996
Model accuracy on test set: 37.00%


Evaluating:  64%|██████▎   | 191/300 [07:29<03:55,  2.16s/it]

Similarity Score: 0.8865979381443299
Model accuracy on test set: 37.33%


Evaluating:  64%|██████▍   | 192/300 [07:31<03:26,  1.91s/it]

Similarity Score: 0.9924812030075187
Model accuracy on test set: 37.67%


Evaluating:  64%|██████▍   | 193/300 [07:34<04:10,  2.35s/it]

Similarity Score: 0.6812749003984063
Model accuracy on test set: 37.67%


Evaluating:  65%|██████▍   | 194/300 [07:36<03:58,  2.25s/it]

Similarity Score: 0.9820359281437125
Model accuracy on test set: 38.00%


Evaluating:  65%|██████▌   | 195/300 [07:39<04:22,  2.50s/it]

Similarity Score: 0.9271137026239067
Model accuracy on test set: 38.33%


Evaluating:  65%|██████▌   | 196/300 [07:42<04:19,  2.49s/it]

Similarity Score: 0.3688760806916426
Model accuracy on test set: 38.33%


Evaluating:  66%|██████▌   | 197/300 [07:43<03:30,  2.04s/it]

Similarity Score: 0.9906542056074766
Model accuracy on test set: 38.67%


Evaluating:  66%|██████▌   | 198/300 [07:45<03:31,  2.08s/it]

Similarity Score: 0.8898678414096917
Model accuracy on test set: 39.00%


Evaluating:  66%|██████▋   | 199/300 [07:47<03:51,  2.29s/it]

Similarity Score: 0.8416988416988417
Model accuracy on test set: 39.33%


Evaluating:  67%|██████▋   | 200/300 [07:50<04:09,  2.49s/it]

Similarity Score: 0.7876712328767124
Model accuracy on test set: 39.33%


Evaluating:  67%|██████▋   | 201/300 [07:52<03:25,  2.07s/it]

Similarity Score: 0.9914529914529915
Model accuracy on test set: 39.67%


Evaluating:  67%|██████▋   | 202/300 [07:54<03:36,  2.21s/it]

Similarity Score: 0.7896678966789668
Model accuracy on test set: 39.67%


Evaluating:  68%|██████▊   | 203/300 [07:59<04:42,  2.91s/it]

Similarity Score: 0.0
Model accuracy on test set: 39.67%


Evaluating:  68%|██████▊   | 204/300 [08:01<04:24,  2.75s/it]

Similarity Score: 0.5294117647058824
Model accuracy on test set: 39.67%


Evaluating:  68%|██████▊   | 205/300 [08:04<04:14,  2.68s/it]

Similarity Score: 0.2792607802874743
Model accuracy on test set: 39.67%


Evaluating:  69%|██████▊   | 206/300 [08:06<04:08,  2.65s/it]

Similarity Score: 0.9962825278810409
Model accuracy on test set: 40.00%


Evaluating:  69%|██████▉   | 207/300 [08:09<04:19,  2.79s/it]

Similarity Score: 0.8644501278772379
Model accuracy on test set: 40.33%


Evaluating:  69%|██████▉   | 208/300 [08:10<03:33,  2.32s/it]

Similarity Score: 0.7103825136612022
Model accuracy on test set: 40.33%


Evaluating:  70%|██████▉   | 209/300 [08:11<02:54,  1.92s/it]

Similarity Score: 0.9166666666666666
Model accuracy on test set: 40.67%


Evaluating:  70%|███████   | 210/300 [08:13<02:53,  1.93s/it]

Similarity Score: 0.991869918699187
Model accuracy on test set: 41.00%


Evaluating:  70%|███████   | 211/300 [08:14<02:24,  1.63s/it]

Similarity Score: 0.9824561403508771
Model accuracy on test set: 41.33%


Evaluating:  71%|███████   | 212/300 [08:16<02:21,  1.61s/it]

Similarity Score: 0.9433962264150944
Model accuracy on test set: 41.67%


Evaluating:  71%|███████   | 213/300 [08:17<02:20,  1.61s/it]

Similarity Score: 0.641860465116279
Model accuracy on test set: 41.67%


Evaluating:  71%|███████▏  | 214/300 [08:19<02:16,  1.59s/it]

Similarity Score: 0.993006993006993
Model accuracy on test set: 42.00%


Evaluating:  72%|███████▏  | 215/300 [08:20<01:57,  1.38s/it]

Similarity Score: 0.9855072463768116
Model accuracy on test set: 42.33%


Evaluating:  72%|███████▏  | 216/300 [08:22<02:05,  1.50s/it]

Similarity Score: 0.9939393939393939
Model accuracy on test set: 42.67%


Evaluating:  72%|███████▏  | 217/300 [08:23<01:58,  1.43s/it]

Similarity Score: 0.31970260223048325
Model accuracy on test set: 42.67%


Evaluating:  73%|███████▎  | 218/300 [08:24<01:55,  1.41s/it]

Similarity Score: 0.9921259842519685
Model accuracy on test set: 43.00%


Evaluating:  73%|███████▎  | 219/300 [08:27<02:31,  1.88s/it]

Similarity Score: 0.7860262008733624
Model accuracy on test set: 43.00%


Evaluating:  73%|███████▎  | 220/300 [08:29<02:28,  1.86s/it]

Similarity Score: 0.9489795918367347
Model accuracy on test set: 43.33%


Evaluating:  74%|███████▎  | 221/300 [08:31<02:18,  1.75s/it]

Similarity Score: 0.9915966386554622
Model accuracy on test set: 43.67%


Evaluating:  74%|███████▍  | 222/300 [08:33<02:26,  1.88s/it]

Similarity Score: 0.7670250896057348
Model accuracy on test set: 43.67%


Evaluating:  74%|███████▍  | 223/300 [08:35<02:29,  1.94s/it]

Similarity Score: 0.996078431372549
Model accuracy on test set: 44.00%


Evaluating:  75%|███████▍  | 224/300 [08:38<02:58,  2.35s/it]

Similarity Score: 0.8642659279778393
Model accuracy on test set: 44.33%


Evaluating:  75%|███████▌  | 225/300 [08:39<02:32,  2.03s/it]

Similarity Score: 0.9885057471264368
Model accuracy on test set: 44.67%


Evaluating:  75%|███████▌  | 226/300 [08:43<03:08,  2.54s/it]

Similarity Score: 0.8431372549019608
Model accuracy on test set: 45.00%


Evaluating:  76%|███████▌  | 227/300 [08:45<02:56,  2.42s/it]

Similarity Score: 0.875968992248062
Model accuracy on test set: 45.33%


Evaluating:  76%|███████▌  | 228/300 [08:50<03:37,  3.03s/it]

Similarity Score: 0.12698412698412698
Model accuracy on test set: 45.33%


Evaluating:  76%|███████▋  | 229/300 [08:55<04:17,  3.63s/it]

Similarity Score: 0.6214099216710183
Model accuracy on test set: 45.33%


Evaluating:  77%|███████▋  | 230/300 [08:57<03:45,  3.23s/it]

Similarity Score: 0.7910447761194029
Model accuracy on test set: 45.33%


Evaluating:  77%|███████▋  | 231/300 [08:59<03:24,  2.97s/it]

Similarity Score: 0.8
Model accuracy on test set: 45.33%


Evaluating:  77%|███████▋  | 232/300 [09:02<03:20,  2.95s/it]

Similarity Score: 0.7612293144208038
Model accuracy on test set: 45.33%


Evaluating:  78%|███████▊  | 233/300 [09:05<03:10,  2.84s/it]

Similarity Score: 0.9971988795518207
Model accuracy on test set: 45.67%


Evaluating:  78%|███████▊  | 234/300 [09:07<03:00,  2.73s/it]

Similarity Score: 0.6010928961748634
Model accuracy on test set: 45.67%


Evaluating:  78%|███████▊  | 235/300 [09:09<02:27,  2.26s/it]

Similarity Score: 0.9900990099009901
Model accuracy on test set: 46.00%


Evaluating:  79%|███████▊  | 236/300 [09:10<02:06,  1.98s/it]

Similarity Score: 0.9933774834437086
Model accuracy on test set: 46.33%


Evaluating:  79%|███████▉  | 237/300 [09:13<02:21,  2.25s/it]

Similarity Score: 0.9960159362549801
Model accuracy on test set: 46.67%


Evaluating:  79%|███████▉  | 238/300 [09:17<03:04,  2.98s/it]

Similarity Score: 0.0
Model accuracy on test set: 46.67%


Evaluating:  80%|███████▉  | 239/300 [09:20<02:57,  2.91s/it]

Similarity Score: 0.8283582089552238
Model accuracy on test set: 47.00%


Evaluating:  80%|████████  | 240/300 [09:22<02:42,  2.71s/it]

Similarity Score: 0.6754385964912281
Model accuracy on test set: 47.00%


Evaluating:  80%|████████  | 241/300 [09:26<02:46,  2.81s/it]

Similarity Score: 0.4265402843601896
Model accuracy on test set: 47.00%


Evaluating:  81%|████████  | 242/300 [09:28<02:37,  2.72s/it]

Similarity Score: 0.9957805907172996
Model accuracy on test set: 47.33%


Evaluating:  81%|████████  | 243/300 [09:29<02:10,  2.28s/it]

Similarity Score: 0.993006993006993
Model accuracy on test set: 47.67%


Evaluating:  81%|████████▏ | 244/300 [09:31<02:04,  2.23s/it]

Similarity Score: 0.9938650306748467
Model accuracy on test set: 48.00%


Evaluating:  82%|████████▏ | 245/300 [09:35<02:18,  2.52s/it]

Similarity Score: 0.8167202572347267
Model accuracy on test set: 48.33%


Evaluating:  82%|████████▏ | 246/300 [09:36<01:55,  2.15s/it]

Similarity Score: 0.993103448275862
Model accuracy on test set: 48.67%


Evaluating:  82%|████████▏ | 247/300 [09:38<01:50,  2.09s/it]

Similarity Score: 0.6893939393939394
Model accuracy on test set: 48.67%


Evaluating:  83%|████████▎ | 248/300 [09:40<01:53,  2.17s/it]

Similarity Score: 0.5968992248062015
Model accuracy on test set: 48.67%


Evaluating:  83%|████████▎ | 249/300 [09:43<01:59,  2.34s/it]

Similarity Score: 0.7965616045845272
Model accuracy on test set: 48.67%


Evaluating:  83%|████████▎ | 250/300 [09:45<01:54,  2.30s/it]

Similarity Score: 0.994413407821229
Model accuracy on test set: 49.00%


Evaluating:  84%|████████▎ | 251/300 [09:47<01:43,  2.12s/it]

Similarity Score: 0.5836575875486382
Model accuracy on test set: 49.00%


Evaluating:  84%|████████▍ | 252/300 [09:48<01:29,  1.87s/it]

Similarity Score: 0.9922480620155039
Model accuracy on test set: 49.33%


Evaluating:  84%|████████▍ | 253/300 [09:50<01:31,  1.95s/it]

Similarity Score: 0.9950248756218906
Model accuracy on test set: 49.67%


Evaluating:  85%|████████▍ | 254/300 [09:54<01:54,  2.48s/it]

Similarity Score: 0.5757575757575758
Model accuracy on test set: 49.67%


Evaluating:  85%|████████▌ | 255/300 [09:59<02:30,  3.35s/it]

Similarity Score: 0.0
Model accuracy on test set: 49.67%


Evaluating:  85%|████████▌ | 256/300 [10:02<02:17,  3.13s/it]

Similarity Score: 0.5909090909090909
Model accuracy on test set: 49.67%


Evaluating:  86%|████████▌ | 257/300 [10:03<01:46,  2.48s/it]

Similarity Score: 0.82
Model accuracy on test set: 50.00%


Evaluating:  86%|████████▌ | 258/300 [10:04<01:28,  2.10s/it]

Similarity Score: 0.961038961038961
Model accuracy on test set: 50.33%


Evaluating:  86%|████████▋ | 259/300 [10:08<01:52,  2.75s/it]

Similarity Score: 0.7730337078651686
Model accuracy on test set: 50.33%


Evaluating:  87%|████████▋ | 260/300 [10:09<01:28,  2.22s/it]

Similarity Score: 0.340080971659919
Model accuracy on test set: 50.33%


Evaluating:  87%|████████▋ | 261/300 [10:12<01:28,  2.26s/it]

Similarity Score: 0.9509803921568627
Model accuracy on test set: 50.67%


Evaluating:  87%|████████▋ | 262/300 [10:17<01:54,  3.02s/it]

Similarity Score: 0.0
Model accuracy on test set: 50.67%


Evaluating:  88%|████████▊ | 263/300 [10:18<01:34,  2.56s/it]

Similarity Score: 0.9936305732484076
Model accuracy on test set: 51.00%


Evaluating:  88%|████████▊ | 264/300 [10:20<01:21,  2.27s/it]

Similarity Score: 0.8271604938271605
Model accuracy on test set: 51.33%


Evaluating:  88%|████████▊ | 265/300 [10:22<01:20,  2.31s/it]

Similarity Score: 0.41847826086956524
Model accuracy on test set: 51.33%


Evaluating:  89%|████████▊ | 266/300 [10:23<01:08,  2.03s/it]

Similarity Score: 0.8923076923076924
Model accuracy on test set: 51.67%


Evaluating:  89%|████████▉ | 267/300 [10:27<01:19,  2.42s/it]

Similarity Score: 0.8186813186813187
Model accuracy on test set: 52.00%


Evaluating:  89%|████████▉ | 268/300 [10:30<01:23,  2.60s/it]

Similarity Score: 0.9657320872274143
Model accuracy on test set: 52.33%


Evaluating:  90%|████████▉ | 269/300 [10:31<01:06,  2.15s/it]

Similarity Score: 0.72
Model accuracy on test set: 52.33%


Evaluating:  90%|█████████ | 270/300 [10:32<00:55,  1.85s/it]

Similarity Score: 0.7205882352941176
Model accuracy on test set: 52.33%


Evaluating:  90%|█████████ | 271/300 [10:35<01:02,  2.16s/it]

Similarity Score: 0.6784140969162996
Model accuracy on test set: 52.33%


Evaluating:  91%|█████████ | 272/300 [10:37<00:56,  2.01s/it]

Similarity Score: 0.9081632653061225
Model accuracy on test set: 52.67%


Evaluating:  91%|█████████ | 273/300 [10:38<00:53,  1.97s/it]

Similarity Score: 0.993103448275862
Model accuracy on test set: 53.00%


Evaluating:  91%|█████████▏| 274/300 [10:43<01:12,  2.77s/it]

Similarity Score: 0.0
Model accuracy on test set: 53.00%


Evaluating:  92%|█████████▏| 275/300 [10:45<01:03,  2.53s/it]

Similarity Score: 0.7603305785123967
Model accuracy on test set: 53.00%


Evaluating:  92%|█████████▏| 276/300 [10:47<00:53,  2.24s/it]

Similarity Score: 0.9923664122137404
Model accuracy on test set: 53.33%


Evaluating:  92%|█████████▏| 277/300 [10:48<00:44,  1.93s/it]

Similarity Score: 0.8831168831168831
Model accuracy on test set: 53.67%


Evaluating:  93%|█████████▎| 278/300 [10:50<00:41,  1.90s/it]

Similarity Score: 0.9935483870967742
Model accuracy on test set: 54.00%


Evaluating:  93%|█████████▎| 279/300 [10:53<00:50,  2.42s/it]

Similarity Score: 0.6006825938566553
Model accuracy on test set: 54.00%


Evaluating:  93%|█████████▎| 280/300 [10:56<00:51,  2.60s/it]

Similarity Score: 0.49696969696969695
Model accuracy on test set: 54.00%


Evaluating:  94%|█████████▎| 281/300 [10:59<00:49,  2.60s/it]

Similarity Score: 0.46919431279620855
Model accuracy on test set: 54.00%


Evaluating:  94%|█████████▍| 282/300 [11:00<00:38,  2.13s/it]

Similarity Score: 0.9896907216494846
Model accuracy on test set: 54.33%


Evaluating:  94%|█████████▍| 283/300 [11:01<00:30,  1.82s/it]

Similarity Score: 0.9850746268656716
Model accuracy on test set: 54.67%


Evaluating:  95%|█████████▍| 284/300 [11:03<00:27,  1.73s/it]

Similarity Score: 0.8711656441717791
Model accuracy on test set: 55.00%


Evaluating:  95%|█████████▌| 285/300 [11:04<00:25,  1.67s/it]

Similarity Score: 0.9940828402366864
Model accuracy on test set: 55.33%


Evaluating:  95%|█████████▌| 286/300 [11:05<00:22,  1.59s/it]

Similarity Score: 0.9887640449438202
Model accuracy on test set: 55.67%


Evaluating:  96%|█████████▌| 287/300 [11:08<00:22,  1.76s/it]

Similarity Score: 0.9672727272727273
Model accuracy on test set: 56.00%


Evaluating:  96%|█████████▌| 288/300 [11:10<00:23,  1.98s/it]

Similarity Score: 0.9079365079365079
Model accuracy on test set: 56.33%


Evaluating:  96%|█████████▋| 289/300 [11:11<00:19,  1.75s/it]

Similarity Score: 0.9922480620155039
Model accuracy on test set: 56.67%


Evaluating:  97%|█████████▋| 290/300 [11:13<00:18,  1.88s/it]

Similarity Score: 0.8
Model accuracy on test set: 56.67%


Evaluating:  97%|█████████▋| 291/300 [11:16<00:19,  2.22s/it]

Similarity Score: 0.689108910891089
Model accuracy on test set: 56.67%


Evaluating:  97%|█████████▋| 292/300 [11:19<00:17,  2.20s/it]

Similarity Score: 0.641860465116279
Model accuracy on test set: 56.67%


Evaluating:  98%|█████████▊| 293/300 [11:21<00:14,  2.14s/it]

Similarity Score: 0.9969230769230769
Model accuracy on test set: 57.00%


Evaluating:  98%|█████████▊| 294/300 [11:22<00:11,  1.99s/it]

Similarity Score: 0.9947643979057592
Model accuracy on test set: 57.33%


Evaluating:  98%|█████████▊| 295/300 [11:25<00:11,  2.28s/it]

Similarity Score: 0.5260273972602739
Model accuracy on test set: 57.33%


Evaluating:  99%|█████████▊| 296/300 [11:27<00:08,  2.18s/it]

Similarity Score: 0.93
Model accuracy on test set: 57.67%


Evaluating:  99%|█████████▉| 297/300 [11:29<00:05,  1.96s/it]

Similarity Score: 0.7941176470588235
Model accuracy on test set: 57.67%


Evaluating:  99%|█████████▉| 298/300 [11:32<00:04,  2.42s/it]

Similarity Score: 0.9942196531791907
Model accuracy on test set: 58.00%


Evaluating: 100%|█████████▉| 299/300 [11:34<00:02,  2.33s/it]

Similarity Score: 0.9873417721518988
Model accuracy on test set: 58.33%


Evaluating: 100%|██████████| 300/300 [11:41<00:00,  2.34s/it]

Similarity Score: 0.6974358974358974
Model accuracy on test set: 58.33%
Model accuracy on test set: 58.33%





EXAMPLE OF AN 85% ACCURACY LIKE:

In [None]:
import re
import random
import sqlparse
import difflib
from tqdm import tqdm
from unsloth import FastLanguageModel

range_inference = 300 # Choose any number!
accuracy = 0
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def extract_sql_statement(response_text):
    """
    Extract the SQL statement from the response text.
    """
    # Define the regex pattern to extract the SQL statement
    pattern = r'### Response:\n(.*?);'
    match = re.search(pattern, response_text, re.DOTALL)
    if match:
        # Return the extracted SQL statement
        return match.group(1).strip()
    return ""

def format_sql(sql):
    """
    Format SQL queries to a standard format.
    """
    return sqlparse.format(sql.strip(), reindent=True, keyword_case='upper')

def evaluate_similarity(generated_response, ground_truth_sql):
    """
    Evaluate the similarity between the generated response and ground truth SQL.
    """
    # Extract SQL statement from the generated response
    generated_sql = extract_sql_statement(generated_response)

    # Format both SQL queries for comparison
    formatted_generated_sql = format_sql(generated_sql)
    formatted_ground_truth = format_sql(ground_truth_sql)

    # Calculate similarity score
    similarity = difflib.SequenceMatcher(None, formatted_generated_sql, formatted_ground_truth).ratio()
    return similarity

# Process each example in the test dataset
for example in tqdm(dataset_test.shuffle(seed=154).select([1]), desc="Evaluating"):
    # Create the input prompt
    input_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{example['sql_prompt']}

### Input:
{example['sql_context']}

### Response:
"""

    # Tokenize and prepare the input
    inputs = tokenizer([input_prompt], return_tensors="pt").to("cuda")

    # Generate the output SQL query
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    generated_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Evaluate the similarity
    ground_truth_sql = example['sql']
    similarity_score = evaluate_similarity(generated_response, ground_truth_sql)
    print(f"Similarity Score: {similarity_score}")
    # Assuming a similarity threshold of 0.8 for correctness
    if similarity_score > 0.8:
        accuracy += 1
    #accuracy_percentage = (accuracy / range_inference) * 100
    #print(f"Model accuracy on test set: {accuracy_percentage:.2f}%")
    print(generated_response)
    print(ground_truth_sql)

# Calculate and print overall accuracy
accuracy_percentage = (accuracy / range_inference) * 100
print(f"Model accuracy on test set: {accuracy_percentage:.2f}%")


Evaluating: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it]

Similarity Score: 0.8413793103448276
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is the average level of satisfaction for VR games in the 'Gaming' category?

### Input:
CREATE TABLE Games (id INT, name VARCHAR(100), category VARCHAR(50), satisfaction FLOAT);

### Response:
SELECT AVG(satisfaction) FROM Games WHERE category = 'VR' AND name LIKE 'Gaming%';
SELECT AVG(satisfaction) FROM Games WHERE category = 'Gaming';
Model accuracy on test set: 0.33%





In [None]:
import re
import random
import sqlparse
import difflib
from tqdm import tqdm
from unsloth import FastLanguageModel

total_examples = 300#len(dataset_test) # Choose any number!
accuracy_80 = 0
accuracy_90 = 0
accuracy_95 = 0

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def extract_sql_statement(response_text):
    """
    Extract the SQL statement from the response text.
    """
    # Define the regex pattern to extract the SQL statement
    pattern = r'### Response:\n(.*?);'
    match = re.search(pattern, response_text, re.DOTALL)
    if match:
        # Return the extracted SQL statement
        return match.group(1).strip()
    return ""

def format_sql(sql):
    """
    Format SQL queries to a standard format.
    """
    return sqlparse.format(sql.strip(), reindent=True, keyword_case='upper')

def evaluate_similarity(generated_response, ground_truth_sql):
    """
    Evaluate the similarity between the generated response and ground truth SQL.
    """
    # Extract SQL statement from the generated response
    generated_sql = extract_sql_statement(generated_response)

    # Format both SQL queries for comparison
    formatted_generated_sql = format_sql(generated_sql)
    formatted_ground_truth = format_sql(ground_truth_sql)

    # Calculate similarity score
    similarity = difflib.SequenceMatcher(None, formatted_generated_sql, formatted_ground_truth).ratio()
    return similarity

# Process each example in the test dataset
for step, example in enumerate(tqdm(dataset_test.select(range(total_examples)), desc="Evaluating"), 1):
    # Create the input prompt
    input_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{example['sql_prompt']}

### Input:
{example['sql_context']}

### Response:
"""

    # Tokenize and prepare the input
    inputs = tokenizer([input_prompt], return_tensors="pt").to("cuda")

    # Generate the output SQL query
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    generated_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Evaluate the similarity
    ground_truth_sql = example['sql']
    similarity_score = evaluate_similarity(generated_response, ground_truth_sql)
    print(f"Similarity Score: {similarity_score}")
    # Assuming a similarity threshold of 0.8 for correctness
    if similarity_score > 0.8:
        accuracy_80 += 1
    if similarity_score > 0.9:
        accuracy_90 += 1
    if similarity_score > 0.95:
        accuracy_95 += 1
    if step % 50 == 0 or step == total_examples:
        accuracy_percentage_80 = (accuracy_80 / step) * 100
        accuracy_percentage_90 = (accuracy_90 / step) * 100
        accuracy_percentage_95 = (accuracy_95 / step) * 100
        print(f"Step {step}:")
        print(f"Model accuracy on test set 80%: {accuracy_percentage_80:.2f}%")
        print(f"Model accuracy on test set 90%: {accuracy_percentage_90:.2f}%")
        print(f"Model accuracy on test set 95%: {accuracy_percentage_95:.2f}%")



# Calculate and print overall accuracy
accuracy_percentage_80 = (accuracy_80 / total_examples) * 100
accuracy_percentage_90 = (accuracy_90 / total_examples) * 100
accuracy_percentage_95 = (accuracy_95 / total_examples) * 100
print(f"Final Model accuracy on test set 80%: {accuracy_percentage_80:.2f}%")
print(f"Final Model accuracy on test set 90%: {accuracy_percentage_90:.2f}%")
print(f"Final Model accuracy on test set 95%: {accuracy_percentage_95:.2f}%")



Evaluating:   0%|          | 1/300 [00:05<28:22,  5.70s/it]

Similarity Score: 0.995475113122172


Evaluating:   1%|          | 2/300 [00:08<21:17,  4.29s/it]

Similarity Score: 0.5133689839572193


Evaluating:   1%|          | 3/300 [00:10<14:53,  3.01s/it]

Similarity Score: 0.9841269841269841


Evaluating:   1%|▏         | 4/300 [00:14<16:05,  3.26s/it]

Similarity Score: 0.9928057553956835


Evaluating:   2%|▏         | 5/300 [00:23<26:31,  5.39s/it]

Similarity Score: 0.45265588914549654


Evaluating:   2%|▏         | 6/300 [00:25<21:22,  4.36s/it]

Similarity Score: 0.9727272727272728


Evaluating:   2%|▏         | 7/300 [00:26<15:49,  3.24s/it]

Similarity Score: 0.9915966386554622


Evaluating:   3%|▎         | 8/300 [00:29<14:57,  3.07s/it]

Similarity Score: 0.8037383177570093


Evaluating:   3%|▎         | 9/300 [00:31<14:18,  2.95s/it]

Similarity Score: 0.9937106918238994


Evaluating:   3%|▎         | 10/300 [00:34<12:56,  2.68s/it]

Similarity Score: 0.703125


Evaluating:   4%|▎         | 11/300 [00:35<10:54,  2.27s/it]

Similarity Score: 0.9931972789115646


Evaluating:   4%|▍         | 12/300 [00:39<14:15,  2.97s/it]

Similarity Score: 0.27941176470588236


Evaluating:   4%|▍         | 13/300 [00:41<11:52,  2.48s/it]

Similarity Score: 0.9027777777777778


Evaluating:   5%|▍         | 14/300 [00:42<10:38,  2.23s/it]

Similarity Score: 0.9915966386554622


Evaluating:   5%|▌         | 15/300 [00:45<10:49,  2.28s/it]

Similarity Score: 0.6397058823529411


Evaluating:   5%|▌         | 16/300 [00:49<13:40,  2.89s/it]

Similarity Score: 0.8235294117647058


Evaluating:   6%|▌         | 17/300 [00:51<12:26,  2.64s/it]

Similarity Score: 0.9956331877729258


Evaluating:   6%|▌         | 18/300 [00:53<11:34,  2.46s/it]

Similarity Score: 0.6772151898734177


Evaluating:   6%|▋         | 19/300 [00:55<10:52,  2.32s/it]

Similarity Score: 0.9447236180904522


Evaluating:   7%|▋         | 20/300 [00:58<11:53,  2.55s/it]

Similarity Score: 0.7209302325581395


Evaluating:   7%|▋         | 21/300 [01:00<09:56,  2.14s/it]

Similarity Score: 0.990990990990991


Evaluating:   7%|▋         | 22/300 [01:04<13:13,  2.85s/it]

Similarity Score: 0.977859778597786


Evaluating:   8%|▊         | 23/300 [01:08<15:02,  3.26s/it]

Similarity Score: 0.6494252873563219


Evaluating:   8%|▊         | 24/300 [01:12<16:06,  3.50s/it]

Similarity Score: 0.44547563805104406


Evaluating:   8%|▊         | 25/300 [01:14<13:51,  3.02s/it]

Similarity Score: 0.9950248756218906


Evaluating:   9%|▊         | 26/300 [01:16<12:26,  2.73s/it]

Similarity Score: 0.9951219512195122


Evaluating:   9%|▉         | 27/300 [01:18<11:22,  2.50s/it]

Similarity Score: 0.9947089947089947


Evaluating:   9%|▉         | 28/300 [01:21<11:01,  2.43s/it]

Similarity Score: 0.9311740890688259


Evaluating:  10%|▉         | 29/300 [01:23<11:13,  2.48s/it]

Similarity Score: 0.9037656903765691


Evaluating:  10%|█         | 30/300 [01:25<10:32,  2.34s/it]

Similarity Score: 0.8658536585365854


Evaluating:  10%|█         | 31/300 [01:30<13:23,  2.99s/it]

Similarity Score: 0.36890243902439024


Evaluating:  11%|█         | 32/300 [01:31<10:49,  2.42s/it]

Similarity Score: 0.9911504424778761


Evaluating:  11%|█         | 33/300 [01:35<12:56,  2.91s/it]

Similarity Score: 0.8113879003558719


Evaluating:  11%|█▏        | 34/300 [01:37<11:23,  2.57s/it]

Similarity Score: 0.703125


Evaluating:  12%|█▏        | 35/300 [01:39<11:09,  2.52s/it]

Similarity Score: 0.6685082872928176


Evaluating:  12%|█▏        | 36/300 [01:42<11:20,  2.58s/it]

Similarity Score: 0.9974160206718347


Evaluating:  12%|█▏        | 37/300 [01:43<09:47,  2.24s/it]

Similarity Score: 0.9937106918238994


Evaluating:  13%|█▎        | 38/300 [01:46<10:21,  2.37s/it]

Similarity Score: 0.7862595419847328


Evaluating:  13%|█▎        | 39/300 [01:49<10:50,  2.49s/it]

Similarity Score: 0.7823529411764706


Evaluating:  13%|█▎        | 40/300 [01:54<14:26,  3.33s/it]

Similarity Score: 0.0


Evaluating:  14%|█▎        | 41/300 [01:57<13:39,  3.16s/it]

Similarity Score: 0.657243816254417


Evaluating:  14%|█▍        | 42/300 [01:58<11:18,  2.63s/it]

Similarity Score: 0.6666666666666666


Evaluating:  14%|█▍        | 43/300 [01:59<09:33,  2.23s/it]

Similarity Score: 0.9940119760479041


Evaluating:  15%|█▍        | 44/300 [02:03<11:54,  2.79s/it]

Similarity Score: 0.6303317535545023


Evaluating:  15%|█▌        | 45/300 [02:09<15:12,  3.58s/it]

Similarity Score: 0.2696245733788396


Evaluating:  15%|█▌        | 46/300 [02:11<13:45,  3.25s/it]

Similarity Score: 0.8011869436201781


Evaluating:  16%|█▌        | 47/300 [02:13<12:00,  2.85s/it]

Similarity Score: 0.5864661654135338


Evaluating:  16%|█▌        | 48/300 [02:20<16:30,  3.93s/it]

Similarity Score: 0.6088794926004228


Evaluating:  16%|█▋        | 49/300 [02:23<16:05,  3.85s/it]

Similarity Score: 0.4277227722772277


Evaluating:  17%|█▋        | 50/300 [02:26<14:34,  3.50s/it]

Similarity Score: 0.6351931330472103
Step 50:
Model accuracy on test set 80%: 54.00%
Model accuracy on test set 90%: 44.00%
Model accuracy on test set 95%: 36.00%


Evaluating:  17%|█▋        | 51/300 [02:27<11:33,  2.79s/it]

Similarity Score: 0.6909090909090909


Evaluating:  17%|█▋        | 52/300 [02:29<09:46,  2.37s/it]

Similarity Score: 0.7402597402597403


Evaluating:  18%|█▊        | 53/300 [02:31<10:22,  2.52s/it]

Similarity Score: 0.7668393782383419


Evaluating:  18%|█▊        | 54/300 [02:33<09:38,  2.35s/it]

Similarity Score: 0.9958847736625515


Evaluating:  18%|█▊        | 55/300 [02:35<09:14,  2.26s/it]

Similarity Score: 0.9130434782608695


Evaluating:  19%|█▊        | 56/300 [02:37<07:47,  1.92s/it]

Similarity Score: 0.98


Evaluating:  19%|█▉        | 57/300 [02:38<06:55,  1.71s/it]

Similarity Score: 0.9915966386554622


Evaluating:  19%|█▉        | 58/300 [02:39<06:14,  1.55s/it]

Similarity Score: 0.9935483870967742


Evaluating:  20%|█▉        | 59/300 [02:42<07:39,  1.91s/it]

Similarity Score: 0.973384030418251


Evaluating:  20%|██        | 60/300 [02:45<08:45,  2.19s/it]

Similarity Score: 0.7254901960784313


Evaluating:  20%|██        | 61/300 [02:47<09:36,  2.41s/it]

Similarity Score: 0.751412429378531


Evaluating:  21%|██        | 62/300 [02:50<09:09,  2.31s/it]

Similarity Score: 0.9041095890410958


Evaluating:  21%|██        | 63/300 [02:53<10:18,  2.61s/it]

Similarity Score: 0.81


Evaluating:  21%|██▏       | 64/300 [02:57<11:59,  3.05s/it]

Similarity Score: 0.23624595469255663


Evaluating:  22%|██▏       | 65/300 [03:00<12:07,  3.10s/it]

Similarity Score: 0.4563758389261745


Evaluating:  22%|██▏       | 66/300 [03:03<12:20,  3.17s/it]

Similarity Score: 0.8764044943820225


Evaluating:  22%|██▏       | 67/300 [03:06<11:10,  2.88s/it]

Similarity Score: 0.9957446808510638


Evaluating:  23%|██▎       | 68/300 [03:08<10:37,  2.75s/it]

Similarity Score: 0.8945686900958466


Evaluating:  23%|██▎       | 69/300 [03:10<09:51,  2.56s/it]

Similarity Score: 0.8271604938271605


Evaluating:  23%|██▎       | 70/300 [03:13<10:25,  2.72s/it]

Similarity Score: 0.4021164021164021


Evaluating:  24%|██▎       | 71/300 [03:16<10:22,  2.72s/it]

Similarity Score: 0.7790697674418605


Evaluating:  24%|██▍       | 72/300 [03:18<09:28,  2.49s/it]

Similarity Score: 0.7012987012987013


Evaluating:  24%|██▍       | 73/300 [03:23<12:31,  3.31s/it]

Similarity Score: 0.6701846965699209


Evaluating:  25%|██▍       | 74/300 [03:30<15:56,  4.23s/it]

Similarity Score: 0.7916666666666666


Evaluating:  25%|██▌       | 75/300 [03:33<15:05,  4.02s/it]

Similarity Score: 0.8780487804878049


Evaluating:  25%|██▌       | 76/300 [03:36<13:23,  3.59s/it]

Similarity Score: 0.6354166666666666


Evaluating:  26%|██▌       | 77/300 [03:48<22:40,  6.10s/it]

Similarity Score: 0.5119305856832972


Evaluating:  26%|██▌       | 78/300 [03:51<19:59,  5.40s/it]

Similarity Score: 0.7909604519774012


Evaluating:  26%|██▋       | 79/300 [03:54<16:31,  4.49s/it]

Similarity Score: 0.8888888888888888


Evaluating:  27%|██▋       | 80/300 [03:57<15:18,  4.18s/it]

Similarity Score: 0.8427299703264095


Evaluating:  27%|██▋       | 81/300 [04:02<15:20,  4.20s/it]

Similarity Score: 0.6051282051282051


Evaluating:  27%|██▋       | 82/300 [04:03<11:45,  3.24s/it]

Similarity Score: 0.9855072463768116


Evaluating:  28%|██▊       | 83/300 [04:07<12:32,  3.47s/it]

Similarity Score: 0.28635346756152125


Evaluating:  28%|██▊       | 84/300 [04:09<11:48,  3.28s/it]

Similarity Score: 0.4827586206896552


Evaluating:  28%|██▊       | 85/300 [04:11<10:10,  2.84s/it]

Similarity Score: 0.9942196531791907


Evaluating:  29%|██▊       | 86/300 [04:14<10:29,  2.94s/it]

Similarity Score: 0.7285714285714285


Evaluating:  29%|██▉       | 87/300 [04:16<09:17,  2.62s/it]

Similarity Score: 0.9949748743718593


Evaluating:  29%|██▉       | 88/300 [04:18<07:52,  2.23s/it]

Similarity Score: 0.8227848101265823


Evaluating:  30%|██▉       | 89/300 [04:20<08:25,  2.40s/it]

Similarity Score: 0.8413284132841329


Evaluating:  30%|███       | 90/300 [04:23<08:20,  2.38s/it]

Similarity Score: 0.9225352112676056


Evaluating:  30%|███       | 91/300 [04:24<07:14,  2.08s/it]

Similarity Score: 0.8831168831168831


Evaluating:  31%|███       | 92/300 [04:25<06:33,  1.89s/it]

Similarity Score: 0.9939393939393939


Evaluating:  31%|███       | 93/300 [04:27<05:58,  1.73s/it]

Similarity Score: 0.9583333333333334


Evaluating:  31%|███▏      | 94/300 [04:30<07:16,  2.12s/it]

Similarity Score: 0.9962264150943396


Evaluating:  32%|███▏      | 95/300 [04:32<06:59,  2.05s/it]

Similarity Score: 0.5092250922509225


Evaluating:  32%|███▏      | 96/300 [04:33<06:36,  1.94s/it]

Similarity Score: 0.9917355371900827


Evaluating:  32%|███▏      | 97/300 [04:36<07:26,  2.20s/it]

Similarity Score: 0.34877384196185285


Evaluating:  33%|███▎      | 98/300 [04:40<08:31,  2.53s/it]

Similarity Score: 0.4660633484162896


Evaluating:  33%|███▎      | 99/300 [04:41<07:24,  2.21s/it]

Similarity Score: 0.9942857142857143


Evaluating:  33%|███▎      | 100/300 [04:43<06:54,  2.07s/it]

Similarity Score: 0.9945945945945946
Step 100:
Model accuracy on test set 80%: 55.00%
Model accuracy on test set 90%: 40.00%
Model accuracy on test set 95%: 33.00%


Evaluating:  34%|███▎      | 101/300 [04:46<08:09,  2.46s/it]

Similarity Score: 0.7493112947658402


Evaluating:  34%|███▍      | 102/300 [04:49<08:50,  2.68s/it]

Similarity Score: 0.8976377952755905


Evaluating:  34%|███▍      | 103/300 [04:51<08:07,  2.47s/it]

Similarity Score: 0.4888888888888889


Evaluating:  35%|███▍      | 104/300 [04:53<07:40,  2.35s/it]

Similarity Score: 0.7973856209150327


Evaluating:  35%|███▌      | 105/300 [04:55<06:27,  1.99s/it]

Similarity Score: 0.991869918699187


Evaluating:  35%|███▌      | 106/300 [04:58<08:19,  2.57s/it]

Similarity Score: 0.7241379310344828


Evaluating:  36%|███▌      | 107/300 [05:01<08:13,  2.55s/it]

Similarity Score: 0.7310344827586207


Evaluating:  36%|███▌      | 108/300 [05:02<07:02,  2.20s/it]

Similarity Score: 0.9923664122137404


Evaluating:  36%|███▋      | 109/300 [05:04<06:06,  1.92s/it]

Similarity Score: 0.9906542056074766


Evaluating:  37%|███▋      | 110/300 [05:05<05:07,  1.62s/it]

Similarity Score: 0.4444444444444444


Evaluating:  37%|███▋      | 111/300 [05:07<05:46,  1.83s/it]

Similarity Score: 0.996742671009772


Evaluating:  37%|███▋      | 112/300 [05:08<04:56,  1.57s/it]

Similarity Score: 0.6050420168067226


Evaluating:  38%|███▊      | 113/300 [05:10<05:25,  1.74s/it]

Similarity Score: 0.8802588996763754


Evaluating:  38%|███▊      | 114/300 [05:11<05:12,  1.68s/it]

Similarity Score: 0.9940119760479041


Evaluating:  38%|███▊      | 115/300 [05:17<08:44,  2.84s/it]

Similarity Score: 0.5098901098901099


Evaluating:  39%|███▊      | 116/300 [05:19<07:27,  2.43s/it]

Similarity Score: 0.6595744680851063


Evaluating:  39%|███▉      | 117/300 [05:21<07:41,  2.52s/it]

Similarity Score: 0.8048780487804879


Evaluating:  39%|███▉      | 118/300 [05:23<06:46,  2.23s/it]

Similarity Score: 0.5410628019323671


Evaluating:  40%|███▉      | 119/300 [05:26<07:13,  2.39s/it]

Similarity Score: 0.7264957264957265


Evaluating:  40%|████      | 120/300 [05:30<09:23,  3.13s/it]

Similarity Score: 0.5741444866920152


Evaluating:  40%|████      | 121/300 [05:32<07:44,  2.60s/it]

Similarity Score: 0.5726495726495726


Evaluating:  41%|████      | 122/300 [05:34<07:36,  2.57s/it]

Similarity Score: 0.9635627530364372


Evaluating:  41%|████      | 123/300 [05:36<06:43,  2.28s/it]

Similarity Score: 0.993006993006993


Evaluating:  41%|████▏     | 124/300 [05:39<07:02,  2.40s/it]

Similarity Score: 0.99644128113879


Evaluating:  42%|████▏     | 125/300 [05:40<06:18,  2.16s/it]

Similarity Score: 0.993103448275862


Evaluating:  42%|████▏     | 126/300 [05:42<06:10,  2.13s/it]

Similarity Score: 0.9947089947089947


Evaluating:  42%|████▏     | 127/300 [05:44<05:52,  2.04s/it]

Similarity Score: 0.8909090909090909


Evaluating:  43%|████▎     | 128/300 [05:46<05:30,  1.92s/it]

Similarity Score: 0.9152542372881356


Evaluating:  43%|████▎     | 129/300 [05:47<04:52,  1.71s/it]

Similarity Score: 0.9929078014184397


Evaluating:  43%|████▎     | 130/300 [05:48<04:38,  1.64s/it]

Similarity Score: 0.7682119205298014


Evaluating:  44%|████▎     | 131/300 [05:52<05:54,  2.10s/it]

Similarity Score: 0.6630434782608695


Evaluating:  44%|████▍     | 132/300 [05:58<09:53,  3.54s/it]

Similarity Score: 0.7146067415730337


Evaluating:  44%|████▍     | 133/300 [06:01<08:59,  3.23s/it]

Similarity Score: 0.9278350515463918


Evaluating:  45%|████▍     | 134/300 [06:03<07:35,  2.75s/it]

Similarity Score: 0.9935483870967742


Evaluating:  45%|████▌     | 135/300 [06:06<08:02,  2.92s/it]

Similarity Score: 0.41397849462365593


Evaluating:  45%|████▌     | 136/300 [06:08<07:39,  2.80s/it]

Similarity Score: 1.0


Evaluating:  46%|████▌     | 137/300 [06:11<07:41,  2.83s/it]

Similarity Score: 0.7905882352941176


Evaluating:  46%|████▌     | 138/300 [06:14<07:22,  2.73s/it]

Similarity Score: 0.41044776119402987


Evaluating:  46%|████▋     | 139/300 [06:16<06:41,  2.49s/it]

Similarity Score: 0.9953488372093023


Evaluating:  47%|████▋     | 140/300 [06:17<05:51,  2.20s/it]

Similarity Score: 0.7203389830508474


Evaluating:  47%|████▋     | 141/300 [06:20<06:05,  2.30s/it]

Similarity Score: 0.6125461254612546


Evaluating:  47%|████▋     | 142/300 [06:22<05:52,  2.23s/it]

Similarity Score: 0.9943502824858758


Evaluating:  48%|████▊     | 143/300 [06:25<06:40,  2.55s/it]

Similarity Score: 0.3881578947368421


Evaluating:  48%|████▊     | 144/300 [06:28<06:46,  2.60s/it]

Similarity Score: 0.856140350877193


Evaluating:  48%|████▊     | 145/300 [06:30<06:01,  2.33s/it]

Similarity Score: 0.90625


Evaluating:  49%|████▊     | 146/300 [06:31<05:20,  2.08s/it]

Similarity Score: 0.6964285714285714


Evaluating:  49%|████▉     | 147/300 [06:37<08:13,  3.23s/it]

Similarity Score: 0.0


Evaluating:  49%|████▉     | 148/300 [06:40<07:38,  3.01s/it]

Similarity Score: 0.9107981220657277


Evaluating:  50%|████▉     | 149/300 [06:41<06:38,  2.64s/it]

Similarity Score: 0.5291828793774319


Evaluating:  50%|█████     | 150/300 [06:43<06:14,  2.50s/it]

Similarity Score: 0.9938650306748467
Step 150:
Model accuracy on test set 80%: 53.33%
Model accuracy on test set 90%: 40.00%
Model accuracy on test set 95%: 32.67%


Evaluating:  50%|█████     | 151/300 [06:46<05:55,  2.38s/it]

Similarity Score: 0.992


Evaluating:  51%|█████     | 152/300 [06:48<06:08,  2.49s/it]

Similarity Score: 0.9904761904761905


Evaluating:  51%|█████     | 153/300 [06:51<06:22,  2.60s/it]

Similarity Score: 0.8035714285714286


Evaluating:  51%|█████▏    | 154/300 [06:54<06:15,  2.57s/it]

Similarity Score: 0.9779179810725552


Evaluating:  52%|█████▏    | 155/300 [06:55<05:32,  2.29s/it]

Similarity Score: 0.9937106918238994


Evaluating:  52%|█████▏    | 156/300 [06:57<04:52,  2.03s/it]

Similarity Score: 0.8859060402684564


Evaluating:  52%|█████▏    | 157/300 [06:58<04:25,  1.86s/it]

Similarity Score: 0.9942857142857143


Evaluating:  53%|█████▎    | 158/300 [07:00<04:36,  1.95s/it]

Similarity Score: 0.6103896103896104


Evaluating:  53%|█████▎    | 159/300 [07:03<04:48,  2.05s/it]

Similarity Score: 0.9939393939393939


Evaluating:  53%|█████▎    | 160/300 [07:05<04:40,  2.01s/it]

Similarity Score: 0.9949748743718593


Evaluating:  54%|█████▎    | 161/300 [07:06<04:31,  1.95s/it]

Similarity Score: 0.993103448275862


Evaluating:  54%|█████▍    | 162/300 [07:08<04:02,  1.76s/it]

Similarity Score: 0.990990990990991


Evaluating:  54%|█████▍    | 163/300 [07:09<03:35,  1.58s/it]

Similarity Score: 0.991869918699187


Evaluating:  55%|█████▍    | 164/300 [07:11<03:52,  1.71s/it]

Similarity Score: 0.9306122448979591


Evaluating:  55%|█████▌    | 165/300 [07:12<03:36,  1.60s/it]

Similarity Score: 0.9896907216494846


Evaluating:  55%|█████▌    | 166/300 [07:15<04:08,  1.85s/it]

Similarity Score: 0.9942857142857143


Evaluating:  56%|█████▌    | 167/300 [07:16<03:32,  1.60s/it]

Similarity Score: 0.9850746268656716


Evaluating:  56%|█████▌    | 168/300 [07:17<03:39,  1.66s/it]

Similarity Score: 0.9911504424778761


Evaluating:  56%|█████▋    | 169/300 [07:19<03:40,  1.69s/it]

Similarity Score: 0.9922480620155039


Evaluating:  57%|█████▋    | 170/300 [07:21<03:52,  1.79s/it]

Similarity Score: 0.47257383966244726


Evaluating:  57%|█████▋    | 171/300 [07:26<05:43,  2.66s/it]

Similarity Score: 0.0


Evaluating:  57%|█████▋    | 172/300 [07:30<06:40,  3.13s/it]

Similarity Score: 0.6643356643356644


Evaluating:  58%|█████▊    | 173/300 [07:33<06:10,  2.91s/it]

Similarity Score: 0.7430830039525692


Evaluating:  58%|█████▊    | 174/300 [07:36<06:44,  3.21s/it]

Similarity Score: 0.6024096385542169


Evaluating:  58%|█████▊    | 175/300 [07:40<06:35,  3.17s/it]

Similarity Score: 0.5645933014354066


Evaluating:  59%|█████▊    | 176/300 [07:41<05:18,  2.57s/it]

Similarity Score: 0.9929078014184397


Evaluating:  59%|█████▉    | 177/300 [07:43<04:56,  2.41s/it]

Similarity Score: 0.38848920863309355


Evaluating:  59%|█████▉    | 178/300 [07:46<05:31,  2.72s/it]

Similarity Score: 0.5397923875432526


Evaluating:  60%|█████▉    | 179/300 [07:48<05:00,  2.48s/it]

Similarity Score: 0.9134615384615384


Evaluating:  60%|██████    | 180/300 [07:50<04:22,  2.19s/it]

Similarity Score: 0.9461077844311377


Evaluating:  60%|██████    | 181/300 [07:51<03:43,  1.87s/it]

Similarity Score: 0.9906542056074766


Evaluating:  61%|██████    | 182/300 [07:56<05:42,  2.90s/it]

Similarity Score: 0.0


Evaluating:  61%|██████    | 183/300 [07:59<05:55,  3.04s/it]

Similarity Score: 0.8121827411167513


Evaluating:  61%|██████▏   | 184/300 [08:01<05:05,  2.63s/it]

Similarity Score: 0.847682119205298


Evaluating:  62%|██████▏   | 185/300 [08:02<04:18,  2.25s/it]

Similarity Score: 1.0


Evaluating:  62%|██████▏   | 186/300 [08:04<03:47,  1.99s/it]

Similarity Score: 0.8920863309352518


Evaluating:  62%|██████▏   | 187/300 [08:06<04:05,  2.17s/it]

Similarity Score: 0.8130841121495327


Evaluating:  63%|██████▎   | 188/300 [08:11<05:08,  2.76s/it]

Similarity Score: 0.7146974063400576


Evaluating:  63%|██████▎   | 189/300 [08:12<04:19,  2.33s/it]

Similarity Score: 0.8717948717948718


Evaluating:  63%|██████▎   | 190/300 [08:13<03:49,  2.09s/it]

Similarity Score: 0.7346938775510204


Evaluating:  64%|██████▎   | 191/300 [08:16<03:47,  2.09s/it]

Similarity Score: 0.9516129032258065


Evaluating:  64%|██████▍   | 192/300 [08:17<03:38,  2.03s/it]

Similarity Score: 0.602510460251046


Evaluating:  64%|██████▍   | 193/300 [08:20<04:04,  2.29s/it]

Similarity Score: 0.9624060150375939


Evaluating:  65%|██████▍   | 194/300 [08:24<04:34,  2.59s/it]

Similarity Score: 0.7177700348432056


Evaluating:  65%|██████▌   | 195/300 [08:26<04:35,  2.63s/it]

Similarity Score: 0.22887323943661972


Evaluating:  65%|██████▌   | 196/300 [08:28<04:06,  2.37s/it]

Similarity Score: 0.9941520467836257


Evaluating:  66%|██████▌   | 197/300 [08:30<04:00,  2.33s/it]

Similarity Score: 0.7550200803212851


Evaluating:  66%|██████▌   | 198/300 [08:32<03:25,  2.02s/it]

Similarity Score: 0.46332046332046334


Evaluating:  66%|██████▋   | 199/300 [08:33<02:59,  1.77s/it]

Similarity Score: 0.8496732026143791


Evaluating:  67%|██████▋   | 200/300 [08:35<03:06,  1.86s/it]

Similarity Score: 0.9217391304347826
Step 200:
Model accuracy on test set 80%: 56.50%
Model accuracy on test set 90%: 42.50%
Model accuracy on test set 95%: 35.00%


Evaluating:  67%|██████▋   | 201/300 [08:37<02:57,  1.80s/it]

Similarity Score: 0.9924812030075187


Evaluating:  67%|██████▋   | 202/300 [08:38<02:40,  1.64s/it]

Similarity Score: 0.8414634146341463


Evaluating:  68%|██████▊   | 203/300 [08:41<03:15,  2.01s/it]

Similarity Score: 0.4939759036144578


Evaluating:  68%|██████▊   | 204/300 [08:45<04:05,  2.56s/it]

Similarity Score: 0.47334200260078024


Evaluating:  68%|██████▊   | 205/300 [08:46<03:25,  2.16s/it]

Similarity Score: 0.9929078014184397


Evaluating:  69%|██████▊   | 206/300 [08:47<03:10,  2.03s/it]

Similarity Score: 0.9936305732484076


Evaluating:  69%|██████▉   | 207/300 [08:50<03:34,  2.31s/it]

Similarity Score: 0.35019455252918286


Evaluating:  69%|██████▉   | 208/300 [08:52<02:58,  1.94s/it]

Similarity Score: 0.8872180451127819


Evaluating:  70%|██████▉   | 209/300 [08:54<03:13,  2.13s/it]

Similarity Score: 0.9912280701754386


Evaluating:  70%|███████   | 210/300 [08:56<02:55,  1.95s/it]

Similarity Score: 0.9928057553956835


Evaluating:  70%|███████   | 211/300 [09:01<04:21,  2.94s/it]

Similarity Score: 0.0


Evaluating:  71%|███████   | 212/300 [09:03<04:09,  2.84s/it]

Similarity Score: 0.9015151515151515


Evaluating:  71%|███████   | 213/300 [09:05<03:38,  2.51s/it]

Similarity Score: 0.8427672955974843


Evaluating:  71%|███████▏  | 214/300 [09:07<03:14,  2.26s/it]

Similarity Score: 0.9943502824858758


Evaluating:  72%|███████▏  | 215/300 [09:08<02:36,  1.84s/it]

Similarity Score: 0.9876543209876543


Evaluating:  72%|███████▏  | 216/300 [09:09<02:18,  1.65s/it]

Similarity Score: 0.9915966386554622


Evaluating:  72%|███████▏  | 217/300 [09:11<02:15,  1.63s/it]

Similarity Score: 0.8791208791208791


Evaluating:  73%|███████▎  | 218/300 [09:12<02:05,  1.54s/it]

Similarity Score: 0.9921259842519685


Evaluating:  73%|███████▎  | 219/300 [09:14<02:08,  1.58s/it]

Similarity Score: 0.5461847389558233


Evaluating:  73%|███████▎  | 220/300 [09:17<02:52,  2.16s/it]

Similarity Score: 0.7741935483870968


Evaluating:  74%|███████▎  | 221/300 [09:20<03:09,  2.40s/it]

Similarity Score: 0.9052631578947369


Evaluating:  74%|███████▍  | 222/300 [09:23<03:22,  2.60s/it]

Similarity Score: 0.6572769953051644


Evaluating:  74%|███████▍  | 223/300 [09:25<02:59,  2.33s/it]

Similarity Score: 0.5074626865671642


Evaluating:  75%|███████▍  | 224/300 [09:26<02:30,  1.97s/it]

Similarity Score: 0.9863013698630136


Evaluating:  75%|███████▌  | 225/300 [09:29<02:42,  2.17s/it]

Similarity Score: 0.9321266968325792


Evaluating:  75%|███████▌  | 226/300 [09:30<02:35,  2.11s/it]

Similarity Score: 0.7359307359307359


Evaluating:  76%|███████▌  | 227/300 [09:32<02:29,  2.04s/it]

Similarity Score: 0.09421841541755889


Evaluating:  76%|███████▌  | 228/300 [09:34<02:17,  1.91s/it]

Similarity Score: 0.9932885906040269


Evaluating:  76%|███████▋  | 229/300 [09:38<03:08,  2.66s/it]

Similarity Score: 0.8997867803837953


Evaluating:  77%|███████▋  | 230/300 [09:42<03:22,  2.89s/it]

Similarity Score: 0.5074626865671642


Evaluating:  77%|███████▋  | 231/300 [09:43<02:40,  2.33s/it]

Similarity Score: 0.75


Evaluating:  77%|███████▋  | 232/300 [09:45<02:36,  2.31s/it]

Similarity Score: 0.8281938325991189


Evaluating:  78%|███████▊  | 233/300 [09:47<02:31,  2.26s/it]

Similarity Score: 0.7009646302250804


Evaluating:  78%|███████▊  | 234/300 [09:50<02:40,  2.43s/it]

Similarity Score: 0.7050847457627119


Evaluating:  78%|███████▊  | 235/300 [09:56<03:37,  3.35s/it]

Similarity Score: 0.0


Evaluating:  79%|███████▊  | 236/300 [09:58<03:11,  2.99s/it]

Similarity Score: 0.5173501577287066


Evaluating:  79%|███████▉  | 237/300 [09:59<02:44,  2.61s/it]

Similarity Score: 0.69


Evaluating:  79%|███████▉  | 238/300 [10:01<02:16,  2.21s/it]

Similarity Score: 0.8904109589041096


Evaluating:  80%|███████▉  | 239/300 [10:02<02:02,  2.01s/it]

Similarity Score: 0.9931972789115646


Evaluating:  80%|████████  | 240/300 [10:05<02:21,  2.35s/it]

Similarity Score: 0.7891566265060241


Evaluating:  80%|████████  | 241/300 [10:08<02:30,  2.55s/it]

Similarity Score: 0.8990228013029316


Evaluating:  81%|████████  | 242/300 [10:10<02:11,  2.27s/it]

Similarity Score: 0.9929078014184397


Evaluating:  81%|████████  | 243/300 [10:12<02:02,  2.14s/it]

Similarity Score: 0.9942196531791907


Evaluating:  81%|████████▏ | 244/300 [10:14<01:55,  2.06s/it]

Similarity Score: 0.9956709956709957


Evaluating:  82%|████████▏ | 245/300 [10:15<01:43,  1.89s/it]

Similarity Score: 0.9929078014184397


Evaluating:  82%|████████▏ | 246/300 [10:17<01:38,  1.83s/it]

Similarity Score: 0.9082969432314411


Evaluating:  82%|████████▏ | 247/300 [10:21<02:18,  2.62s/it]

Similarity Score: 0.5306122448979592


Evaluating:  83%|████████▎ | 248/300 [10:24<02:12,  2.54s/it]

Similarity Score: 0.7563636363636363


Evaluating:  83%|████████▎ | 249/300 [10:26<02:05,  2.47s/it]

Similarity Score: 0.7380952380952381


Evaluating:  83%|████████▎ | 250/300 [10:29<02:08,  2.58s/it]

Similarity Score: 0.7436619718309859
Step 250:
Model accuracy on test set 80%: 56.40%
Model accuracy on test set 90%: 42.00%
Model accuracy on test set 95%: 34.40%


Evaluating:  84%|████████▎ | 251/300 [10:31<01:55,  2.36s/it]

Similarity Score: 0.9940828402366864


Evaluating:  84%|████████▍ | 252/300 [10:32<01:39,  2.07s/it]

Similarity Score: 0.9885057471264368


Evaluating:  84%|████████▍ | 253/300 [10:35<01:47,  2.29s/it]

Similarity Score: 0.9466666666666667


Evaluating:  85%|████████▍ | 254/300 [10:37<01:44,  2.27s/it]

Similarity Score: 0.44680851063829785


Evaluating:  85%|████████▌ | 255/300 [10:38<01:27,  1.96s/it]

Similarity Score: 0.991869918699187


Evaluating:  85%|████████▌ | 256/300 [10:41<01:29,  2.03s/it]

Similarity Score: 0.8818181818181818


Evaluating:  86%|████████▌ | 257/300 [10:42<01:19,  1.86s/it]

Similarity Score: 0.8957055214723927


Evaluating:  86%|████████▌ | 258/300 [10:45<01:31,  2.17s/it]

Similarity Score: 0.9396825396825397


Evaluating:  86%|████████▋ | 259/300 [10:48<01:38,  2.39s/it]

Similarity Score: 0.935361216730038


Evaluating:  87%|████████▋ | 260/300 [10:50<01:29,  2.24s/it]

Similarity Score: 0.6966292134831461


Evaluating:  87%|████████▋ | 261/300 [10:52<01:26,  2.23s/it]

Similarity Score: 0.8905109489051095


Evaluating:  87%|████████▋ | 262/300 [10:56<01:44,  2.74s/it]

Similarity Score: 0.7909836065573771


Evaluating:  88%|████████▊ | 263/300 [10:58<01:31,  2.46s/it]

Similarity Score: 0.9938650306748467


Evaluating:  88%|████████▊ | 264/300 [11:01<01:41,  2.82s/it]

Similarity Score: 0.9523809523809523


Evaluating:  88%|████████▊ | 265/300 [11:04<01:36,  2.74s/it]

Similarity Score: 0.937984496124031


Evaluating:  89%|████████▊ | 266/300 [11:06<01:26,  2.54s/it]

Similarity Score: 0.8955223880597015


Evaluating:  89%|████████▉ | 267/300 [11:08<01:23,  2.54s/it]

Similarity Score: 0.8290909090909091


Evaluating:  89%|████████▉ | 268/300 [11:10<01:12,  2.26s/it]

Similarity Score: 0.7058823529411765


Evaluating:  90%|████████▉ | 269/300 [11:12<01:03,  2.05s/it]

Similarity Score: 0.7731958762886598


Evaluating:  90%|█████████ | 270/300 [11:14<01:04,  2.15s/it]

Similarity Score: 0.6909090909090909


Evaluating:  90%|█████████ | 271/300 [11:17<01:11,  2.47s/it]

Similarity Score: 0.6878306878306878


Evaluating:  91%|█████████ | 272/300 [11:20<01:11,  2.55s/it]

Similarity Score: 0.8230088495575221


Evaluating:  91%|█████████ | 273/300 [11:22<01:01,  2.30s/it]

Similarity Score: 0.9949748743718593


Evaluating:  91%|█████████▏| 274/300 [11:25<01:11,  2.74s/it]

Similarity Score: 0.7563451776649747


Evaluating:  92%|█████████▏| 275/300 [11:28<01:08,  2.73s/it]

Similarity Score: 0.9968051118210862


Evaluating:  92%|█████████▏| 276/300 [11:30<01:01,  2.55s/it]

Similarity Score: 0.9082969432314411


Evaluating:  92%|█████████▏| 277/300 [11:34<01:05,  2.84s/it]

Similarity Score: 0.4759036144578313


Evaluating:  93%|█████████▎| 278/300 [11:39<01:15,  3.41s/it]

Similarity Score: 0.767590618336887


Evaluating:  93%|█████████▎| 279/300 [11:41<01:03,  3.04s/it]

Similarity Score: 0.9032258064516129


Evaluating:  93%|█████████▎| 280/300 [11:44<01:04,  3.20s/it]

Similarity Score: 0.7209302325581395


Evaluating:  94%|█████████▎| 281/300 [11:46<00:50,  2.63s/it]

Similarity Score: 0.900990099009901


Evaluating:  94%|█████████▍| 282/300 [11:47<00:38,  2.15s/it]

Similarity Score: 0.8333333333333334


Evaluating:  94%|█████████▍| 283/300 [11:51<00:49,  2.93s/it]

Similarity Score: 0.8260869565217391


Evaluating:  95%|█████████▍| 284/300 [11:55<00:51,  3.21s/it]

Similarity Score: 0.8907103825136612


Evaluating:  95%|█████████▌| 285/300 [11:57<00:39,  2.63s/it]

Similarity Score: 0.98989898989899


Evaluating:  95%|█████████▌| 286/300 [11:59<00:36,  2.58s/it]

Similarity Score: 0.5700245700245701


Evaluating:  96%|█████████▌| 287/300 [12:01<00:29,  2.29s/it]

Similarity Score: 0.993103448275862


Evaluating:  96%|█████████▌| 288/300 [12:06<00:39,  3.31s/it]

Similarity Score: 0.0


Evaluating:  96%|█████████▋| 289/300 [12:08<00:31,  2.88s/it]

Similarity Score: 0.9950248756218906


Evaluating:  97%|█████████▋| 290/300 [12:12<00:31,  3.15s/it]

Similarity Score: 0.17796610169491525


Evaluating:  97%|█████████▋| 291/300 [12:13<00:22,  2.49s/it]

Similarity Score: 0.9846153846153847


Evaluating:  97%|█████████▋| 292/300 [12:16<00:21,  2.72s/it]

Similarity Score: 0.9726027397260274


Evaluating:  98%|█████████▊| 293/300 [12:19<00:18,  2.71s/it]

Similarity Score: 0.7697841726618705


Evaluating:  98%|█████████▊| 294/300 [12:20<00:14,  2.35s/it]

Similarity Score: 0.9937106918238994


Evaluating:  98%|█████████▊| 295/300 [12:22<00:10,  2.14s/it]

Similarity Score: 0.826530612244898


Evaluating:  99%|█████████▊| 296/300 [12:27<00:11,  2.85s/it]

Similarity Score: 0.0


Evaluating:  99%|█████████▉| 297/300 [12:31<00:09,  3.23s/it]

Similarity Score: 0.9009433962264151


Evaluating:  99%|█████████▉| 298/300 [12:33<00:06,  3.07s/it]

Similarity Score: 0.5681159420289855


Evaluating: 100%|█████████▉| 299/300 [12:36<00:03,  3.05s/it]

Similarity Score: 0.8048780487804879


Evaluating: 100%|██████████| 300/300 [12:38<00:00,  2.53s/it]

Similarity Score: 0.9066666666666666
Step 300:
Model accuracy on test set 80%: 58.00%
Model accuracy on test set 90%: 42.33%
Model accuracy on test set 95%: 33.00%
Final Model accuracy on test set 80%: 58.00%
Final Model accuracy on test set 90%: 42.33%
Final Model accuracy on test set 95%: 33.00%





# Save and Loading

This is just for saving the lora adapters...

In [None]:
model.save_pretrained("8B_instruct_58") # Local saving
tokenizer.save_pretrained("8B_instruct_58")
model.push_to_hub("PlatoisHere/8B_instruct_58", token = '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') # Online saving
tokenizer.push_to_hub("PlatoisHere/8B_instruct_58", token = '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') # Online saving

README.md:   0%|          | 0.00/610 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/2.68G [00:00<?, ?B/s]

Saved model to https://huggingface.co/PlatoisHere/8B_instruct_58


...and loading the adapters

In [None]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "PlatoisHere/8B_58", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/2.68G [00:00<?, ?B/s]

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


This is to merge and load the model:

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")