In [1]:
%%capture

# Installs Unsloth, Xformers (Flash Attention) and all other packages!

!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
%%capture
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [3]:
%%capture
!pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [4]:
from unsloth import FastLanguageModel

import torch

max_seq_length = None #2048 # Choose any! We auto support RoPE Scaling internally!

dtype =  getattr(torch, "float") # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [5]:
model_id="unsloth/gemma-2-2b-bnb-4bit"

In [6]:
model, tokenizer = FastLanguageModel.from_pretrained(

    model_name = model_id,

    max_seq_length = max_seq_length,

    dtype = dtype,

    load_in_4bit = load_in_4bit,

)


==((====))==  Unsloth 2024.11.7: Fast Gemma2 patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [7]:
model = FastLanguageModel.get_peft_model(

    model,

    r = 64, #16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128

    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],

    lora_alpha = 16,

    lora_dropout = 0.1, # Supports any, but = 0 is optimized

    bias = "none",    # Supports any, but = "none" is optimized

    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!

    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context

    random_state = 3407,

    use_rslora = False,  # We support rank stabilized LoRA

    loftq_config = None, # And LoftQ

)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.7 patched 26 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [8]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
         

In [9]:
tokenizer

GemmaTokenizerFast(name_or_path='unsloth/gemma-2-2b-bnb-4bit', vocab_size=256000, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_word=False, normal

In [10]:
tokenizer.padding_side = 'right'

tokenizer.add_eos_token = True

tokenizer.pad_token = tokenizer.eos_token

In [11]:
tokenizer

GemmaTokenizerFast(name_or_path='unsloth/gemma-2-2b-bnb-4bit', vocab_size=256000, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<eos>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_word=False, normal

In [12]:
from datasets import load_dataset

# Load the original dataset
dataset = load_dataset('csv', data_files='/kaggle/input/mutlivariate-data/Regression_MultiVariate_train.csv', split='train')

# Step 1: Initial split (70% training, 30% temporary)
train_temp_split = dataset.train_test_split(test_size=0.3)
train_dataset = train_temp_split['train']
temp_dataset = train_temp_split['test']

# Step 2: Split the temporary set into 15% validation and 15% test
valid_test_split = temp_dataset.train_test_split(test_size=0.5)
valid_dataset = valid_test_split['train']
test_dataset = valid_test_split['test']

# Confirm the sizes of each split
print(f"Training Set Size: {len(train_dataset)}")
print(f"Validation Set Size: {len(valid_dataset)}")
print(f"Test Set Size: {len(test_dataset)}")


Generating train split: 0 examples [00:00, ? examples/s]

Training Set Size: 510
Validation Set Size: 109
Test Set Size: 110


In [13]:
train_dataset

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters'],
    num_rows: 510
})

In [14]:
valid_dataset

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters'],
    num_rows: 109
})

In [15]:
test_dataset

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters'],
    num_rows: 110
})

In [16]:
EOS_TOKEN = tokenizer.eos_token

train_prompt = """Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.
The best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.

### DESCRIPTION:
{}

### RESPONSE:
{}"""

def formatting_prompts_func(examples):
    inputs       = examples["series_description"]
    outputs      = examples["algorithm"]
    texts = []
    for input, output in zip( inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = train_prompt.format( input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }



In [17]:
train_dataset = train_dataset.map(formatting_prompts_func, batched = True)
train_dataset

Map:   0%|          | 0/510 [00:00<?, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 510
})

In [18]:
train_dataset['text'][0]

'Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\nThe best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.\n\n### DESCRIPTION:\nA multivariate regression time-series dataset consists of 7110 instances and 13 features. The percentage of numerical features to categorical features in the dataset is 0.71  The dataset has a sampling rate of 60.0 minutes. The percentage of missing values in the target column is 0.0%. he missing values percentages for numerical features range from 0% to 0% with mean 0.0%, and standard deviation 0.0%.Similarly, the missing values percentages for categorical features range from 0% to 0% with mean 0.0%, and standard deviation 0.0%. The target series has a minimum 

In [19]:
valid_dataset = valid_dataset.map(formatting_prompts_func, batched = True)
valid_dataset

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 109
})

In [20]:
valid_dataset['text'][0]

'Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\nThe best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.\n\n### DESCRIPTION:\nA multivariate regression time-series dataset consists of 7012 instances and 12 features. The percentage of numerical features to categorical features in the dataset is 4.5  The dataset has a sampling rate of 60.0 minutes. The percentage of missing values in the target column is 0.66%. he missing values percentages for numerical features range from 100% to 9500% with mean 3644.44%, and standard deviation 3737.68%.Similarly, the missing values percentages for categorical features range from 200% to 11100% with mean 5650.0%, and standard deviation 7707.46%. The ta

In [21]:
test_dataset = test_dataset.map(formatting_prompts_func, batched = True)
test_dataset

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 110
})

In [22]:
test_dataset['text'][0]

'Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\nThe best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.\n\n### DESCRIPTION:\nA multivariate regression time-series dataset consists of 6671 instances and 17 features. The percentage of numerical features to categorical features in the dataset is 15.0  The dataset has a sampling rate of 60.0 minutes. The percentage of missing values in the target column is 0.97%. he missing values percentages for numerical features range from 6500% to 226000% with mean 21153.33%, and standard deviation 56669.18%. The target series has a minimum value 16.4, maximum value 37.2, median 24.8, mean 25.9710463474649, and average standard deviation of 0.18072314

In [23]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [24]:
training_arguments= TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 10, # Set this for 1 full training run.
        max_steps = -1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc

    )

In [25]:
from trl import  DataCollatorForCompletionOnlyLM

In [26]:
instruction_template="DESCRIPTION:"

response_template = "RESPONSE:"



trainer = SFTTrainer(

    model = model,

    tokenizer = tokenizer,

    train_dataset=train_dataset,

    eval_dataset=valid_dataset,

    dataset_text_field = "text",

    max_seq_length = max_seq_length,

    dataset_num_proc = 2,

    packing = False, # Can make training 5x faster for short sequences.

    args = training_arguments,

    data_collator =  DataCollatorForCompletionOnlyLM(instruction_template=instruction_template,

                                                     response_template=response_template,

                                                     tokenizer=tokenizer,mlm=False),



)

Map (num_proc=2):   0%|          | 0/510 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/109 [00:00<?, ? examples/s]

In [27]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
3.9 GB of memory reserved.


In [28]:
import time
start= time.time()
trainer_stats = trainer.train()
print((time.time()-start)/60)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 510 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 630
 "-____-"     Number of trainable parameters = 83,066,880


Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


122.64176133871078


In [29]:
#@title Show final memory and time stats

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)

used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

used_percentage = round(used_memory         /max_memory*100, 3)

lora_percentage = round(used_memory_for_lora/max_memory*100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")

print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")

print(f"Peak reserved memory = {used_memory} GB.")

print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")

print(f"Peak reserved memory % of max memory = {used_percentage} %.")

print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

7355.7775 seconds used for training.
122.6 minutes used for training.
Peak reserved memory = 9.328 GB.
Peak reserved memory for training = 5.428 GB.
Peak reserved memory % of max memory = 63.279 %.
Peak reserved memory for training % of max memory = 36.822 %.


In [30]:
test_prompt = """Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.

The best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.



### DESCRIPTION:

{}



### RESPONSE:

"""

def formatting_test_prompts_func(examples):
    inputs = examples["series_description"]
    texts = []

    for input in inputs:
        text = test_prompt.format(input) + EOS_TOKEN
        texts.append(text)

    return { "text": texts }


In [31]:
test_dataset = test_dataset.map(formatting_test_prompts_func, batched = True)
test_dataset

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 110
})

In [32]:
# alpaca_prompt = Copied from above

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

inputs = tokenizer(

[test_dataset['text'][0]], return_tensors = "pt").to("cuda")



outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

tokenizer.batch_decode(outputs)

['<bos>Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\n\nThe best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.\n\n\n\n### DESCRIPTION:\n\nA multivariate regression time-series dataset consists of 6671 instances and 17 features. The percentage of numerical features to categorical features in the dataset is 15.0  The dataset has a sampling rate of 60.0 minutes. The percentage of missing values in the target column is 0.97%. he missing values percentages for numerical features range from 6500% to 226000% with mean 21153.33%, and standard deviation 56669.18%. The target series has a minimum value 16.4, maximum value 37.2, median 24.8, mean 25.9710463474649, and average standard deviation of 0.180723143

In [33]:
test_dataset['algorithm'][1]

'XGBoostRegressor'

In [34]:
inputs = tokenizer(

[test_dataset['text'][1]], return_tensors = "pt").to("cuda")



outputs = model.generate(**inputs, max_new_tokens = 5, use_cache = True)

tokenizer.batch_decode(outputs)

['<bos>Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\n\nThe best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.\n\n\n\n### DESCRIPTION:\n\nA multivariate regression time-series dataset consists of 1456 instances and 6 features. All features in the dataset are numerical.  The dataset has a sampling rate of 1440.0 minutes. The percentage of missing values in the target column is 0.0%. he missing values percentages for numerical features range from 0% to 0% with mean 0.0%, and standard deviation 0.0%. The target series has a minimum value 19.809999465942383, maximum value 41.43999862670898, median 33.05500030517578, mean 32.50136136283189, and average standard deviation of 0.10642739795940268 for the 1

In [35]:
test_responses=[]

# get all test data inference result

for test_prompt in test_dataset['text']:

  inputs= tokenizer(

  [test_prompt], return_tensors = "pt").to("cuda")



  outputs = model.generate(**inputs, max_new_tokens = 10, use_cache = True)

  test_responses.append(tokenizer.batch_decode(outputs))


In [36]:
test_responses

[['<bos>Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\n\nThe best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.\n\n\n\n### DESCRIPTION:\n\nA multivariate regression time-series dataset consists of 6671 instances and 17 features. The percentage of numerical features to categorical features in the dataset is 15.0  The dataset has a sampling rate of 60.0 minutes. The percentage of missing values in the target column is 0.97%. he missing values percentages for numerical features range from 6500% to 226000% with mean 21153.33%, and standard deviation 56669.18%. The target series has a minimum value 16.4, maximum value 37.2, median 24.8, mean 25.9710463474649, and average standard deviation of 0.18072314

In [37]:
# Convert the dataset to a Pandas DataFrame

df = test_dataset.to_pandas()


In [38]:
df['model_responses']= test_responses

In [39]:
# Save the DataFrame as a CSV file

df.to_csv('test_model_result_unsloth.csv', index=False)

In [40]:
df

Unnamed: 0,dataset_name,series_description,algorithm,hyperparameters,text,model_responses
0,1031-11-1-1-5.csv,A multivariate regression time-series dataset ...,RandomForestRegressor,"{'max_depth': None, 'min_samples_split': 2, 'n...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
1,1030-207.csv,A multivariate regression time-series dataset ...,XGBoostRegressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
2,1031-18-2-1-1.csv,A multivariate regression time-series dataset ...,ExtraTreesRegressor,"{'bootstrap': True, 'criterion': 'friedman_mse...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
3,1016-19-3-1.csv,A multivariate regression time-series dataset ...,XGBoostRegressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
4,1031-50-2-1-2.csv,A multivariate regression time-series dataset ...,RandomForestRegressor,"{'max_depth': None, 'min_samples_split': 2, 'n...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
...,...,...,...,...,...,...
105,1016-4-3-3.csv,A multivariate regression time-series dataset ...,LassoRegressor,"{'alpha': 1.3129127258170419, 'random_state': ...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
106,1031-16-2-1-6.csv,A multivariate regression time-series dataset ...,RandomForestRegressor,"{'max_depth': None, 'min_samples_split': 2, 'n...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
107,1020-50-1.csv,A multivariate regression time-series dataset ...,AdaboostRegressor,"{'learning_rate': 0.01, 'n_estimators': 200, '...",Below is a description for a time series data....,[<bos>Below is a description for a time series...
108,1016-5-3-1.csv,A multivariate regression time-series dataset ...,XGBoostRegressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",Below is a description for a time series data....,[<bos>Below is a description for a time series...


In [51]:
from difflib import get_close_matches
from sklearn.metrics import precision_score, recall_score, f1_score

# Updated valid algorithms for both Classifiers and Regressors
valid_algorithms = [
    'GaussianProcessRegressor',
    'ElasticNetRegressor',
    'LassoRegressor',
    'AdaboostRegressor',
    'XGBoostRegressor',
    'RandomForestRegressor',
    'SVR',
    'LightgbmRegressor'
]

predictions = []

for response in test_responses:
    try:
        # Check if the response contains '### RESPONSE:'
        if '### RESPONSE:' in response[0]:
            response_text = response[0].split('### RESPONSE:')[1].strip()
            response_text = response_text.replace('</s>', '').strip()

            if not response_text:  # If the response text is empty after cleaning
                print("Empty response_text detected.")
                predicted_algo = ""  # Assign empty string or handle appropriately
            else:
                # Get the first word as the predicted algorithm
                predicted_algo = response_text.split()[0]

                # Use get_close_matches to find the closest valid algorithm
                #predicted_algo = get_close_matches(predicted_algo, valid_algorithms, n=1, cutoff=0.3)

                # Debugging: Check if a match was found
                if predicted_algo:
                    # print("Closest match found:", predicted_algo[0])
                    predicted_algo = predicted_algo[0]
                else:
                    print("No match found, using empty string")
                    predicted_algo = ""
        else:
            predicted_algo = ""  # No response to process

        predictions.append(predicted_algo)
    except Exception as e:
        print(f"Error parsing response: {response}, Error: {e}")
        predictions.append("")

# Ensure that actual_data matches the format of predictions
actual_data = df['algorithm'].tolist()

# Compute evaluation metrics
accuracy = sum(1 for true, pred in zip(actual_data, predictions) if true == pred) / len(actual_data)
recall = recall_score(actual_data, predictions, average='weighted', zero_division=0)
f1 = f1_score(actual_data, predictions, average='weighted')

print("\nAccuracy:", accuracy)
print("Recall:", recall)
print("F1 Score:", f1)

# Display first 5 for reference
print("\nActual Data (First 5):", actual_data[:5])
print("Predictions (First 5):", predictions[:5])



Accuracy: 0.0
Recall: 0.0
F1 Score: 0.0

Actual Data (First 5): ['RandomForestRegressor', 'XGBoostRegressor', 'ExtraTreesRegressor', 'XGBoostRegressor', 'RandomForestRegressor']
Predictions (First 5): ['<', '<', '<', '<', '<']


In [52]:
accuracy = sum(1 for true, pred in zip(actual_data, predictions) if true == pred) / len(actual_data)
accuracy

0.0

In [53]:
# Local saving
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")


('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [54]:
!zip -r lora_model.zip lora_model


  pid, fd = os.forkpty()


  adding: lora_model/ (stored 0%)
  adding: lora_model/tokenizer.json (deflated 84%)
  adding: lora_model/tokenizer.model (deflated 51%)
  adding: lora_model/tokenizer_config.json (deflated 96%)
  adding: lora_model/README.md (deflated 66%)
  adding: lora_model/adapter_config.json (deflated 54%)
  adding: lora_model/adapter_model.safetensors (deflated 58%)
  adding: lora_model/special_tokens_map.json (deflated 70%)


Tokenizer saved as tokenizer.zip


In [44]:
%%capture
!pip install transformers huggingface_hub



In [45]:

from huggingface_hub import login

login(token="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")

# Online saving on HF

new_model_adabtor= "unsloth-Gemma2-2b-tuned_model"

model.push_to_hub('model.zip')

tokenizer.push_to_hub('tokenizerx.zip')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


README.md:   0%|          | 0.00/577 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Saved model to https://huggingface.co/model.zip


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]