In [1]:
%%capture
# Environment setup; %%capture suppresses the output of this cell.
# 1) Install Unsloth straight from its GitHub repository with the "kaggle-new" extra.
#    NOTE(review): this install is not pinned to a tag or commit, so a fresh run
#    may pick up a different Unsloth version than the one shown in the outputs.
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
# 2) Companion libraries, installed with --no-deps so pip leaves their
#    dependencies alone; xformers and trl are capped below 0.0.27 and 0.9.0.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# 3) Pin torch / torchvision / torchaudio to the 2.3.0 (CUDA 12.1) builds from
#    the PyTorch wheel index. This runs last, so it decides the final torch version.
!pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
from unsloth import FastLanguageModel ,is_bfloat16_supported
from datasets import load_dataset
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
import time
from trl import  DataCollatorForCompletionOnlyLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
# Loader settings, kept as module-level names so later cells can reuse them.
# Sequence-length limit handed to the Unsloth loader.
max_seq_length = 2048 
# NOTE(review): None presumably lets Unsloth pick the dtype for the GPU itself —
# confirm against the Unsloth documentation.
dtype = None 
# Ask the loader for 4-bit weights.
load_in_4bit = True 
# Load the fine-tuned checkpoint named below; the call returns the model
# together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ZiadWael/unsloth-Llama3.1-tuned",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/778M [00:00<?, ?B/s]

Unsloth 2024.11.7 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [4]:
# Inference prompt template. The single `{}` placeholder receives the series
# description, and the text ends right after "### RESPONSE:" so that the model's
# continuation is the answer.
test_prompt = """Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.
The best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.

### DESCRIPTION:
{}

### RESPONSE:"""


def formatting_test_prompts_func(examples, prompt_template=test_prompt):
    """Turn a batch of series descriptions into inference prompts.

    Args:
        examples: A batch mapping with a "series_description" entry holding
            an iterable of description strings.
        prompt_template: Format string with one `{}` placeholder. It defaults
            to the template defined above and is bound when this function is
            defined, so rebinding the global name `test_prompt` later in the
            notebook cannot change the prompts produced here.

    Returns:
        A dict with one key, "text", holding one formatted prompt per
        description, in input order.
    """
    # No EOS token is appended: these prompts are meant to be continued by the
    # model, not used as complete training examples.
    return {
        "text": [
            prompt_template.format(description)
            for description in examples["series_description"]
        ]
    }

In [7]:
# TODO: point `data_files` at the held-out test CSV before reporting results.
# The training CSV is loaded here only to check that the notebook runs end to end.
dataset = load_dataset('csv', data_files="/kaggle/input/regression-univariate-train/Regression_Univariate_train.csv")


In [8]:
# Take the "train" split and add a "text" column holding the formatted prompt
# for every row, then display the dataset summary.
test_dataset = dataset["train"].map(formatting_test_prompts_func, batched=True)
test_dataset

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 828
})

In [9]:
# Switch the model to Unsloth's inference mode, then generate for the first
# prompt as a quick check of the whole pipeline.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Index the row first: test_dataset[0]["text"] reads a single row, whereas
# test_dataset["text"][0] loads the entire column just to take one item.
inputs = tokenizer([test_dataset[0]["text"]], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\nThe best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.\n\n### DESCRIPTION:\nA univariate time-series dataset  consists of 648 samples with a missing values percentage of 0.0% imputed using FBProphet model and 0.0% detected outliers. The target series has a sampling rate of 480 minutes, minimum value of -1.281900892748634, maximum value of 1.2575716095378922, median value of -0.840998218756287, mean value of -0.3999772916658776, and average standard deviation of 0.22521469337610456 for the 10 percentiles. The series is detected as non-stationary using dickey fuller testand it turns into a stationary series using first o

In [10]:
# Generate for the second prompt, this time with the short token budget that
# the full evaluation loop uses.
# Index the row first: test_dataset[1]["text"] reads a single row, whereas
# test_dataset["text"][1] loads the entire column just to take one item.
inputs = tokenizer([test_dataset[1]["text"]], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=5, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\nThe best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.\n\n### DESCRIPTION:\nA univariate time-series dataset  consists of 5760 samples with a missing values percentage of 0.0% imputed using FBProphet model and 4.826388888888889% detected outliers. The target series has a sampling rate of 30 minutes, minimum value of -1.3327249393311884, maximum value of 1.3290006441545716, median value of -0.837102440111593, mean value of -0.5832298857283066, and average standard deviation of 0.19129701117888212 for the 10 percentiles. The series is detected as stationary using dickey fuller test.The series has 9 significant lags obser

In [11]:
# Generate a response for every prompt and keep the raw decoded output.
#
# The loop variable is deliberately not called `test_prompt`: that name holds
# the prompt template defined earlier, and rebinding it in a loop would silently
# replace the template with the last formatted prompt.
test_responses = []
for prompt_text in test_dataset['text']:
    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    # NOTE(review): max_new_tokens=5 assumes every algorithm name fits in five
    # tokens — confirm against the tokenizer, otherwise long names are truncated.
    outputs = model.generate(**inputs, max_new_tokens=5, use_cache=True)

    # batch_decode returns a list of strings; it is stored as-is, so every
    # entry of test_responses is itself a list and later cells read element [0].
    test_responses.append(tokenizer.batch_decode(outputs))


In [12]:
# Copy the dataset into a pandas DataFrame, attach the raw generations as a
# 'model_responses' column, and write everything to CSV without the index.
df = test_dataset.to_pandas().assign(model_responses=test_responses)
df.to_csv('test_model_result_unsloth.csv', index=False)

In [13]:
# Text that separates the prompt from the generated answer.
_RESPONSE_MARKER = '\n\n### RESPONSE:'
# End-of-sequence strings after which the answer is cut off. '<|end_of_text|>'
# is included alongside '</s>' and '<|eot_id|>' so that a generation ending in
# that token does not leak it into the predicted name.
_STOP_MARKERS = ('</s>', '<|eot_id|>', '<|end_of_text|>')
# Placeholder stored when no answer can be located in a response.
_INVALID_PREDICTION = "Invalid response"


def _extract_prediction(response):
    """Pull the predicted algorithm name out of one decoded response.

    Args:
        response: One entry of `test_responses`, i.e. a list whose first
            element is the decoded text (prompt followed by the generation).

    Returns:
        The text found after the response marker, cut at the first stop marker
        and stripped of surrounding whitespace, or "Invalid response" when the
        list is empty or the marker is missing.
    """
    try:
        answer = response[0].split(_RESPONSE_MARKER)[1]
    except IndexError:
        # Either the response list is empty or the marker never appears.
        return _INVALID_PREDICTION
    for marker in _STOP_MARKERS:
        answer = answer.split(marker)[0]
    return answer.strip()


predictions = [_extract_prediction(response) for response in test_responses]

predictions[:5]

['SVR',
 'GaussianProcessRegressor',
 'ElasticNetRegressor',
 'ElasticNetRegressor',
 'GaussianProcessRegressor']

In [14]:
# Ground-truth labels come from the 'algorithm' column; show how many there are.
actual_data = df['algorithm']
actual_data.shape[0]

828

In [15]:
# Preview the first ground-truth labels, to compare by eye with predictions[:5].
actual_data.head()

0         XGBoostRegressor
1      ExtraTreesRegressor
2    RandomForestRegressor
3           LassoRegressor
4    RandomForestRegressor
Name: algorithm, dtype: object

In [18]:
# Exact-match accuracy, ignoring letter case.
#
# zip() stops at the shorter of its inputs, so a length mismatch would silently
# score only part of the data while still dividing by the full label count.
# Fail loudly instead.
assert len(predictions) == len(actual_data), (
    f"{len(predictions)} predictions for {len(actual_data)} labels"
)
n_correct = sum(
    true.lower() == pred.lower() for true, pred in zip(actual_data, predictions)
)
accuracy = n_correct / len(actual_data)
accuracy

0.21835748792270532