In [None]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [5]:
!pip install mlflow

Collecting mlflow
  Using cached mlflow-2.17.2-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.2 (from mlflow)
  Using cached mlflow_skinny-2.17.2-py3-none-any.whl.metadata (30 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.2->mlflow)
  Using cached databricks_sdk-0.37.0-py3-none-any.whl.metadata (38 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==2.17.2->mlflow)
  Using cached opentelemetry_sdk-1.28.1-py3-none-any.whl.metadata (1.5 kB)
Using cached mlflow-2.17.2-py3-none-any.whl (26.7 MB)
Using cached mlflow_skinny-2.17.2-py3-none-any.whl (5.7 MB)
Using cached databricks_sdk-0.37.0-py3-none-any.whl (571 kB)
Using cached opentelemetry_sdk-1.28.1-py3-none-any.whl (118 kB)
Installing collected packages: databricks-sdk, opentelemetry-sdk, mlflow-skinny, mlflow
Successfully installed databricks-sdk-0.37.0 mlflow-2.17.2 mlflow-skinny-2.17.2 opentelemetry-sdk-1.28.1


In [6]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("unsloth_Llama_3.1_8B_Regression_Univariate")
mlflow.start_run(run_name="unsloth_Llama_3.1_8B_Regression_Univariate")

MlflowException: API request to http://localhost:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=unsloth_Llama_3.1_8B_Regression_Univariate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000019037BDEB70>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 #2048 # Choose any! We auto support RoPE Scaling internally!
dtype = getattr(torch, "float16") # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [None]:
model_id="unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    # max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


# Add LoRA adapters so we only need to update 1 to 10% of all parameters!


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj","lm_head"],
    lora_alpha = 16,
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = True, # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


In [None]:
model

In [None]:
tokenizer

In [None]:
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True
tokenizer.pad_token = tokenizer.eos_token

In [None]:
tokenizer

# Data Preparation

In [None]:
from datasets import load_dataset


In [None]:
dataset = load_dataset('csv', data_files='/content/Regression_Univariate_train.csv', split='train')
dataset

In [None]:
# Split the dataset into train and valid sets
train_valid_test_split = dataset.train_test_split(test_size=0.2, seed=42)  # 80% train + 20% temp
valid_test_split = train_valid_test_split['test'].train_test_split(test_size=0.5, seed=42)  # 10% each for validation and test


In [None]:
train_dataset = train_valid_test_split['train']   # 80% train
valid_dataset = valid_test_split['train']         # 10% validation
test_dataset = valid_test_split['test']           # 10% test

In [None]:
train_dataset

In [None]:
valid_dataset

In [None]:
EOS_TOKEN = tokenizer.eos_token

train_prompt = """Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.
The best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.

### DESCRIPTION:
{}

### RESPONSE:
{}"""

def formatting_prompts_func(examples):
    inputs       = examples["series_description"]
    outputs      = examples["algorithm"]
    texts = []
    for input, output in zip( inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = train_prompt.format( input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

In [None]:
train_dataset = train_dataset.map(formatting_prompts_func, batched = True)

In [None]:
train_dataset

In [None]:
train_dataset['text'][0]

In [None]:
valid_dataset = valid_dataset.map(formatting_prompts_func, batched = True)
valid_dataset

In [None]:
valid_dataset['text'][0]

In [None]:
test_dataset = test_dataset.map(formatting_prompts_func, batched = True)
test_dataset

In [None]:
test_dataset['text'][0]

# Model Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
training_arguments= TrainingArguments(
        num_train_epochs=5,
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 32//4,
        gradient_checkpointing=True,
        warmup_steps = 5,
        max_steps = -1, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "./Llama_outputs",
        eval_strategy="steps", #epoch
        save_strategy="epoch",
    )

In [None]:
from trl import  DataCollatorForCompletionOnlyLM

In [None]:
instruction_template="DESCRIPTION:"
response_template = "RESPONSE:"

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_arguments,
    data_collator =  DataCollatorForCompletionOnlyLM(instruction_template=instruction_template,
                                                     response_template=response_template,
                                                     tokenizer=tokenizer,mlm=False),

)

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
import time

In [None]:
start= time.time()
trainer_stats = trainer.train()
print((time.time()-start)/60)

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# inference

In [None]:
test_prompt = """Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.
The best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.

### DESCRIPTION:
{}

### RESPONSE:
"""

def formatting_test_prompts_func(examples):
    inputs       = examples["series_description"]
    # outputs      = examples["algorithm"]
    texts = []
    for input, output in zip( inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = train_prompt.format( input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

In [None]:
# test_dataset= load_dataset('csv', data_files='/content/test.csv')
# test_dataset=test_dataset ['train']

In [None]:
test_dataset

In [None]:
test_dataset['text'][0]

In [None]:
test_dataset['algorithm'][0]

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[test_dataset['text'][0]], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

In [None]:
test_dataset['algorithm'][1]

In [None]:
inputs = tokenizer(
[test_dataset['text'][1]], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 5, use_cache = True)
tokenizer.batch_decode(outputs)

In [None]:
test_responses=[]
# get all test data inference result
for test_prompt in test_dataset['text']:
  inputs= tokenizer(
  [test_prompt], return_tensors = "pt").to("cuda")

  outputs = model.generate(**inputs, max_new_tokens = 10, use_cache = True)
  test_responses.append(tokenizer.batch_decode(outputs))


In [None]:
test_responses

In [None]:
# Convert the dataset to a Pandas DataFrame
df = test_dataset.to_pandas()

In [None]:
df['model_responses']= test_responses

In [None]:
# Save the DataFrame as a CSV file
df.to_csv('test_model_result_unsloth.csv', index=False)

In [None]:
df

In [None]:
predictions=[]
for response in test_responses:
  predictions.append( response[0].split('\n\n### RESPONSE:\n')[1].split('</s>')[0].strip())

predictions

In [None]:
len(predictions)

In [None]:
actual_data= df['algorithm']
len(actual_data)

In [None]:
actual_data

In [None]:
accuracy = sum(1 for true, pred in zip(actual_data, predictions) if true == pred) / len(actual_data)
accuracy

# save tuned model
To save the final model as LoRA adapters

In [None]:
# Local saving
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")


In [None]:
secret_hf = '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'
!git config --global credential.helper store
!huggingface-cli login --token $secret_hf --add-to-git-credential

In [None]:
# Online saving on HF
from huggingface_hub import login

new_model_adabtor= "ZiadWael/unsloth-Llama3.1-tuned"
login(token="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")

# Push the model and tokenizer to the Hugging Face hub
model.push_to_hub(new_model_adabtor)
tokenizer.push_to_hub(new_model_adabtor)

In [None]:
# Online saving on HF
model.push_to_hub("unsloth-Llama-tuned", use_auth_token=secret_hf)
tokenizer.push_to_hub("unsloth-Llama-tuned", use_auth_token=secret_hf)

In [None]:
# Save and Merge to 4bit
model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit_forced",token ="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")
model.push_to_hub_merged("model", tokenizer, save_method = "merged_4bit_forced", token = "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")

In [None]:
# Save just LoRA adapters
model.save_pretrained_merged("model", tokenizer, save_method = "lora",token= "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")
model.push_to_hub_merged("model", tokenizer, save_method = "lora", token = "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")

#GGUF / llama.cpp Conversion

Use **save_pretrained_ggu**f for local saving and **push_to_hub_ggu**f for uploading to HF.

Some supported quant methods:

- q8_0 - Fast conversion. High resource use, but generally acceptable.
- q4_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
- q5_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
model.push_to_hub_gguf("unsloth-llama-ft-gguf", tokenizer, quantization_method = "q4_k_m", token="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")

In [None]:
model.push_to_hub_merged("model", tokenizer, save_method = "merged_4bit_forced", token = "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")