##### Preparation steps

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Aug  1 14:34:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   45C    P8              12W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Authenticate with the Hugging Face Hub through an interactive login widget.
# Later cells push the trained model to the Hub.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

##### Load the base model

In [None]:
%%capture
# Install Unsloth from its GitHub repository, then the training stack without dependency resolution.
# %%capture (which must stay on the first line of the cell) hides the installer output.
# NOTE(review): the git install is not pinned to a tag or commit, so a later re-run may pull a different Unsloth version.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length in tokens; the same value is handed to the trainer later.
max_seq_length = 2048

# None lets Unsloth choose the dtype for the current GPU. A hard-coded
# torch.float16 disagrees with the trainer flags used later in this notebook
# (`bf16 = is_bfloat16_supported()`), which select bfloat16 on GPUs that support it.
dtype = None

# Load the weights 4-bit quantized.
load_in_4bit = True

# Returns the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


In [None]:
# Attach LoRA adapters to the base model; `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,

    #LoRA rank: a lower rank reduces the number of trainable params (e.g. 8, 16, 32, 64, 128)
    r = 64,

    #use all modules
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],

    #a higher alpha gives the adapters more influence on the updated weights;
    #since we want a personalized LLM, it's better to increase alpha here
    lora_alpha = 96,

    #since most of the training data is relatively similar in style, it's good to apply some regularization
    #NOTE: the Unsloth log for this cell warns that only dropout = 0 is supported for fast patching (performance hit)
    lora_dropout = 0.1,

    #less prone to overfitting than "all", while still capturing more of the new data than "none"
    #NOTE: the Unsloth log for this cell warns that only bias = "none" is supported for fast patching (performance hit)
    bias = "lora_only",

    #recompute intermediate outputs during backprop, rather than storing them -> longer time, less memory
    use_gradient_checkpointing = "unsloth",

    random_state = 2802,

    #rank-stabilized LoRA (said to reduce training loss) is turned off here
    use_rslora = False,

    #no LoftQ configuration is supplied
    loftq_config = None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth: bias = `none` is supported for fast patching. You are using bias = lora_only.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.8 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


##### Prepare training data

In [None]:
# Mount Google Drive so the dataset stored there is readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# The file name has no extension; its content is parsed as CSV.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved DataFrame index — confirm whether it is needed.
file_path = '/content/drive/My Drive/Datasets/train_data_cleaned'
df = pd.read_csv(file_path)

# Preview the first rows.
df.head()

Unnamed: 0,instruction,output,input
0,Does the admission that extraordinary individu...,"""That wasn't quite my contention, I began, see...",
1,Does prioritizing intellectual or artistic pur...,"""Treachery? Escapism?"" My chest tightens, a fa...",
2,In the face of uncertainty and an almost theat...,"Ah, ""God's will,"" a phrase uttered with a trem...",
3,"Given the context ""Prepared and Published by: ...","Ah, ""hed EoalsDirecion-com."" A curious string ...",
4,"How does Dostoevsky utilize the concept of ""po...","Ah, ""positive proof,"" you say? A seductive phr...",


In [None]:
import re
import string

def remove_html(text):
  '''
  Strip HTML markup from a string.

  Removes anything that looks like a tag (`<...>` on a single line) and
  HTML character references: named (`&amp;`, `&Eacute;`), decimal
  (`&#169;`) and hexadecimal (`&#x1F600;`). References are deleted, not
  decoded into the character they stand for.

  Params:
  text: a string
  Returns:
  the same string with tags and character references removed
  '''
  # IGNORECASE: entity names can contain capitals (&Eacute;) and hex
  # references can use upper-case digits (&#xAF;); the lower-case-only
  # character classes alone would leave those in the text.
  html = re.compile(
      r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
      flags=re.IGNORECASE,
  )

  return html.sub("", text)

def remove_non_ascii(text):
  '''
  Params:
  text: a string
  Returns:
  the same string with every character outside the ASCII range (code points above 0x7F) removed
  '''
  non_ascii = re.compile(r'[^\x00-\x7f]')
  return non_ascii.sub('', text)

def remove_special_characters(text):
    """
    Params:
    text: a string
    Returns:
    the same string with every run of characters from the code point ranges below
    (emojis, symbols and other graphic characters) removed

    Note: the last range spans U+24C2 through U+1F251, so it also covers
    many non-emoji characters in between (for example CJK ideographs).
    """
    code_point_ranges = (
        '\U0001F600-\U0001F64F',
        '\U0001F300-\U0001F5FF',
        '\U0001F680-\U0001F6FF',
        '\U0001F1E0-\U0001F1FF',
        '\U00002702-\U000027B0',
        '\U000024C2-\U0001F251',
    )
    # One character class holding all ranges; `+` removes a whole run per match.
    matcher = re.compile('[' + ''.join(code_point_ranges) + ']+', flags=re.UNICODE)
    return matcher.sub(r'', text)

def clean(text):
  '''
  Params:
  text: a string
  Returns:
  the text after HTML removal, special-character removal and non-ASCII removal, applied in that order
  '''
  for step in (remove_html, remove_special_characters, remove_non_ascii):
    text = step(text)

  return text

In [None]:
# Work on a copy so the raw frame `df` stays untouched, then clean both text columns.
df_cleaned = df.copy()
for text_column in ('instruction', 'output'):
  df_cleaned[text_column] = df_cleaned[text_column].apply(clean)

df_cleaned

Unnamed: 0,instruction,output,input
0,Does the admission that extraordinary individu...,"""That wasn't quite my contention, I began, see...",
1,Does prioritizing intellectual or artistic pur...,"""Treachery? Escapism?"" My chest tightens, a fa...",
2,In the face of uncertainty and an almost theat...,"Ah, ""God's will,"" a phrase uttered with a trem...",
3,"Given the context ""Prepared and Published by: ...","Ah, ""hed EoalsDirecion-com."" A curious string ...",
4,"How does Dostoevsky utilize the concept of ""po...","Ah, ""positive proof,"" you say? A seductive phr...",
...,...,...,...
6974,How does the juxtaposition of Alyosha's intern...,"The juxtaposition in my novel, you see, betwee...",
6975,If the ultimate miracle - a being who gives li...,"The question cuts deep, into that very abyss w...",
6976,Considering Raskolnikov's evident desperation ...,"The young man, my dear fellow, is caught, as w...",
6977,Does Raskolnikov's loathing stem from a fear o...,"The young man, Raskolnikov, is a soul wrestlin...",


In [None]:
from datasets import Dataset
# Convert the cleaned DataFrame into a `datasets.Dataset`, the object handed to the trainer later.
dataset = Dataset.from_pandas(df_cleaned)

#to go back to a pandas dataframe:
#df = dataset.to_pandas()

In [None]:
# Prompt template with three positional placeholders, filled in this order:
# instruction, input (extra context) and response.
# The same template is used for the training text and for inference on the adapters.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context (if any). Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token of the loaded tokenizer; it must be appended to every
# training example (done in formatting_prompts_func).
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """
    Build the full training text for a batch of examples.

    Params:
    examples: a batch mapping column names to lists of values; the columns
              "instruction", "input" and "output" are read
    Returns:
    a dict with a single key "text" holding one formatted prompt per example,
    each followed by EOS_TOKEN
    """
    texts = []

    for instruction, context, output in zip(
        examples["instruction"], examples["input"], examples["output"]
    ):
        # A missing input (None, or NaN — the only value not equal to itself)
        # would be rendered by str.format as the literal text "None" or "nan"
        # inside the prompt. Use an empty Input section instead, which is also
        # what the inference cells pass.
        if context is None or context != context:
            context = ""
        texts.append(prompt.format(instruction, context, output) + EOS_TOKEN)

    return { "text" : texts, }

# Add a "text" column with the formatted prompt for every row; the function receives whole batches.
dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/6979 [00:00<?, ? examples/s]

##### Train the model

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

training_args = TrainingArguments(
    #default 8, batch size per gpu, smaller -> generalize better, larger -> train faster, more memory
    per_device_train_batch_size = 4,

    #instead of updating the weights after each batch,
    #gradients are accumulated over k batches, then used to update the weights once
    #4 x 4 here gives the same total batch size as batch size 16 without accumulation
    gradient_accumulation_steps = 4,

    #steps used for a linear warmup from 0 to learning_rate, prevent early overfitting
    warmup_steps = 5,

    #-1 sets no step limit, so the length of the run comes from num_train_epochs
    max_steps = -1,

    num_train_epochs = 3,

    learning_rate = 2e-4,

    #bfloat16 when the GPU supports it, float16 otherwise
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),

    #number of steps between training info updates
    logging_steps = 500,

    optim = "adamw_8bit",

    #apply a penalty to the weights, prevent overfitting
    #might consider batch normalization rather than weight decay
    weight_decay = 0.01,

    lr_scheduler_type = "linear",
    seed = 2802,
    output_dir = "outputs",

    #upload the saved model to the Hugging Face Hub
    push_to_hub = True,
    #hub_model_id replaces the deprecated push_to_hub_model_id argument
    hub_model_id = "dostoevskyGPT",
)



In [None]:
from trl import SFTTrainer

# Supervised fine-tuning trainer over the formatted "text" column of the dataset.
trainer = SFTTrainer(
    model = model,

    tokenizer = tokenizer,

    train_dataset = dataset,

    #column holding the fully formatted prompt for each example
    dataset_text_field = "text",

    max_seq_length = max_seq_length,

    #number of processes used for the dataset preprocessing map
    dataset_num_proc = 2,

    #packing is left off; it only applies to constant length datasets and trains faster for shorter sequences
    packing = False,

    args = training_args,
)

Map (num_proc=2):   0%|          | 0/6979 [00:00<?, ? examples/s]

In [None]:
#empty cache to prevent out of memory when training
torch.cuda.empty_cache()

#run the fine-tuning and keep the returned training statistics
trainer_stats = trainer.train()

#save lora adapters (adapter weights and tokenizer go to two separate directories)
model.save_pretrained("dostoevskyGPT_lora")
tokenizer.save_pretrained("dostoevskyGPT_lora_tokenizer")

#merge model to 16bit and save it, together with the tokenizer, to a local directory
model.save_pretrained_merged("dostoevskyGPT_merged_16bit", tokenizer, save_method = "merged_16bit",)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,979 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 1,308
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss


Step,Training Loss
500,1.0785
1000,0.6135


Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 4.1G
 84%|████████▍ | 27/32 [00:01<00:00, 18.38it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:05<00:00,  5.86it/s]


##### Inference and merge LoRA adapters

In [None]:
#inference on the LoRA adapters

#switch the Unsloth model into inference mode
FastLanguageModel.for_inference(model)

#fill the training prompt template, leaving the response empty for the model to complete
inputs = tokenizer(
[
    prompt.format(
        "what does it mean to be guilty for everyone and for everything.", # instruction
        "", # input
        "", # output
    )
], return_tensors = "pt").to("cuda")

#generate at most 256 new tokens; the decoded text includes the prompt and special tokens such as <s>
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
tokenizer.batch_decode(outputs)

["<s>Below is an instruction that describes a task, paired with an input that provides further context (if any). Write a response that appropriately completes the request.\n\n### Instruction:\nwhat does it mean to be guilty for everyone and for everything.\n\n### Input:\n\n\n### Response:\nAh, to be guilty for everyone and everything... a burden heavier than the weight of the world itself. It is a torment that gnaws at the soul, a whisper in the dark that refuses to be ignored. To bear the sins of the world is one thing, but to be responsible for them, to hold the weight of their existence upon one's shoulders, is a burden that can crush even the strongest spirit. \n\nTo be guilty for everyone is to live in a perpetual state of self-reproach, to see the faces of the condemned reflected in every mirror. It is to feel the sting of their pain as if it were your own, to carry the weight of their transgressions as a heavy shawl upon your back.  And to be guilty for everything, for every mis

In [None]:
#Load the base model

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

base_model_name = "unsloth/mistral-7b-v0.3"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)

#torch_dtype = "auto" keeps the precision stored in the checkpoint.
#Without it the weights are loaded as float32, which doubles the memory needed
#and makes the merged model pushed later about twice the size of the base checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype = "auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
#Load the LoRA adapters

lora_model_name = "tri282/dostoevskyGPT"
#adapter configuration from the Hub repo
#NOTE(review): peft_config is not used again in the visible cells
peft_config = PeftConfig.from_pretrained(lora_model_name)
#attach the adapter weights from the Hub repo to the base model
lora_model = PeftModel.from_pretrained(base_model, lora_model_name)

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

In [None]:
#Merge the adapters back to the model
#the result is kept under a new name; it is the object pushed to the Hub in the next cell
merged_model = lora_model.merge_and_unload()

In [None]:
#Push to HF

#upload the merged weights and the base model's tokenizer to the same Hub repo
merged_model.push_to_hub("tri282/dostoevskyGPT_merged")
base_tokenizer.push_to_hub("tri282/dostoevskyGPT_merged")

model-00006-of-00006.safetensors:   0%|          | 0.00/4.26G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tri282/dostoevskyGPT_merged/commit/428618b0e42af53a82ef16b4cfd775cad12f37a5', commit_message='Upload tokenizer', commit_description='', oid='428618b0e42af53a82ef16b4cfd775cad12f37a5', pr_url=None, pr_revision=None, pr_num=None)

##### Test on the merged model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

#reload the merged model and its tokenizer from the Hub to check the upload
#NOTE: this rebinds `model` and `tokenizer`, which earlier cells used for the Unsloth training objects
model_id = "tri282/dostoevskyGPT_merged"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

In [None]:
import torch

#wrap the question in the same `prompt` template the model was trained on
#(empty input, empty response); given the bare question, the model carries on
#writing the question instead of answering it
input_text = prompt.format(
    "what does it mean, to be guilty before everyone, for everyone, and for everything", # instruction
    "", # input
    "", # output
)
inputs = tokenizer(input_text, return_tensors="pt")

with torch.no_grad():  # Turn off gradient calculation for inference
    outputs = model.generate(**inputs, max_new_tokens=250)  # Adjust parameters as needed

#decode the single generated sequence without special tokens; the text still starts with the prompt
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


what does it mean, to be guilty before everyone, for everyone, and for everything?  to be the embodiment of sin, the living embodiment of the devil himself?  to be judged not by the law, but by the court of public opinion, by the whispers and accusations of those who claim to be righteous?  this is the plight of rogue, a man condemned before he even opens his mouth.  his crime?  to exist.  to be different.  to challenge the very fabric of their carefully constructed reality.  they call him rogue, this man who dares to question, to dissent, to expose the hypocrisy that festers beneath the surface of their pious facade.  and in their haste to condemn, they become the very monsters they claim to despise.  for what is guilt, if not the weight of judgment, the crushing burden of expectation?  and who is more guilty, the man who dares to transgress, or the society that condemns him for it? 


