In [1]:
import torch
from transformers import GPT2Config, GPT2LMHeadModel,AutoModelForCausalLM,BitsAndBytesConfig,TrainingArguments,AutoTokenizer
from peft import LoraConfig,get_peft_model
# tokenizer=AutoTokenizer.from_pretrained('tirthadagr8/custom-mbart-large-50')


In [2]:
tokenizer=AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-1B')
tokenizer.pad_token = tokenizer.eos_token

In [3]:
import os

os.environ['NEPTUNE_API_TOKEN'] = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI2ZGUwMDYyOC04NmE0LTQyM2UtOTVjNi0wZjQ3ZGU2ZjM4M2IifQ=="
os.environ['NEPTUNE_PROJECT'] = 'tirthadagr8/model-feed'
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [4]:
# config=GPT2Config(vocab_size=len(tokenizer),bos_token_id=tokenizer.bos_token_id,eos_token_id=tokenizer.eos_token_id,n_embd=384,n_layer=12,n_head=8)
# no_of_parameters=config.vocab_size*config.n_embd+config.n_layer*config.n_embd+config.n_layer*(4*config.n_embd*config.n_embd+4*config.n_embd+2*config.n_embd*4*config.n_embd+9*config.n_embd)+2*config.n_embd
# size_of_model=no_of_parameters/(1.6*100000000)
# print(f'Number of parameters would be:{no_of_parameters} the size would be:{size_of_model}')

In [5]:
torch_dtype=torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)


In [6]:
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-3.2-1B',quantization_config=bnb_config,device_map="auto",)

In [7]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [8]:
with open('C:/Users/tirth/Desktop/english_corpus.txt','r') as f:
    text=f.readline()
chunk_size = 128  # You can adjust this value
array_of_strings = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

In [7]:
# for c in sorted(list(set(text))):
#     if c not in tokenizer.vocab:
#         print(c)

In [12]:
from datasets import load_dataset,Dataset
from transformers import LlamaTokenizer

# Load text data
dataset = Dataset.from_dict({'text':array_of_strings})
split_dataset = dataset.train_test_split(test_size=0.1)  # 80% train, 20% test
# Initialize the tokenizer
# tokenizer = LlamaTokenizer.from_pretrained('huggingface/llama-tokenizer')  # or use a compatible LLaMA tokenizer

# Tokenize the dataset
def tokenize_function(examples):
    '''
    *** this will be used in future to train it for conversation format training
    messages = [{"role": "user", "content": "What is the capital of France."}]
    input_text=tokenizer.apply_chat_template(messages, tokenize=False)
    '''
    # return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)
    texts=[]
    for text in examples['text']:
        texts.append(text+tokenizer.eos_token)
    return {'text':texts}

tokenized_dataset = split_dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

Map:   0%|          | 0/66779 [00:00<?, ? examples/s]

Map:   0%|          | 0/7420 [00:00<?, ? examples/s]

In [13]:
train_dataset['text'][0]

't to him in person. She trusted her friend, but so much could happen. She waited impatiently for word.Hopes and dreams were dash<|end_of_text|>'

In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir = "outputs",
    overwrite_output_dir=True,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2,
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 60,
    learning_rate = 2e-4,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    report_to = "none", # Use this for WandB etc
)

In [15]:
from trl import SFTTrainer
from transformers import TrainingArguments
# from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 128,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map (num_proc=2):   0%|          | 0/74199 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [16]:
trainer.train()

  0%|          | 0/60 [00:00<?, ?it/s]

{'loss': 3.7574, 'grad_norm': 3.1946589946746826, 'learning_rate': 4e-05, 'epoch': 0.0}
{'loss': 4.2136, 'grad_norm': 8.23609447479248, 'learning_rate': 8e-05, 'epoch': 0.0}
{'loss': 4.4372, 'grad_norm': 3.507418155670166, 'learning_rate': 0.00012, 'epoch': 0.0}
{'loss': 3.7351, 'grad_norm': 3.453597068786621, 'learning_rate': 0.00016, 'epoch': 0.0}
{'loss': 4.1614, 'grad_norm': 3.5995664596557617, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 3.5713, 'grad_norm': 3.7305750846862793, 'learning_rate': 0.00019636363636363636, 'epoch': 0.0}
{'loss': 4.2325, 'grad_norm': 3.490417957305908, 'learning_rate': 0.00019272727272727274, 'epoch': 0.0}
{'loss': 3.9536, 'grad_norm': 4.045252323150635, 'learning_rate': 0.0001890909090909091, 'epoch': 0.0}
{'loss': 4.4251, 'grad_norm': 5.286133766174316, 'learning_rate': 0.00018545454545454545, 'epoch': 0.0}
{'loss': 3.5718, 'grad_norm': 4.190104007720947, 'learning_rate': 0.00018181818181818183, 'epoch': 0.0}
{'loss': 3.7556, 'grad_norm': 4.3943581

TrainOutput(global_step=60, training_loss=3.6743915915489196, metrics={'train_runtime': 50.1597, 'train_samples_per_second': 2.392, 'train_steps_per_second': 1.196, 'total_flos': 21050801012736.0, 'train_loss': 3.6743915915489196, 'epoch': 0.0016172724699793797})

In [25]:
msg='hi! how are you?'
inputs = tokenizer(msg, return_tensors='pt', padding=True, 
                   truncation=True, max_length=128).to("cuda")
outputs = model.generate(**inputs, max_length=100, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [26]:
tokenizer.batch_decode(outputs)

["<|begin_of_text|>hi! how are you? it's been a long day and I'm about to fall asleep. I know I should be working, but I'm tired and want to go to bed. I know that I should be doing something productive, but I'm just too tired to even think about it. I'm going to bed and I'm going to sleep and I'm going to wake up tomorrow.<|end_of_text|>"]

In [27]:
tokenizer.batch_decode(inputs['input_ids'])

['<|begin_of_text|>hi! how are you?']