In [20]:
import pandas as pd
import re
from datasets import Dataset
from torch import bfloat16
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig




In [21]:
# Load the raw training CSV into a DataFrame.
# NOTE(review): the path is relative to the kernel's working directory, so the
# file has to be present there before this cell runs; otherwise pandas raises
# FileNotFoundError.
data = pd.read_csv("Copy of train.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Copy of train.csv'

In [4]:
data.shape

(1492, 1268)

In [5]:
# Inspect the first few rows and columns
data.head(2)

Unnamed: 0,text,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 1258,Unnamed: 1259,Unnamed: 1260,Unnamed: 1261,Unnamed: 1262,Unnamed: 1263,Unnamed: 1264,Unnamed: 1265,Unnamed: 1266,Unnamed: 1267
0,"Delroy Garrett, Jr. grew up to become a track ...",,,,,,,,,,...,,,,,,,,,,
1,He was one of the many prisoners of Indian Hil...,,,,,,,,,,...,,,,,,,,,,


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1492 entries, 0 to 1491
Columns: 1268 entries, text to Unnamed: 1267
dtypes: float64(140), object(1128)
memory usage: 14.4+ MB


In [7]:
# Display column names to identify unnamed columns
print(data.columns)

Index(['text', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       ...
       'Unnamed: 1258', 'Unnamed: 1259', 'Unnamed: 1260', 'Unnamed: 1261',
       'Unnamed: 1262', 'Unnamed: 1263', 'Unnamed: 1264', 'Unnamed: 1265',
       'Unnamed: 1266', 'Unnamed: 1267'],
      dtype='object', length=1268)


In [8]:
data.isnull().sum()

Unnamed: 0,0
text,111
Unnamed: 1,1492
Unnamed: 2,1492
Unnamed: 3,1492
Unnamed: 4,1492
...,...
Unnamed: 1263,1491
Unnamed: 1264,1491
Unnamed: 1265,1491
Unnamed: 1266,1491


In [9]:
# Keep only the columns whose header does not start with "Unnamed";
# those columns are almost entirely null in this file.
data = data[data.columns[~data.columns.str.contains('^Unnamed')]]

# Preview the first rows of the trimmed frame
data.head()

Unnamed: 0,text
0,"Delroy Garrett, Jr. grew up to become a track ..."
1,He was one of the many prisoners of Indian Hil...
2,"Richard ""Rick"" Jones was orphaned at a young ..."
3,Aa is one of the more passive members of the P...
4,Aaron Cash is the head of security at Arkham A...


In [10]:
data.isnull().sum()

Unnamed: 0,0
text,111


In [11]:
# Discard rows containing missing values, then renumber the index from 0
data = data.dropna().reset_index(drop=True)

In [12]:
data

Unnamed: 0,text
0,"Delroy Garrett, Jr. grew up to become a track ..."
1,He was one of the many prisoners of Indian Hil...
2,"Richard ""Rick"" Jones was orphaned at a young ..."
3,Aa is one of the more passive members of the P...
4,Aaron Cash is the head of security at Arkham A...
...,...
1376,Zero was created by the late Dr. Albert Wily ...
1377,which was absolutely loyal to Dr. Weil's orde...
1378,"Hunter Zolomon is better known as Zoom, a spee..."
1379,Hunter Zolomon had a troubled relationship wi...


In [13]:
data['text'].duplicated().sum()

12

In [14]:
data[data['text'].duplicated()]

Unnamed: 0,text
129,After witnessing the brutal murder of his pare...
487,Much of Madison's past is unknown. He and his ...
794,Childhood Trauma Robert Bruce Banner was the s...
815,History unknown.
830,Melinda May was born to William and to the int...
950,Percy was born to Sally Jackson and Poseidon. ...
967,id='history'
1045,but Tim was successfully lifted out with some...
1183,History unknown.
1273,id='history'


In [15]:
# Discard repeated rows (first occurrence is kept), then renumber the index from 0
data = data.drop_duplicates().reset_index(drop=True)

In [16]:
data

Unnamed: 0,text
0,"Delroy Garrett, Jr. grew up to become a track ..."
1,He was one of the many prisoners of Indian Hil...
2,"Richard ""Rick"" Jones was orphaned at a young ..."
3,Aa is one of the more passive members of the P...
4,Aaron Cash is the head of security at Arkham A...
...,...
1364,Zero was created by the late Dr. Albert Wily ...
1365,which was absolutely loyal to Dr. Weil's orde...
1366,"Hunter Zolomon is better known as Zoom, a spee..."
1367,Hunter Zolomon had a troubled relationship wi...


In [17]:
data.isnull().sum()

Unnamed: 0,0
text,0


In [18]:
data.duplicated().sum()

0

In [19]:
# Function to clean text
def clean_text(text):
    """Normalise a raw text string before tokenisation.

    Steps, in order: lowercase, strip HTML tags, strip URLs, drop every
    character that is not an ASCII letter, digit or whitespace, and collapse
    runs of whitespace into single spaces.

    Args:
        text: The raw string. Non-string values (for example ``None`` or a
            float NaN left in a pandas column) are treated as empty text.

    Returns:
        The cleaned, lowercased string; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing values arrive as None / float NaN, on which .lower() would raise.
        return ''
    text = text.lower()  # Convert to lowercase
    # Tags are removed before URLs: the URL pattern runs up to the next
    # whitespace, so applied first it would swallow the closing markup and the
    # anchor text of e.g. <a href="http://x.y">label</a>.
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    return text

In [20]:
# Run clean_text over every entry of the 'text' column
data_cleaned = data.loc[:, 'text'].apply(clean_text)

In [21]:
# Wrap the cleaned values in a DataFrame, the input type Dataset.from_pandas is given next
data_cleaned = pd.DataFrame(data_cleaned)

In [22]:
# Build a Hugging Face Dataset from the cleaned pandas DataFrame
hf_dataset = Dataset.from_pandas(data_cleaned)

# Print the dataset summary (feature names and number of rows)
print(hf_dataset)

Dataset({
    features: ['text'],
    num_rows: 1369
})


In [23]:
# Quantisation settings: load the LLM's weights in 4 bits to cut GPU memory use
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,  # apply a second quantisation after the first
    bnb_4bit_quant_type='nf4',  # normalized float 4
    load_in_4bit=True,  # turn 4-bit quantisation on
)

In [24]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [25]:
# Load the base causal LM named by `model_name`, quantised according to
# `bnb_config`; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
# Switch off the config's use_cache flag for the training run.
model.config.use_cache = False
# NOTE(review): pretraining_tp is presumably carried over from a Llama
# fine-tuning recipe; confirm that this model's config actually reads it.
model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [26]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm

In [27]:
# Load the tokenizer that belongs to `model_name`.
# trust_remote_code is deliberately not enabled: this checkpoint's tokenizer
# loads with the stock classes (other cells in this notebook load it without
# the flag), and the flag would allow repository-supplied code to run.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad on the right so real tokens stay at the start of each sequence.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [28]:
# Tokenize the dataset
def tokenize_function(examples, max_length=32):
    """Tokenize a batch of examples to a fixed sequence length.

    Relies on the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            its ``'text'`` entry is handed to the tokenizer.
        max_length: Length every sequence is truncated or padded to. Defaults
            to 32, kept short because fine-tuning was taking hours.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [29]:
# Run tokenize_function over the whole dataset; batched=True hands it
# batches of rows instead of one row per call.
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1369 [00:00<?, ? examples/s]

In [30]:
# LoRA adapter settings for parameter-efficient fine-tuning
from peft import TaskType
peft_args = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # the plain string "CAUSAL_LM" is an alternative spelling
    target_modules=["q_proj", "v_proj"],  # modules that receive adapters
    r=64,  # LoRA rank
    lora_alpha=16,  # LoRA alpha (scaling)
    lora_dropout=0.1,  # dropout used by the adapter layers
    bias="none",  # bias setting
)

In [31]:
# Training hyper-parameters, bundled in one SFTConfig that is handed to SFTTrainer
sft_config = SFTConfig(
    # --- output, checkpoints and logging ---
    output_dir="./results",
    report_to="tensorboard",
    logging_steps=25,
    save_steps=25,
    # --- run length and batching ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # --- optimiser and schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    # NOTE(review): confirm warmup_ratio has any effect with a plain "constant" schedule
    warmup_ratio=0.03,
    # --- mixed precision (both switched off) ---
    fp16=False,
    bf16=False,
    # --- SFT-specific ---
    dataset_text_field="text",  # column that holds the training text
    max_seq_length=32,  # kept short; the original note mentions 512 as the alternative
)

In [33]:
# Assemble the supervised fine-tuning trainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,  # hyper-parameters come from the SFTConfig object
    peft_config=peft_args,  # LoRA settings
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    packing=False,
)

In [34]:
# Run the fine-tuning loop; the returned TrainOutput (steps, loss, runtime
# metrics) is shown as the cell result.
trainer.train()

Step,Training Loss
25,3.8962


TrainOutput(global_step=43, training_loss=3.7499080924100654, metrics={'train_runtime': 518.1939, 'train_samples_per_second': 2.642, 'train_steps_per_second': 0.083, 'total_flos': 1876188803629056.0, 'train_loss': 3.7499080924100654, 'epoch': 1.0})

In [35]:
# Write the trained model and the tokenizer files into the same directory so
# that both can be reloaded from "./zephyr_finetuned" afterwards.
trainer.model.save_pretrained("./zephyr_finetuned")
trainer.tokenizer.save_pretrained("./zephyr_finetuned")

('./zephyr_finetuned/tokenizer_config.json',
 './zephyr_finetuned/special_tokens_map.json',
 './zephyr_finetuned/tokenizer.model',
 './zephyr_finetuned/added_tokens.json',
 './zephyr_finetuned/tokenizer.json')

In [36]:
from transformers import GenerationConfig

In [37]:
# Reload the model from the directory written after training, reusing the
# same quantisation config (bnb_config) and automatic device placement.
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./zephyr_finetuned", quantization_config=bnb_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [38]:
# Reload the tokenizer from the directory the fine-tuned artefacts were saved to.
# trust_remote_code is not enabled: the directory holds plain tokenizer files
# written by save_pretrained, so no repository code needs to be executed.
new_tokenizer = AutoTokenizer.from_pretrained(
    "./zephyr_finetuned",
    clean_up_tokenization_spaces=True,
)
# Pad on the right, matching the tokenizer used for training.
new_tokenizer.padding_side = "right"

In [40]:
# Decoding settings used at inference time
generation_config = GenerationConfig(
    do_sample=True,          # sample instead of decoding deterministically
    temperature=0.7,         # controls randomness
    top_k=50,                # restrict sampling to the k most likely next tokens
    top_p=0.85,              # nucleus sampling threshold
    num_beams=5,             # beam count; more beams cost more computation
    repetition_penalty=1.2,  # discourage repeated tokens
    max_new_tokens=150,      # upper bound on the number of generated tokens
    pad_token_id=tokenizer.eos_token_id,  # use the EOS token for padding
)

In [41]:
# Input prompt
input_text = "In a distant future, humanity discovered time travel."

In [42]:
# Tokenize the prompt and move the tensors to the device the model reports,
# rather than assuming a device literally named "cuda" is where it was placed.
inputs = tokenizer(input_text, return_tensors="pt").to(fine_tuned_model.device)

In [43]:
# Generate a continuation of the tokenized prompt using the decoding
# settings held in generation_config.
outputs = fine_tuned_model.generate(**inputs, generation_config=generation_config)

In [44]:
# Turn the first returned sequence back into text, dropping special tokens,
# and print it. The printed text begins with the prompt itself.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In a distant future, humanity discovered time travel. They used it to go back in time to prevent the extinction of the dinosaurs. This led to the creation of a new timeline where the dinosaurs still roam the earth.

In this new timeline, a group of scientists and explorers set out on a mission to find the lost city of Atlantis. Along the way, they encountered many dangers, including prehistoric beasts and rival explorers.

As they got closer to their goal, they discovered that Atlantis was not just a lost city, but a powerful force that could change the course of history. They had to decide whether to use its power for good or let it fall into the wrong hands.




In [14]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,  # Make sure this is included
    TrainingArguments,
)

In [15]:
from torch import bfloat16

In [16]:
import accelerate

In [17]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

# 8-bit quantisation settings (efficient memory usage)
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load model and tokenizer.
# The 8-bit switch is given only through quantization_config: passing
# load_in_8bit=True as a separate keyword alongside quantization_config is
# rejected by from_pretrained.
# device_map="auto" needs the `accelerate` package to be installed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Ensure the tokenizer pads sequences correctly
tokenizer.pad_token = tokenizer.eos_token


ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`

In [22]:
# Load tokenizer and model straight from the Hub with library defaults.
# NOTE(review): unlike the other loading cells in this notebook, no
# quantization_config, device_map or torch_dtype is passed here, so the
# checkpoint is loaded without quantisation; the recorded run of this cell
# was interrupted during the shard download.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

KeyboardInterrupt: 