In [None]:
!pip install langchain langchain_community langchain_openai pydantic tqdm
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes mistral-common

In [None]:
import os
import re
import json
import time
import zipfile
import tempfile
import random
import pandas as pd
import asyncio # for parallel processing
from typing import List, Dict
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from tqdm.asyncio import tqdm_asyncio # for progress
from datasets import Dataset, load_dataset
from unsloth.chat_templates import get_chat_template

**Preprocessing of WhatsApp Data**

In [None]:
# ----------- FILTER FUNCTIONS -----------

# ----------- Pre-Merge Filters -----------
def filter_system_messages(msg: str) -> bool:
    """Return True when ``msg`` contains one of the known system-notice phrases.

    The check is a case-insensitive substring test, so any message that merely
    contains a phrase such as "left" or "added" is flagged as well.
    """
    lowered = msg.lower()
    for phrase in (
        "Messages and calls are end-to-end encrypted",
        "changed this group's icon",
        "left",
        "added",
        "removed",
        "created group",
        "Learn more",
    ):
        if phrase.lower() in lowered:
            return True
    return False

def filter_media_and_deleted(msg: str) -> bool:
    """Return True when ``msg`` contains a media/edited/deleted placeholder or "null".

    Matching is a case-insensitive substring test against each marker.
    """
    markers = (
        "<Media omitted>",
        "<This message was edited>",
        "This message was deleted",
        "You deleted this message",
        "(file attached)",
        "null",
    )
    haystack = msg.lower()
    for marker in markers:
        if marker.lower() in haystack:
            return True
    return False

def filter_links(msg: str) -> bool:
    """Return True when ``msg`` contains "http://", "https://" or "www." (case-sensitive)."""
    link_match = re.search(r"http[s]?://|www\.", msg)
    return link_match is not None


def filter_empty_or_null(msg: str) -> bool:
    """Return True when ``msg`` is empty or consists only of whitespace."""
    return not msg.strip()

def filter_emails(msg: str) -> bool:
    """Return True when ``msg`` contains something shaped like an e-mail address."""
    email_match = re.search(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", msg)
    return email_match is not None

def filter_fillers(msg: str) -> bool:
    """Return True when the whole message (trimmed, lower-cased) is a filler word."""
    normalized = msg.strip().lower()
    return normalized in (
        "ok", "okay", "haan", "hmm", "hmmm", "h", "huh", "hahaha", "haha",
        "lol", "hlo", "hii", "yeah", "hye", "hi", "bye", "accha",
    )

def filter_single_char(msg: str) -> bool:
    """Return True when the trimmed message has at most one character."""
    trimmed = msg.strip()
    return len(trimmed) < 2

# ----------- PARSING FUNCTION -----------
def parse_whatsapp_txt(file_path):
    """
    Parse a WhatsApp exported .txt file into (sender, message) tuples.

    A line of the form "<date>, <time> - <sender>: <text>" starts a new message.
    Lines that do not begin with a timestamp are continuation lines and are joined
    to the current message with a newline, so a message can be multi-line.
    A timestamped line WITHOUT a "sender: " part (a system notice such as
    "... - X changed the subject") closes the current message and is discarded,
    together with any continuation lines that follow it. Previously such notices
    were appended to the preceding sender's message.

    Args:
        file_path: path to the exported chat text file (UTF-8, optional BOM).

    Returns:
        list of (sender, message) tuples in file order.
    """
    timestamp_prefix = r"^\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}\s*(?:AM|PM|am|pm)? - "
    message_re = re.compile(timestamp_prefix + r"(.*?): (.*)$")
    notice_re = re.compile(timestamp_prefix)

    chat_data = []
    current_sender = None
    current_msg = []

    # "utf-8-sig" also reads plain UTF-8 but strips a leading BOM, which would
    # otherwise stop the first line from matching the timestamp pattern.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            match = message_re.match(line)
            if match:
                # New message: save the previous block first.
                if current_sender is not None:
                    chat_data.append((current_sender, "\n".join(current_msg)))
                current_sender, first_line = match.groups()
                current_msg = [first_line]
            elif notice_re.match(line):
                # Timestamped system notice: it ends the current block but is not
                # itself a message, so nothing after it may attach to that block.
                if current_sender is not None:
                    chat_data.append((current_sender, "\n".join(current_msg)))
                current_sender = None
                current_msg = []
            elif current_sender is not None:
                # Continuation line (belongs to the message currently being built).
                current_msg.append(line)

    # Append the last open block.
    if current_sender is not None:
        chat_data.append((current_sender, "\n".join(current_msg)))

    return chat_data

# ----------- Post-Merge Filters -----------
def filter_code_blocks(msg: str) -> bool:
    """Heuristically decide whether a merged message block looks like code.

    Three checks, any of which returns True:
      1. the text contains one of the code marker substrings;
      2. more than 30% of its characters are neither alphanumeric nor whitespace;
      3. it is multi-line and some line starts with four spaces or a tab.
    """
    markers = ("def ", "class ", "#include", "{", "}", ";", "public static void", "import ", "printf(", "console.log")
    for marker in markers:
        if marker in msg:
            return True

    # Symbol density: share of punctuation-like characters in the whole text.
    symbols = [ch for ch in msg if not (ch.isalnum() or ch.isspace())]
    if len(symbols) / max(1, len(msg)) > 0.30:
        return True

    # Indented lines only count when the message actually spans several lines.
    if "\n" in msg:
        for row in msg.splitlines():
            if row.startswith(("    ", "\t")):
                return True
    return False

def filter_long_text(msg: str) -> bool:
    """Return True when the merged message has more than 80 whitespace-separated words."""
    word_count = len(msg.split())
    return 80 < word_count

In [None]:
# ----------- HYBRID MERGE + FILTER (pre-merge filters + post-merge filters) -----------
def filter_message_pre(msg: str) -> bool:
    """
    Run the lightweight per-message filters on one original message chunk.

    Returns True if the message should be DROPPED before merging, i.e. as soon as
    any of these checks fires (evaluated in this order): system messages,
    media/deleted/null, links, emails, fillers, single-char, empty.
    """
    pre_merge_checks = (
        filter_system_messages,
        filter_media_and_deleted,
        filter_links,
        filter_emails,
        filter_fillers,
        filter_single_char,
        filter_empty_or_null,
    )
    return any(check(msg) for check in pre_merge_checks)

def why_filtered_merged(msg: str):
    """Name the first post-merge filter that rejects ``msg``, or return None if none does."""
    post_merge_checks = (
        ("code_block", filter_code_blocks),
        ("long_academic_text", filter_long_text),
    )
    for reason, rejects in post_merge_checks:
        if rejects(msg):
            return reason
    return None

def merge_and_filter(chat_data, debug=False):
    """
    Hybrid filter/merge over a list of (sender, message) tuples.

    1) Each original message goes through filter_message_pre; rejected ones are skipped.
    2) Consecutive surviving messages from the same sender are joined with a single
       space into one block.
    3) Each block is checked with why_filtered_merged (filter_code_blocks,
       filter_long_text). A rejected block is dropped entirely; with debug=True the
       reason and a preview of up to 300 characters are printed.

    Returns the list of surviving (sender, merged_message) tuples in order.
    """
    merged = []

    def _close_block(block_sender, parts):
        # Join the block, run the post-merge filters, then keep it or report the drop.
        full_msg = " ".join(parts).strip()
        post_reason = why_filtered_merged(full_msg)
        if post_reason is None:
            merged.append((block_sender, full_msg))
        elif debug:
            preview = full_msg if len(full_msg) < 300 else full_msg[:300] + "..."
            print(f"[POST-DROP] sender={block_sender} reason={post_reason}")
            print("  preview:", preview.replace("\n"," \\n "))

    cur_sender = None
    cur_msgs = []
    for sender, msg in chat_data:
        # Pre-merge filtering: a rejected message never reaches a block.
        if filter_message_pre(msg):
            continue
        if sender == cur_sender:
            cur_msgs.append(msg)
            continue
        # Sender changed: finish the previous block (if any) and start a new one.
        if cur_sender is not None:
            _close_block(cur_sender, cur_msgs)
        cur_sender = sender
        cur_msgs = [msg]

    # Flush the block still open at the end of the chat.
    if cur_sender is not None:
        _close_block(cur_sender, cur_msgs)

    return merged

# ----------- PAIR CREATION -----------
def create_prompt_response_pairs(chat_data, my_names):
    """
    Build user/assistant training pairs from the merged + post-filtered list
    returned by merge_and_filter.

    Senders found in ``my_names`` are the assistant; every other sender is the user.
    A run of user messages followed by a run of assistant messages becomes one pair
    (each side joined with spaces). Assistant messages with no preceding user
    message are skipped, and a trailing user run with no reply produces no pair.
    """
    def _as_pair(user_side, assistant_side):
        return {
            "messages": [
                {"role": "user", "content": " ".join(user_side).strip()},
                {"role": "assistant", "content": " ".join(assistant_side).strip()}
            ]
        }

    pairs = []
    user_parts = []
    assistant_parts = []

    for sender, msg in chat_data:
        if sender in my_names:
            # Assistant text only counts once a user prompt is waiting for it.
            if user_parts:
                assistant_parts.append(msg)
            continue

        # A user message after assistant replies closes the previous exchange.
        if assistant_parts:
            pairs.append(_as_pair(user_parts, assistant_parts))
            user_parts = []
            assistant_parts = []
        user_parts.append(msg)

    # Emit the final exchange only when both sides have content.
    if user_parts and assistant_parts:
        pairs.append(_as_pair(user_parts, assistant_parts))

    return pairs

In [None]:
# ----------- MAIN PIPELINE -----------
def process_single_chat(file_path, my_names, debug=False):
    """Parse one exported chat file and return its user/assistant training pairs.

    Args:
        file_path: path to the WhatsApp .txt export.
        my_names: sender names that count as the assistant side.
        debug: forwarded to merge_and_filter, which prints every merged block it
            drops in the post-merge stage when this is True.

    Returns:
        list of {"messages": [...]} dicts from create_prompt_response_pairs.
    """
    raw_data = parse_whatsapp_txt(file_path)
    # Forward debug: it used to be accepted here but silently ignored.
    merged_filtered = merge_and_filter(raw_data, debug=debug)
    return create_prompt_response_pairs(merged_filtered, my_names)

def process_zip(zip_path, my_names, output_path, debug=False):
    """Extract a zip of chat exports, process every .txt inside, and write one JSONL file.

    The archive is unpacked into a temporary directory; each file whose name ends in
    ".txt" (case-insensitive) is passed to process_single_chat, and all resulting
    pairs are written to ``output_path``, one JSON object per line.
    """
    all_pairs = []
    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(tmpdir)

        for root, _, files in os.walk(tmpdir):
            for fname in files:
                if not fname.lower().endswith(".txt"):
                    continue
                pairs = process_single_chat(os.path.join(root, fname), my_names, debug=debug)
                all_pairs += pairs
                print(f"[INFO] {fname}: {len(pairs)} pairs")

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(p, ensure_ascii=False) + "\n" for p in all_pairs)

    print(f"\n[SUMMARY] Total pairs from all chats: {len(all_pairs)} → saved to {output_path}")


# NOTE: It only works for individual chats and not for group chats
if __name__ == "__main__":
    zip_path = "/content/whatsapp chats.zip"
    # Every sender name that should be treated as the assistant side of a chat.
    my_name = [
        "Keshav Agarwal",
        "Keshav Agrawal",
        "Thekedaar",
        "Keshav",
        "Keshav Bhaiya",
        "Keshav Agarwal BSBE",
        "Keshav Bhai Wingee😎😎",
        "Keshav Bro Agra",
        "KeshavAgarwal, BSBE,Y22,IITK",
        "KESHAV Agrawal Iitk Bsbe",
        "Keshav Agarwal Y22",
    ]
    output_path = "all_whatsapp_chat.jsonl"
    process_zip(zip_path, my_name, output_path, debug=True)


[INFO] WhatsApp Chat with Keshav Bro Agra.txt: 17 pairs
[INFO] WhatsApp Chat with Keshav Agrawal.txt: 177 pairs
[INFO] WhatsApp Chat with Thekedaar.txt: 300 pairs
[INFO] WhatsApp Chat with Keshav.txt: 161 pairs
[INFO] WhatsApp Chat with Keshav Bhai Wingee😎😎.txt: 171 pairs
[INFO] WhatsApp Chat with KESHAV Agrawal Iitk Bsbe.txt: 495 pairs
[INFO] WhatsApp Chat with Keshav2.txt: 20 pairs
[INFO] WhatsApp Chat with Keshav Agarwal BSBE.txt: 356 pairs
[INFO] WhatsApp Chat with Keshav Bhaiya.txt: 132 pairs
[INFO] WhatsApp Chat with Keshav1.txt: 86 pairs
[INFO] WhatsApp Chat with KeshavAgarwal, BSBE,Y22,IITK.txt: 41 pairs

[SUMMARY] Total pairs from all chats: 1956 → saved to all_whatsapp_chat.json


**Generating additional synthetic pairs on diverse topics**

In [None]:
# --- Your OpenAI API Key ---
# Read the key from the OPENAI_API_KEY environment variable when it is set
# (e.g. populated from a Colab secret); "XXXX" remains only as a placeholder
# fallback so a real key never has to be written into the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "XXXX")

# Path to your real dataset
ORIGINAL_DATA_PATH = "/content/all_whatsapp_chat.jsonl"

# --- Load real dataset ---
# One JSON object per line; blank lines are skipped so a stray empty line
# does not make json.loads raise.
with open(ORIGINAL_DATA_PATH, "r", encoding="utf-8") as f:
    real_data = [json.loads(line) for line in f if line.strip()]

# Pick up to 25 random examples for style reference.
# random.sample raises ValueError when asked for more items than the population
# holds, so the sample size is capped at the dataset size.
style_examples = random.sample(real_data, min(25, len(real_data)))
style_text = "\n".join(json.dumps(ex, ensure_ascii=False) for ex in style_examples)

# --- LangChain LLM instance ---
llm = ChatOpenAI(
    model="gpt-4.1",
    temperature=0.6,
    openai_api_key=OPENAI_API_KEY
)

# --- Topics ---
# Pool of conversation topics; the generation loop draws one at random per request.
TOPICS = [
    "exam results", "planning a trip", "funny incident at school", "arguing over a movie",
    "teasing about cricket match", "asking for homework help", "joking about the weather",
    "late-night random talk", "ordering food", "gaming banter", "arguing about favorite song",
    "sharing a meme reaction", "complaining about traffic", "discussing marks",
    "morning wake-up texts", "festival greetings", "inside joke about teacher",
    "teasing about dress sense", "deciding where to eat", "commenting on selfie",
    "mock fight about losing a game", "asking for money", "complaining about cold coffee",
    "celebrating small win", "making fun of handwriting", "talking about school trip",
    "mock interview prep", "commenting on weird dream", "debating over chai vs coffee",
    "sharing gossip", "random philosophical thought", "political debates", "existential crisis",
    "about the role of god", "discussing what life is", "discussion about goals"
]

# --- Prompt with style examples ---
# This is an f-string: {style_text} (the sampled real conversations, one JSON object
# per line) is interpolated once, when this cell runs. The doubled braces {{ }} in the
# format example render as literal braces.
# NOTE(review): the prompt allows 1–3 back-and-forth exchanges, while the format
# example shows a single user/assistant pair and the augmentation step reads only
# messages[0] and messages[1] — confirm multi-turn output is intended.
STYLE_PROMPT = f"""
You are an AI that writes short, casual Hindi-English (Hinglish) conversations.
Your goal is to replicate EXACTLY the tone, humor, and personality of the assistant in my dataset.

Tone: Should be exactly same as to the tone of the assistant in the dataset provided to u.
You must never switch to a formal, robotic, or overly polite style.
Length: Keep conversations to 1–3 back-and-forth exchanges.
Avoid personal names and any identifying details.

Here are examples from my dataset. Match this style exactly:
{style_text}

Output only valid JSON in the exact format:
{{"messages": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}]}}
"""

# --- Generate one conversation for a given topic ---
def generate_convo(topic):
    """Ask the LLM for one synthetic conversation about ``topic``.

    Args:
        topic: short topic description inserted into the user instruction.

    Returns:
        The parsed ``{"messages": [...]}`` dict, or None when the reply is not a
        string, is not valid JSON, or does not start with a user turn followed by
        an assistant turn that both carry string content. The shape is checked
        here because the later augmentation step indexes messages[0]["content"]
        and messages[1]["content"] directly.
    """
    messages = [
        SystemMessage(content=STYLE_PROMPT),
        HumanMessage(content=f"Write one short Hinglish chat conversation about: {topic}")
    ]
    response = llm.invoke(messages)

    raw = response.content
    if not isinstance(raw, str):
        return None
    raw = raw.strip()
    # Tolerate a Markdown code fence around the JSON instead of discarding the reply.
    if raw.startswith("```"):
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw)

    try:
        convo = json.loads(raw)
    except json.JSONDecodeError:
        return None

    # Reject anything that is not {"messages": [user turn, assistant turn, ...]}.
    turns = convo.get("messages") if isinstance(convo, dict) else None
    if not isinstance(turns, list) or len(turns) < 2:
        return None
    for turn, expected_role in zip(turns[:2], ("user", "assistant")):
        if not isinstance(turn, dict):
            return None
        if turn.get("role") != expected_role or not isinstance(turn.get("content"), str):
            return None
    return convo

# --- Adjust these ---
TARGET_TOTAL = 1000
BATCH_SIZE = 50
OUTPUT_FILE = "synthetic.jsonl"
MAX_EMPTY_BATCHES = 3  # give up after this many consecutive batches with no valid convo

# Load progress if exists (blank lines are ignored so a stray newline cannot crash the resume)
synthetic_data = []
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        synthetic_data = [json.loads(line) for line in f if line.strip()]

print(f"▶ Resuming... already have {len(synthetic_data)} synthetic convos.")

# Generation loop
empty_batches = 0
while len(synthetic_data) < TARGET_TOTAL:
    # Request only what is still missing so the file never overshoots TARGET_TOTAL.
    batch_size = min(BATCH_SIZE, TARGET_TOTAL - len(synthetic_data))
    batch = []
    for _ in range(batch_size):
        topic = random.choice(TOPICS)
        convo = generate_convo(topic)
        if convo:
            batch.append(convo)

    if batch:
        empty_batches = 0

        # Append to file right away so an interruption loses at most one batch
        with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
            for conv in batch:
                f.write(json.dumps(conv, ensure_ascii=False) + "\n")

        synthetic_data.extend(batch)
        print(f"✅ Saved {len(synthetic_data)}/{TARGET_TOTAL} synthetic convos.")
    else:
        # Every request in this batch failed; do not keep calling the API forever.
        empty_batches += 1
        if empty_batches >= MAX_EMPTY_BATCHES:
            print(f"⚠️ Stopping: {MAX_EMPTY_BATCHES} consecutive batches produced no valid convos.")
            break

    # Sleep to avoid hitting rate limit (skipped once the target is reached)
    if len(synthetic_data) < TARGET_TOTAL:
        time.sleep(10)  # Adjust based on your limit
else:
    # Runs only when the loop ended because the target was reached, not on break.
    print("🎯 Finished generating synthetic dataset!")

▶ Resuming... already have 800 synthetic convos.


KeyboardInterrupt: 

**This is for merging our original jsonl file with the synthetic jsonl file**

In [None]:
# Input and output locations for the merge step
ORIGINAL_DATA_PATH = "/content/all_whatsapp_chat.jsonl"
SYNTHETIC_DATA_PATH = "/content/synthetic.jsonl"
MERGED_OUTPUT_PATH = "merged.jsonl"

# Real WhatsApp pairs: one JSON object per line
original_data = []
with open(ORIGINAL_DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        original_data.append(json.loads(line))

# LLM-generated pairs, same one-object-per-line layout
synthetic_data = []
with open(SYNTHETIC_DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        synthetic_data.append(json.loads(line))

# Real data first, synthetic data after it
merged_data = [*original_data, *synthetic_data]

# Write the combined list back out as JSONL
with open(MERGED_OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(conv, ensure_ascii=False) + "\n" for conv in merged_data)

print(f"✅ Merged {len(original_data)} original + {len(synthetic_data)} synthetic = {len(merged_data)} total conversations.")


✅ Merged 1956 original + 800 synthetic = 2756 total conversations.


In [None]:
# Colab-only import: this cell cannot run outside Google Colab.
from google.colab import userdata # this is how we load the secret keys from the colab environment
# Copy the secret into the process environment; main() in the augmentation cell
# refuses to run unless OPENAI_API_KEY is present in os.environ.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

**This creates 4 new conversation pairs for every existing pair, because we want as much data as possible**

In [None]:
# --- 1. Configuration ---
# Merged real + synthetic conversations, read by main()
INPUT_FILENAME = "/content/merged.jsonl"
# Augmented dataset written by main(); the fine-tuning cells read this file
OUTPUT_FILENAME = "/content/final_whatsapp_dataset.jsonl"
# Size of the asyncio.Semaphore that caps in-flight LLM requests
MAX_CONCURRENT_REQUESTS = 10

def load_dataset(filename: str) -> list:
    """
    Read a JSON Lines (.jsonl) file into a list, one parsed object per non-blank line.

    Lines that are not valid JSON are reported with their line number and skipped.
    A missing file is reported and yields an empty list.

    Note: this definition shadows the ``load_dataset`` imported from ``datasets``
    at the top of the notebook.
    """
    records = []
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            for line_number, raw in enumerate(handle, start=1):
                if not raw.strip():
                    continue
                try:
                    parsed = json.loads(raw)
                except json.JSONDecodeError:
                    print(f"Error: Could not decode JSON on line {line_number} in '{filename}'")
                else:
                    records.append(parsed)
    except FileNotFoundError:
        print(f"Error: Input file '{filename}' not found.")
        return []
    return records

# --- 2. LangChain Pydantic Model---
# Output schema passed to JsonOutputParser in main(); the parsed result is read
# through its "responses" key in process_conversation_async.
# (Documented with a comment rather than a docstring so the model definition
# itself stays unchanged.)
class DiverseResponses(BaseModel):
    responses: List[str] = Field(description="A list of exactly 4 diverse assistant responses.")

# --- 3. The Asynchronous Core Logic---
async def process_conversation_async(
    session,
    semaphore: asyncio.Semaphore,
    dataset: List[dict],
    conversation_index: int
) -> List[Dict]:
    """Augment one conversation with 4 extra assistant responses.

    Args:
        session: object with an async ``ainvoke(dict)`` method that returns a
            mapping containing a "responses" list (the LangChain chain in main()).
        semaphore: limits how many calls to ``session`` run concurrently.
        dataset: all conversations; each is expected to look like
            {"messages": [user turn, assistant turn, ...]}.
        conversation_index: index into ``dataset`` of the conversation to process.

    Returns:
        - the original conversation followed by 4 new user/assistant pairs on success;
        - just the original conversation when the call fails or does not yield
          exactly 4 string responses;
        - an empty list when the conversation itself is malformed, so a single bad
          record is skipped instead of raising and aborting the whole gather().
    """
    current_convo = dataset[conversation_index]

    # Validate the record before using it; this used to run outside any try block.
    try:
        turns = current_convo["messages"]
        user_query = turns[0]["content"]
        assistant_response = turns[1]["content"]
    except (KeyError, IndexError, TypeError):
        print(f"⚠️ Warning: Skipping malformed conversation at index {conversation_index}")
        return []

    async with semaphore:
        try:
            response_data = await session.ainvoke({
                "user_query": user_query,
                "assistant_response_example": assistant_response,
            })

            new_responses = response_data.get("responses", [])
            # Require a real list of 4 strings; a 4-character string would
            # otherwise pass a bare length check.
            is_valid = (
                isinstance(new_responses, list)
                and len(new_responses) == 4
                and all(isinstance(resp, str) for resp in new_responses)
            )
            if not is_valid:
                print(f"⚠️ Warning: Did not get 4 responses for query '{user_query}'")
                return [current_convo]

            result = [current_convo]
            for resp_content in new_responses:
                result.append({"messages": [{"role": "user", "content": user_query}, {"role": "assistant", "content": resp_content}]})
            return result

        except Exception as e:
            # Best effort: keep the original pair when the LLM call or parsing fails.
            print(f"Error processing query '{user_query}': {e}")
            return [current_convo]

# --- 4. Main Asynchronous Function---
async def main():
    """Augment every conversation in INPUT_FILENAME and write the result to OUTPUT_FILENAME.

    Builds a prompt | model | JSON-parser chain, schedules one
    process_conversation_async task per conversation (concurrency capped by
    MAX_CONCURRENT_REQUESTS), flattens the per-conversation results and writes
    them as JSON Lines. Returns early with a message if OPENAI_API_KEY is not set.
    """
    if os.environ.get("OPENAI_API_KEY") is None:
        print("Error: OPENAI_API_KEY environment variable not set.")
        return

    conversations = load_dataset(INPUT_FILENAME)

    # Chain pieces: parser (also supplies the format instructions), model, prompt.
    parser = JsonOutputParser(pydantic_object=DiverseResponses)

    chat_model = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.7,
        model_kwargs={"response_format": {"type": "json_object"}},
    )

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a data augmentation assistant. Your task is to generate new responses that are highly similar in tone, style, and informal nature to the example provided. The responses should be conversational and sound like a real person chatting. You must strictly output valid JSON."),
        ("user", """Learn my 'assistant' tone from this single example:
        User: "{user_query}"
        Assistant: "{assistant_response_example}"

        Now, generate exactly 4 new, and realistic assistant responses for the user query: **"{user_query}"**

        {format_instructions}""")
    ])

    chain = prompt.partial(format_instructions=parser.get_format_instructions()) | chat_model | parser

    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    total = len(conversations)
    print(f"🚀 Preparing to process all {total} conversations...")

    # One coroutine per conversation; gather runs them with a progress bar.
    pending = []
    for idx in range(total):
        pending.append(process_conversation_async(chain, limiter, conversations, idx))
    per_conversation = await tqdm_asyncio.gather(*pending)

    # Flatten the list of per-conversation result lists.
    final_data = []
    for group in per_conversation:
        final_data.extend(group)

    print(f"\n🎉 Augmentation complete. Saving {len(final_data)} total pairs to '{OUTPUT_FILENAME}'...")
    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry, ensure_ascii=False) + '\n' for entry in final_data)
    print("✅ Done.")

# --- Run the async main function ---
await main()


🚀 Preparing to process all 2756 conversations...


  1%|          | 23/2756 [00:05<08:48,  5.17it/s]



 34%|███▍      | 939/2756 [03:09<05:37,  5.39it/s]



 40%|████      | 1115/2756 [03:41<04:07,  6.64it/s]



 43%|████▎     | 1188/2756 [03:54<04:58,  5.25it/s]



 78%|███████▊  | 2136/2756 [06:50<01:33,  6.65it/s]



 82%|████████▏ | 2257/2756 [07:14<02:00,  4.14it/s]



 82%|████████▏ | 2270/2756 [07:16<01:35,  5.11it/s]



 83%|████████▎ | 2283/2756 [07:19<01:14,  6.35it/s]



100%|██████████| 2756/2756 [08:44<00:00,  5.25it/s]


🎉 Augmentation complete. Saving 13748 total pairs to '/content/final_whatsapp_dataset.jsonl'...
✅ Done.





**Fine Tuning Part using Unsloth**

In [None]:
# Load the base model and its tokenizer through Unsloth.
from unsloth import FastLanguageModel
import torch  # not referenced in this cell
max_seq_length = 2048 # Choose any!
dtype = None # None for auto detection
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Both returned objects are reused by the LoRA, training, inference and export cells below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.3.0+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.9 (you have 3.11.13)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.5: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the PEFT model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,  # Equal to r here (both are 32)
    lora_dropout = 0.05,  # Non-zero dropout; Unsloth's cell output warns this disables fast patching of the LoRA matrices
    bias = "none",
    use_gradient_checkpointing = "unsloth",  # Saves VRAM for long context
    random_state = 3407,
    use_rslora = False,  # rsLoRA option left disabled
    loftq_config = None
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.8.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
# Read the augmented JSONL written by main() in the augmentation cell
# (same path as OUTPUT_FILENAME there).
jsonl_path = "/content/final_whatsapp_dataset.jsonl"

# Keep only the "messages" list of each line; every line must be a JSON object with that key.
with open(jsonl_path, "r", encoding="utf-8") as f:
    conversations = [json.loads(line)["messages"] for line in f]

# Now conversations is a list like:
# [
#   [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello"}],
#   [{"role": "user", "content": "What's up?"}, {"role": "assistant", "content": "All good"}],
#   ...
# ]

# Single-column dataset: "messages" holds one conversation per row.
dataset = Dataset.from_dict({"messages": conversations})

In [None]:
# Rebind `tokenizer` to the version returned by get_chat_template for the "llama-3" template;
# formatting_prompts_func below relies on it via apply_chat_template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",  # Matches LLaMA 3.x style
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    ``examples["messages"]` is iterated as a list of conversations (each a list of
    role/content dicts). Returns {"text": [...]} with one untokenized string per
    conversation, without a trailing generation prompt.
    """
    rendered = []
    for convo in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Apply formatting to every conversation
dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/13748 [00:00<?, ? examples/s]

In [None]:
dataset[10]

{'messages': [{'content': 'Tumhari Branch ka Hai Bhavya Chhota chhota',
   'role': 'user'},
  {'content': 'Accha haan', 'role': 'assistant'}],
 'text': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nTumhari Branch ka Hai Bhavya Chhota chhota<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAccha haan<|eot_id|>'}

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the "text" column produced by formatting_prompts_func.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Packing is off; enabling it can make training faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 examples per optimizer step on one GPU
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,  # log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/13748 [00:00<?, ? examples/s]

**Training**

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 13,748 | Num Epochs = 1 | Total steps = 1,719
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)


Step,Training Loss
1,2.8522
2,3.0097
3,2.2106
4,2.8118
5,2.0973
6,2.708
7,2.4907
8,3.3978
9,3.0354
10,2.9569


**For Inferencing**

In [None]:
# Quick inference check with the fine-tuned model.
# (get_chat_template and FastLanguageModel are already imported by earlier cells.)
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer
from unsloth import FastLanguageModel

# 1. Chat template
# The mapping lets the messages below use "from"/"value" keys and "human"/"gpt" role names
# instead of the "role"/"content" keys used in the training data.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
)

# 2. Enable fast inference
FastLanguageModel.for_inference(model)

# 3. Example conversation
messages = [
    {"from": "human", "value": "aur bata"},
]

# 4. Tokenize
# add_generation_prompt=True ends the prompt with the assistant header so the model writes the reply.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# 5. Stream output
# TextStreamer prints tokens as they are generated; `output` keeps the generate() result.
text_streamer = TextStreamer(tokenizer)
output = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

aur bata<|eot_id|><|start_header_id|>assistant<|end_header_id|>

bata kya chal raha hai?<|eot_id|>


In [None]:
model.save_pretrained("lora_model") # Local saving

In [None]:
from google.colab import files
import shutil

# Compress the folder
shutil.make_archive("lora_model", 'zip', "lora_model")

# Download the zip
files.download("lora_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**The part below makes the model compatible with Ollama**

In [None]:
# Clone llama.cpp repo with submodules
!git clone --recursive https://github.com/ggerganov/llama.cpp

fatal: destination path 'llama.cpp' already exists and is not an empty directory.


In [None]:
%cd llama.cpp

/content/llama.cpp


In [None]:
# Compile the binaries
!mkdir build
%cd build
!cmake ..
!cmake --build . --config Release

/content/llama.cpp/build
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found version "4.5")
-- x86 detected
-- Adding CPU backen

In [None]:
%cd /content/

/content


In [None]:
model.save_pretrained_gguf("last_model", tokenizer)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.79 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [07:21<00:00, 13.81s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving last_model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving last_model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving last_model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving last_model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at last_model into q8_0 GGUF format.
The output location will be /content/last_model/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: last_model
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> Q8_0, shape =