In [None]:
pip install requests




In [None]:
!pip install langchain_mistralai

Collecting langchain_mistralai
  Downloading langchain_mistralai-0.2.11-py3-none-any.whl.metadata (2.0 kB)
Downloading langchain_mistralai-0.2.11-py3-none-any.whl (16 kB)
Installing collected packages: langchain_mistralai
Successfully installed langchain_mistralai-0.2.11


In [4]:
import json, re
from collections import Counter

# ==== CONFIG ====
IN_JSONL  = "/content/resumes.jsonl"   # <- change path to your file
OUT_JSONL = "/content/qa_clean_user_assistant.jsonl"

def first_key(d):
    return next(iter(d.keys())) if isinstance(d, dict) and d else None

def get_from_messages(obj):
    """
    Handle chat-like records:
    {"messages": [{"role":"system","content":"..."},{"role":"user","content":"..."},{"role":"assistant","content":"..."}]}
    We take the LAST user turn and the FIRST assistant after it.
    """
    msgs = obj.get("messages")
    if not isinstance(msgs, list) or not msgs:
        return None, None
    # Grab last user content
    user_text = None
    for m in reversed(msgs):
        if isinstance(m, dict) and m.get("role", "").lower() == "user":
            user_text = str(m.get("content", "")).strip()
            break
    if not user_text:
        return None, None
    # Find assistant after that user in original order
    assistant_text = None
    seen_user = False
    for m in msgs:
        if isinstance(m, dict) and m.get("role", "").lower() == "user" and str(m.get("content","")).strip() == user_text:
            seen_user = True
            continue
        if seen_user and isinstance(m, dict) and m.get("role","").lower() == "assistant":
            assistant_text = str(m.get("content","")).strip()
            break
    return user_text, assistant_text

def extract_qa(obj):
    """
    Try multiple common schemas to get (user, assistant).
    Return (None, None) if not found.
    """
    # 1) messages schema
    u, a = get_from_messages(obj)
    if u and a:
        return u, a

    # 2) direct keys (try common variants)
    key_variants = [
        ("user", "assistant"),
        ("question", "answer"),
        ("input", "output"),
        ("prompt", "response"),
        ("instruction", "output"),
        ("human", "bot"),
        ("Human", "Bot"),
        ("User", "Assistant"),
    ]
    for uk, ak in key_variants:
        u = str(obj.get(uk, "") or "").strip()
        a = str(obj.get(ak, "") or "").strip()
        if u and a:
            return u, a

    # 3) some datasets use a single "text" with separators
    t = str(obj.get("text","") or "").strip()
    if t:
        # Try simple splits like "User: ...\nAssistant: ..."
        m = re.search(r"User:\s*(.+?)\s*Assistant:\s*(.+)", t, flags=re.S|re.I)
        if m:
            return m.group(1).strip(), m.group(2).strip()

    return None, None

# ---- PASS 1: scan keys to help debugging
key_counter = Counter()
lines = []
with open(IN_JSONL, "r", encoding="utf-8") as f:
    for ln, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
            lines.append(obj)
            key_counter.update(obj.keys())
        except Exception as e:
            # skip bad lines
            continue

print("Total lines parsed:", len(lines))
print("Top keys seen:", key_counter.most_common(20))

# ---- PASS 2: extract pairs
pairs = []
for obj in lines:
    u, a = extract_qa(obj)
    if u and a:
        pairs.append({"prompt": f"User: {u}\nAssistant:", "completion": a})

print("Valid pairs found:", len(pairs))
print("Sample pairs:", pairs[:3])

# If still zero, print a few raw samples to inspect
if len(pairs) == 0:
    print("\nNo pairs extracted. Here are 3 raw records to inspect keys/shapes:\n")
    for rec in lines[:3]:
        print(json.dumps(rec, ensure_ascii=False)[:800], "\n---")
else:
    # write cleaned JSONL
    with open(OUT_JSONL, "w", encoding="utf-8") as w:
        for r in pairs:
            w.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"\nWrote cleaned dataset to: {OUT_JSONL}")


Total lines parsed: 455
Top keys seen: [('messages', 455)]
Valid pairs found: 455
Sample pairs: [{'prompt': 'User: Why is it important to stay updated with the latest advancements in technology and industry trends?\nAssistant:', 'completion': 'Staying updated with the latest advancements in technology and industry trends is important because it helps professionals remain competitive, adapt to changes, and leverage new tools or methods to improve efficiency. It also enables individuals and organizations to anticipate market shifts, innovate, and make informed decisions that can lead to better project outcomes and business growth.'}, {'prompt': 'User: How does continuous learning benefit professionals in their careers?\nAssistant:', 'completion': 'Continuous learning benefits professionals by enhancing their skills, keeping them relevant in a rapidly changing job market, and opening up new career opportunities. It also fosters adaptability, problem-solving abilities, and personal growth,

In [5]:
!head -n 3 /content/qa_clean_user_assistant.jsonl


{"prompt": "User: Why is it important to stay updated with the latest advancements in technology and industry trends?\nAssistant:", "completion": "Staying updated with the latest advancements in technology and industry trends is important because it helps professionals remain competitive, adapt to changes, and leverage new tools or methods to improve efficiency. It also enables individuals and organizations to anticipate market shifts, innovate, and make informed decisions that can lead to better project outcomes and business growth."}
{"prompt": "User: How does continuous learning benefit professionals in their careers?\nAssistant:", "completion": "Continuous learning benefits professionals by enhancing their skills, keeping them relevant in a rapidly changing job market, and opening up new career opportunities. It also fosters adaptability, problem-solving abilities, and personal growth, which are valuable assets in any profession."}
{"prompt": "User: What are some effective ways to 

In [6]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="/content/qa_clean_user_assistant.jsonl", split="train")
print(dataset[0])


{'prompt': 'User: Why is it important to stay updated with the latest advancements in technology and industry trends?\nAssistant:', 'completion': 'Staying updated with the latest advancements in technology and industry trends is important because it helps professionals remain competitive, adapt to changes, and leverage new tools or methods to improve efficiency. It also enables individuals and organizations to anticipate market shifts, innovate, and make informed decisions that can lead to better project outcomes and business growth.'}


In [13]:
# ===== Minimal, version-proof LoRA SFT (no TRL needed) =====
# Works with plain Hugging Face Transformers + PEFT.
# Uses completion-only loss by masking prompt tokens to -100.

!pip -q install "transformers>=4.31.0" "datasets>=2.14.0" "peft>=0.6.0" accelerate

import json, random
from dataclasses import dataclass
from typing import List, Dict, Any
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments
)
from peft import LoraConfig, get_peft_model

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

DATA_JSONL = "/content/qa_clean_user_assistant.jsonl"   # path to cleaned file
MODEL_NAME = "facebook/opt-350m"                        # try a small model first
OUTPUT_DIR = "/content/qa_lora_out"
MAX_LEN = 512
BATCH = 2
EPOCHS = 1
LR = 2e-4

# ---------- Load dataset ----------
ds = load_dataset("json", data_files=DATA_JSONL, split="train")
splits = ds.train_test_split(test_size=0.1, seed=SEED)
train_ds, val_ds = splits["train"], splits["test"]

# ---------- Tokenizer & Base Model ----------
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# ensure pad token for decoder-only models
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

base = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# --


In [15]:
pip install -U "transformers>=4.31.0"


Collecting transformers>=4.31.0
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers>=4.31.0)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m117.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.4
    Uninstalling tokenizers-0.21.4:
      Successfully unins

In [5]:
# ===== LoRA SFT on user/assistant JSONL — legacy-friendly (no evaluation_strategy) =====
# - Uses plain HF Trainer (+ PEFT LoRA)
# - Completion-only loss (prompt masked with -100)
# - Manual eval AFTER training (no eval during training)

import json, random
from dataclasses import dataclass
from typing import List, Dict
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

SEED = 42
random.seed(SEED); torch.manual_seed(SEED)

# === Paths & hyperparams ===
DATA_JSONL = "/content/qa_clean_user_assistant.jsonl"   # <- your cleaned file (prompt, completion)
MODEL_NAME = "facebook/opt-350m"                        # try small first; swap later
OUTPUT_DIR = "/content/qa_lora_out"
MAX_LEN = 512
BATCH = 2
EPOCHS = 3
LR = 2e-4

# === Load & split dataset ===
ds = load_dataset("json", data_files=DATA_JSONL, split="train")
splits = ds.train_test_split(test_size=0.1, seed=SEED)
train_ds, val_ds = splits["train"], splits["test"]

# === Tokenizer & base model ===
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
base = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# === Apply LoRA (targets for OPT; adjust for other archs if needed) ===
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","out_proj","fc1","fc2"]
)
model = get_peft_model(base, lora_cfg)

# === PyTorch Dataset that masks the prompt (completion-only loss) ===
class ChatDataset(Dataset):
    def __init__(self, hf_split, tokenizer: AutoTokenizer, max_len: int):
        self.data = hf_split
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        ex = self.data[idx]
        prompt = ex["prompt"]               # e.g., "User: ...\nAssistant:"
        completion = ex["completion"]       # assistant text

        prompt_ids = self.tok(prompt, add_special_tokens=False)["input_ids"]
        # leading space helps separation
        completion_ids = self.tok(" " + completion, add_special_tokens=False)["input_ids"]

        input_ids = prompt_ids + completion_ids + [self.tok.eos_token_id]
        labels    = [-100]*len(prompt_ids) + completion_ids + [self.tok.eos_token_id]

        # left-truncate if too long (keep tail)
        if len(input_ids) > self.max_len:
            input_ids = input_ids[-self.max_len:]
            labels    = labels[-self.max_len:]

        attention_mask = [1]*len(input_ids)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

train_torch = ChatDataset(train_ds, tok, MAX_LEN)
val_torch   = ChatDataset(val_ds, tok, MAX_LEN)

# === Simple padding collator ===
@dataclass
class PadCollator:
    pad_id: int
    def __call__(self, batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
        maxlen = max(x["input_ids"].size(0) for x in batch)
        def pad_vec(v, pad_val):
            pad_len = maxlen - v.size(0)
            if pad_len <= 0: return v
            return torch.cat([v, torch.full((pad_len,), pad_val, dtype=v.dtype)], dim=0)

        input_ids = torch.stack([pad_vec(x["input_ids"], self.pad_id) for x in batch])
        attention = torch.stack([pad_vec(x["attention_mask"], 0) for x in batch])
        labels    = torch.stack([pad_vec(x["labels"], -100) for x in batch])
        return {"input_ids": input_ids, "attention_mask": attention, "labels": labels}

collator = PadCollator(pad_id=tok.pad_token_id)

# === TrainingArguments (no evaluation_strategy used) ===
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    gradient_accumulation_steps=1,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=train_torch,
    eval_dataset=val_torch,   # we'll call trainer.evaluate() manually
)

# === Train ===
trainer.train()

# === Manual evaluation AFTER training ===
metrics = trainer.evaluate()   # uses val_torch
print("Eval metrics:", metrics)

# === Save LoRA adapters & tokenizer ===
trainer.model.save_pretrained(OUTPUT_DIR)
tok.save_pretrained(OUTPUT_DIR)
print("Saved LoRA adapters to:", OUTPUT_DIR)

# === Minimal inference helper (loads base + adapters) ===
from peft import PeftModel
def chat(user_text: str, max_new_tokens=128, temperature=0.2):
    base = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    peft = PeftModel.from_pretrained(base, OUTPUT_DIR)
    peft.eval()
    prompt = f"User: {user_text}\nAssistant:"
    inputs = tok(prompt, return_tensors="pt").to(peft.device)
    with torch.no_grad():
        out = peft.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, temperature=temperature)
    text = tok.decode(out[0], skip_special_tokens=True)
    return text.split("Assistant:")[-1].strip()

print(chat("what is usuer asking"))


Step,Training Loss
20,2.4183
40,2.464
60,2.34
80,2.4735
100,2.3567
120,2.4022
140,2.3099
160,2.2081
180,2.25
200,2.1693


Eval metrics: {'eval_loss': 2.107499361038208, 'eval_runtime': 1.4201, 'eval_samples_per_second': 32.392, 'eval_steps_per_second': 16.196, 'epoch': 3.0}
Saved LoRA adapters to: /content/qa_lora_out
Usuer is asking for a password manager.


In [7]:
print(chat("How can I find AbdulMuiz Shaikh on LinkedIn?"))

You can find AbdulMuiz Shaikh on LinkedIn by searching for his name, LinkedIn profile, and LinkedIn profile link.


In [9]:
print(chat("What is the role of OpenAI's Whisper API in the context of autonumous vehicles"))

OpenAI's Whisper API is a tool that allows automakers to communicate with Autonumous vehicles via voice commands. This allows automakers to communicate with Autonumous vehicles via voice commands, such as "Hey, I'm driving a car." The Whisper API is used to communicate with Autonumous vehicles via voice commands, such as "Hey, I'm driving a car."


In [10]:
print(chat("What field does AbdulMuiz Shaikh work in?"))

AbdulMuiz Shaikh works in the field of Information Technology.
