In [None]:
import os

# Set process-wide switches in the first cell, before the Hugging Face libraries
# are imported further down the notebook.
os.environ.update(
    {
        # Intended to switch off Weights & Biases logging.
        "WANDB_DISABLED": "true",
        # NOTE(review): presumably meant to stop transformers from using PEFT —
        # confirm that the installed transformers version actually reads this flag.
        "TRANSFORMERS_NO_PEFT": "1",
    }
)


In [None]:
# Install the libraries this notebook relies on. transformers and accelerate are
# upgraded to their latest release (-U); datasets is pinned to 2.19.1.
# NOTE(review): the unpinned -U upgrades mean a re-run at a later date may pull
# different library versions — consider pinning exact versions.
# NOTE(review): bitsandbytes is installed but not imported in the visible cells — confirm it is needed.
!pip install -q \
  transformers -U \
  accelerate -U \
  datasets==2.19.1 \
  sentencepiece \
  bitsandbytes



In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook source: it ends up in version
# control and in shared copies. The token is read from the HF_TOKEN environment
# variable instead; when the variable is unset, `token=None` makes login() fall
# back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Register a dedicated [PAD] token. add_special_tokens() both adds the token to
# the vocabulary and binds it to tokenizer.pad_token / pad_token_id, so no manual
# assignment is needed afterwards.
# A separate pad token (rather than reusing EOS) keeps EOS distinct from padding,
# which matters later because the language-modeling collator masks pad positions
# out of the loss.
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # let accelerate place the model (GPU if available)
)

# The vocabulary grew by one token, so the embedding matrix must grow with it.
model.resize_token_embeddings(len(tokenizer))
# Record the pad id on the model config so generate() and any saved checkpoint
# agree with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id


In [None]:
# Sanity check: sample an answer from the model before any fine-tuning.
prompt = "What is AMD's Pollara 400' and how does it relate to its server offerings?\nA:"

# Named `inputs` (not `input`) so the built-in input() is not shadowed for the
# rest of the kernel session.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():  # inference only, no gradients needed
    outputs = model.generate(
        **inputs,                # input_ids + attention_mask
        max_new_tokens=50,       # upper bound on tokens generated after the prompt
        do_sample=True,          # sample from the distribution instead of greedy decoding
        temperature=0.7,         # values below 1 sharpen the sampling distribution
        pad_token_id=tokenizer.pad_token_id,  # explicit, so generate() does not have to guess one
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# from google.colab import files

# uploaded = files.upload()


In [None]:
import json

# Load the JSON file
with open("/kaggle/input/data.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# The Q&A records live in a list under the top-level key "amd_qa_data"
data = raw_data["amd_qa_data"]

# Print up to 3 examples to verify. Slicing (instead of indexing 0..2) keeps this
# from raising IndexError when the file holds fewer than three records.
for item in data[:3]:
    print(f"Q: {item['question']}")
    print(f"A: {item['answer']}")
    print("-" * 40)


In [None]:
# Extra packages for the data-augmentation cells (nltk supplies WordNet and the stop-word list).
# NOTE(review): googletrans is installed here but never imported in the visible cells, and
# transformers / sentencepiece are already installed by the first install cell —
# TODO confirm whether these entries can be dropped.
!pip install -q transformers nltk googletrans==4.0.0-rc1
!pip install -q transformers sentencepiece



In [None]:
import nltk
from nltk.corpus import wordnet, stopwords
import random

# Fetch the two NLTK corpora the augmentation helpers read from.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# English stop words; these are never chosen for synonym replacement.
stop_words = set(stopwords.words("english"))

def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list.

    Every lemma of every synset of ``word`` is collected, underscores are turned
    into spaces and the result is lower-cased. ``word`` itself is excluded.

    The list is sorted so that its order is stable between runs: iterating a set
    of strings has no guaranteed order, which would make a seeded
    ``random.choice`` over the result non-reproducible.

    Args:
        word: The word to look up (callers pass it lower-cased).

    Returns:
        A list of distinct synonym strings; empty when WordNet has none.
    """
    synonyms = {
        lemma.name().replace("_", " ").lower()
        for syn in wordnet.synsets(word)
        for lemma in syn.lemmas()
    }
    synonyms.discard(word)
    return sorted(synonyms)

def synonym_replace(text, n=2):
    """Replace up to ``n`` non-stop-words of ``text`` with a random WordNet synonym.

    The text is split on whitespace, candidate positions (words not in
    ``stop_words``) are visited in random order, and each candidate that has at
    least one synonym is replaced until ``n`` replacements have been made.

    Args:
        text: Input sentence.
        n: Maximum number of words to replace. ``n <= 0`` replaces nothing.

    Returns:
        The words re-joined with single spaces, with replacements applied.
    """
    words = text.split()
    new_words = words.copy()
    if n <= 0:
        # Nothing to replace; skip the shuffle and lookups entirely.
        return " ".join(new_words)

    indices = [i for i, word in enumerate(words) if word.lower() not in stop_words]
    random.shuffle(indices)

    replaced = 0
    for i in indices:
        # Check the budget before replacing, so the limit is never exceeded.
        if replaced >= n:
            break
        synonyms = get_synonyms(words[i].lower())
        if synonyms:
            new_words[i] = random.choice(synonyms)
            replaced += 1
    return " ".join(new_words)


In [None]:
from transformers import pipeline

def _load_translator(source_lang, target_lang):
    """Build the Helsinki-NLP opus-mt translation pipeline for one language direction."""
    return pipeline(
        f"translation_{source_lang}_to_{target_lang}",
        model=f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}",
        device_map="auto",
    )


# English -> French and French -> English, used together for back-translation.
translator_en_to_fr = _load_translator("en", "fr")
translator_fr_to_en = _load_translator("fr", "en")

def back_translate(text):
    """Paraphrase ``text`` by translating it to French and back to English.

    Both pipeline calls are made with ``max_length=512``.

    Args:
        text: English text to paraphrase.

    Returns:
        The ``translation_text`` of the French -> English result.
    """
    french_result = translator_en_to_fr(text, max_length=512)
    french_text = french_result[0]["translation_text"]
    english_result = translator_fr_to_en(french_text, max_length=512)
    return english_result[0]["translation_text"]


In [None]:
# Build the augmented training set: every source Q/A pair yields five records.
augmented_data = []
for count, item in enumerate(data, start=1):
    q = item["question"]
    a = item["answer"]

    # Back-translation is by far the most expensive step. The translators are
    # called without any sampling options, so (assuming the models' default
    # generation settings do not enable sampling) a second call on the same
    # answer returns the same text — compute it once and reuse it.
    aug_a = back_translate(a)

    augmented_data.extend(
        [
            # Original
            {"question": q, "answer": a},
            # Synonym-replaced question (up to 2 words)
            {"question": synonym_replace(q, n=2), "answer": a},
            # Back-translated answer
            {"question": q, "answer": aug_a},
            # Both question (up to 1 word) and answer augmented
            {"question": synonym_replace(q, n=1), "answer": aug_a},
            # Question converted to lowercase
            {"question": q.lower(), "answer": a},
        ]
    )

    if count % 10 == 0:
        print(f"{count } iteration complete.")
        


In [None]:
import random
# Now split
from sklearn.model_selection import train_test_split

# train_test_split already shuffles, and does so reproducibly via random_state.
# An extra unseeded in-place shuffle beforehand would change the input order on
# every run (and on every re-execution of this cell), making the split
# non-reproducible despite the fixed seed — so it is intentionally not done here.
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    augmented_data,
    test_size=0.1,
    random_state=SPLIT_SEED,
    shuffle=True,
)

# NOTE(review): augmented_data holds several variants of each source Q/A pair, so
# variants of the same pair can land in both splits and the eval loss will look
# optimistic. Splitting the source records before augmenting would avoid this.


In [None]:
# List what is available under /kaggle/input (useful for checking the data file location).
!ls /kaggle/input


In [None]:
# Report how many examples ended up in each split.
for split_name, split_rows in (("Train", train_data), ("Test", test_data)):
    print(f"{split_name} set size: {len(split_rows)}")

In [None]:
from datasets import Dataset, DatasetDict

# Turn the two lists of {"question", "answer"} records into Hugging Face Datasets.
train_dataset, test_dataset = (
    Dataset.from_list(rows) for rows in (train_data, test_data)
)

# Bundle both splits under the keys the later cells index by.
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# Show the split names, columns and row counts.
print(dataset)


In [None]:
def format_qa(example):
    """Render one Q/A record as a single training string.

    The layout is ``Q: <question>``, a newline, ``A: <answer>``, followed
    directly by the tokenizer's EOS token.

    Args:
        example: Mapping with ``question`` and ``answer`` entries.

    Returns:
        A dict with a single ``text`` entry holding the rendered string.
    """
    question = example["question"]
    answer = example["answer"]
    rendered = f"Q: {question}\nA: {answer}{tokenizer.eos_token}"
    return {"text": rendered}

# Apply formatting to both splits
formatted_dataset = dataset.map(format_qa)

# Preview one formatted training example. The index is clamped to the last row
# so a training split with 1500 rows or fewer does not raise IndexError.
preview_index = min(1500, len(formatted_dataset["train"]) - 1)
print(formatted_dataset["train"][preview_index])


In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` column of a batch of formatted examples.

    No padding is applied here. This function is used with a batched ``map``, so
    padding at this stage would stretch every row to the longest row of its map
    chunk and store that padding permanently; the data collator pads each
    training batch on the fly instead.

    ``labels`` are deliberately not created here either: the
    DataCollatorForLanguageModeling (with mlm=False) builds them.

    Args:
        example: Batch mapping whose ``text`` entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's maximum length.
    """
    return tokenizer(
        example["text"],
        padding=False,
        truncation=True,
    )


tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)

# Drop the raw string columns so only the tokenizer outputs remain.
tokenized_dataset = tokenized_dataset.remove_columns(["question", "answer", "text"])

# Inspect to confirm only numerical columns remain (and no 'labels' yet)
print("\n--- Inspecting tokenized_dataset after removing text columns ---")
first_example_keys = tokenized_dataset["train"][0].keys()
print(f"Keys in first example: {first_example_keys}")
# Expected: dict_keys(['input_ids', 'attention_mask']) or similar numerical ones.
# 'labels' should still NOT be here.

# Show the token IDs of one training row. The index is clamped to the last row so
# a training split with 1100 rows or fewer does not raise IndexError.
sample_index = min(1100, len(tokenized_dataset["train"]) - 1)
print(tokenized_dataset["train"][sample_index]["input_ids"])



In [None]:
from transformers import TrainingArguments, TrainerCallback
from transformers import EarlyStoppingCallback

# Step 1: Training arguments
training_args = TrainingArguments(
    # Checkpoint directory.
    # NOTE(review): the name mentions Gemma, but the model loaded in this notebook is GPT-2.
    output_dir="./gemma-finetuned",
    eval_strategy="epoch",              # evaluate after every epoch
    save_strategy="epoch",              # checkpoint after every epoch (must match eval_strategy for best-model loading)
    load_best_model_at_end=True,        # reload the checkpoint with the best metric when training ends
    metric_for_best_model="eval_loss",  # metric watched by best-model selection and early stopping
    greater_is_better=False,            # lower eval_loss is better
    logging_strategy="steps",
    logging_steps=10,                   # log every 10 optimizer steps
    # Full fine-tuning of every weight: use the library's conservative default
    # rate. The previous value of 1e-3 was 20x higher, which contradicts the
    # "small / safe start" intent and risks unstable training.
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,      # effective train batch per device = 2 * 4 = 8
    # Upper bound only: the EarlyStoppingCallback passed to the Trainer is
    # expected to end training long before 100 epochs.
    num_train_epochs=100,
    weight_decay=0.01,                  # mild regularisation
    logging_dir="./logs",               # for TensorBoard
    push_to_hub=False,                  # do not upload during training
    report_to="none",                   # do not report to external services
    lr_scheduler_type="linear",         # linear decay of the learning rate
    # Keep at most 3 checkpoints on disk (the best one is retained as well
    # because load_best_model_at_end=True).
    save_total_limit=3,
)

# Step 2 (Optional): custom callback that echoes the training loss
class LossLoggerCallback(TrainerCallback):
    """Print the training loss whenever a log record that contains one is emitted."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print ``Step <global_step> - loss: <loss>`` for log records carrying a ``loss`` entry.

        Records without a ``loss`` key (or an empty/missing ``logs``) are ignored.
        """
        if not logs or "loss" not in logs:
            return
        print(f"Step {state.global_step} - loss: {logs['loss']:.4f}")


In [None]:
# !pip uninstall peft -y
from transformers import DataCollatorForLanguageModeling, Trainer



# Collator for causal language modeling (mlm=False): batches variable-length
# examples and adds the `labels` tensor.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("\n--- Inspecting DataCollatorForLanguageModeling output ---")
train_split = tokenized_dataset["train"]
sample_batch_items = [train_split[i] for i in range(2)]  # first two training rows
collated_batch = data_collator(sample_batch_items)

print(f"Collated batch keys: {collated_batch.keys()}")  # should now include 'labels'
for title, key in (("Input IDs", "input_ids"), ("Labels", "labels")):
    print(f"{title} shape: {collated_batch[key].shape}")

# Side-by-side look at the first 20 positions of the first sample.
for key in ("input_ids", "labels"):
    print(f"\nFirst sample {key} (first 20 tokens):")
    print(collated_batch[key][0, :20])

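# Expect: labels are a copy of input_ids with every padding position replaced by -100.
# The collator does NOT shift the labels; the causal LM shifts them internally when it computes the loss.
# Example (simplified):
# input_ids:  [1, 2, 3, 4, 0, 0] (0=pad_token_id)
# labels   :  [1, 2, 3, 4, -100, -100] (for mlm=False; -100 positions are ignored by the loss)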

import inspect

# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the old `tokenizer` argument. Pick whichever name the installed
# Trainer accepts so this cell runs on both.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    callbacks=[
        LossLoggerCallback(),
        # Stop once the monitored metric has not improved for 2 consecutive evaluations.
        EarlyStoppingCallback(early_stopping_patience=2),
    ],
    **{_tokenizer_kwarg: tokenizer},
)

# Start training
trainer.train()


In [None]:
# Persist the trained weights and the tokenizer side by side so the directory
# can be reloaded with from_pretrained().
output_dir = "/kaggle/working/final_model"
fine_tuned_model = trainer.model
fine_tuned_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Reload the saved checkpoint for inference.
model_path = "/kaggle/working/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu") # Or device_map="auto"

# Keep the model config in sync with the tokenizer's special-token ids.
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

from transformers import LogitsProcessorList, NoBadWordsLogitsProcessor

# Stop the model from emitting another "A:" marker inside its answer.
# bad_words_ids is a list of token *sequences*: the whole encoding of "A:" is
# passed as ONE sequence. Wrapping each token in its own list would instead ban
# every token of "A:" individually (e.g. the colon could never be generated).
bad_words = [tokenizer.encode("A:", add_special_tokens=False)]
processor = LogitsProcessorList(
    [NoBadWordsLogitsProcessor(bad_words_ids=bad_words, eos_token_id=tokenizer.eos_token_id)]
)


# Assuming model and tokenizer are loaded as before
# model_path = "./my_fine_tuned_gpt2_chatbot_model"
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_chatbot_response(user_input, max_new_tokens=200):
    """Generate the model's answer to ``user_input``.

    The question is wrapped in the same ``Q: ...`` / ``A:`` layout used for
    fine-tuning, sampled from the module-level ``model`` (temperature 0.2, with
    the module-level ``processor`` applied to the logits), and only the newly
    generated part is decoded.

    Args:
        user_input: The question text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped.
    """
    prompt = f"Q: {user_input}\nA:"

    # The tokenizer returns a BatchEncoding; pull out the tensors and move them
    # to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt")
    device = model.device
    prompt_ids = encoded["input_ids"].to(device)
    prompt_mask = encoded["attention_mask"].to(device)

    generation_kwargs = dict(
        input_ids=prompt_ids,
        attention_mask=prompt_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        logits_processor=processor,
    )
    with torch.no_grad():
        generated = model.generate(**generation_kwargs)

    # generate() returns prompt + continuation; keep only what follows the prompt.
    prompt_length = prompt_ids.shape[-1]
    continuation = generated[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

# Example usage:
print(generate_chatbot_response("What is Instinct series in AMD?"))

In [None]:
# Self-contained inference example: reload the checkpoint and answer one question.
model_path = "/kaggle/working/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu") # Or device_map="auto"
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
from transformers import LogitsProcessorList, NoBadWordsLogitsProcessor

# Ban the token *sequence* "A:" so the model does not start a second answer.
# The full encoding is passed as one sequence; one-token lists would ban each
# token of "A:" on its own (e.g. no colon could ever be generated).
bad_words = [tokenizer.encode("A:", add_special_tokens=False)]
processor = LogitsProcessorList(
    [NoBadWordsLogitsProcessor(bad_words_ids=bad_words, eos_token_id=tokenizer.eos_token_id)]
)

user_input = "I want to play AAA games, what system should i get?"
prompt = f"Q: {user_input}\nA:"
# Despite its name this is the full tokenizer output (input_ids + attention_mask).
input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
input_length = input_ids["input_ids"].shape[1]  # number of prompt tokens

with torch.no_grad():
    outputs = model.generate(
        **input_ids,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        no_repeat_ngram_size=10,    # reduce repetition
        logits_processor=processor,
        top_p=0.4,
    )

# Slice off the prompt part and decode only the generated continuation
generated_tokens = outputs[0][input_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

print(response)

In [None]:
# Archive the saved model directory into final_model.zip in the current working
# directory. The leading "!" runs the command in a shell from the notebook cell.
!zip -r final_model.zip /kaggle/working/final_model



In [None]:
# Publish the fine-tuned model to the "AMD-Bot" repository on the Hugging Face Hub.
# Trainer.push_to_hub() treats its first positional argument as the commit
# message and derives the repository name from output_dir, so the repository is
# named explicitly here through the model's and tokenizer's own push_to_hub().
trainer.model.push_to_hub("AMD-Bot")
tokenizer.push_to_hub("AMD-Bot")