In [None]:
!pip install langchain transformers sentence-transformers faiss-cpu evaluate --quiet
!pip install -U langchain-community --quiet
!pip install -U datasets fsspec huggingface_hub --quiet
!pip install evaluate rouge_score --quiet

In [None]:
from datasets import load_dataset , Dataset
import os
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from evaluate import load
from random import sample
# from langchain.memory import ConversationSummaryMemory

# Zero-Shot vs Chain-of-Thought Prompting with Evaluation

## Step 1: Load and Prepare Dataset

In [None]:

# SQuAD v2 supplies both the retrieval corpus (train contexts) and held-out
# questions (validation). Duplicate contexts are removed later, when the
# FAISS index is built, so the raw splits are kept untouched here.
dataset = load_dataset("squad_v2")
train_set, val_set = dataset["train"], dataset["validation"]

print(f"Train set size (for FAISS): {len(train_set)}")
print(f"Validation set size (for testing): {len(val_set)}")


Train set size (for FAISS): 130319
Validation set size (for testing): 11873


## Step 2: Build FAISS Index from Train Set

In [None]:
# os.makedirs("data", exist_ok=True)
# with open("data/docs.txt", "w", encoding="utf-8") as f:
#     for item in train_set:
#         f.write(item['context'] + "\n\n")

# Many questions share the same context paragraph, so keep each distinct
# context once. dict.fromkeys preserves first-seen order.
unique_contexts = dict.fromkeys(item["context"] for item in train_set)
docs = [Document(page_content=text) for text in unique_contexts]

print(f"Number of Contexts: {len(docs)}")

# Embed every unique context and index it; the retriever returns only the
# single nearest context (k=1) for a query.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(docs, embeddings)
retriever = db.as_retriever(search_kwargs={"k": 1})


Number of Contexts: 19029


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Step 3: Load Model

In [None]:



# Seq2seq checkpoint used as the generator for every chain in this notebook.
model_name = "google/flan-t5-base"  #"google/flan-t5-small"

# Wrap the model in a text2text-generation pipeline and expose it to LangChain.
# NOTE(review): device=0 pins the pipeline to the first CUDA device, so this
# cell presumably fails on a CPU-only runtime -- confirm before running off-GPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512, device=0)
llm = HuggingFacePipeline(pipeline=pipe)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


## Step 4: Zero-Shot Prompting with Memory

In [None]:
# Zero-shot conversational chain: retrieval-augmented QA whose chat history is
# stored in `memory_zs` and fed back on every turn.
#
# ConversationBufferMemory keeps the whole history and has no window size; the
# former `k=5` argument is not a field of this class and appears to have been
# silently ignored (the cell ran without error), so it is dropped to avoid
# implying a 5-turn window. `output_key` is passed to the constructor instead
# of being patched on afterwards; it tells the memory which chain output to
# record, since the chain also returns "source_documents".
memory_zs = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)
zero_shot_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory_zs,
    return_source_documents=True,
)

# memory = ConversationSummaryMemory(
#     llm=llm, memory_key="chat_history", return_messages=True
# )


  memory_zs = ConversationBufferMemory(memory_key="chat_history",


## Step 5: Chain-of-Thought Prompting with Memory

In [None]:
# The chain-of-thought chain is configured exactly like the zero-shot one but
# owns a separate memory; the step-by-step scaffold is added to the question
# text at chat time (see chat_with_model).
memory_cot = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",  # store only the answer, not the source documents
)
cot_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory_cot,
    return_source_documents=True,
)

## Step 6: Evaluate with ROUGE

In [None]:


# Memory-free chains for batch evaluation: every question is answered
# independently so that earlier questions cannot leak into later ones.
stateless_chain_zs = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False
)

cot_prompt = PromptTemplate.from_template(
    """Question: {question}
Context {context}
Let's think step by step:
1. The question is asking about [identify what the question seeks].
2. The most relevant parts of the context appear to be [cite relevant text snippets].
3. These suggest the answer might be [preliminary hypothesis] because [reasoning].
4. After checking, this makes sense because [supporting evidence].
5. Alternative possibilities like [other options] are less likely because [reasons].

Final Answer: [concise answer directly from context]"""
)

stateless_chain_cot = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False,
    chain_type_kwargs={"prompt": cot_prompt}
)

# Evaluation subset. Questions are drawn from the training split, i.e. the
# split whose contexts were indexed in FAISS. The shuffle is seeded so the
# reported ROUGE scores can be reproduced, and the sample size is capped at
# the dataset size.
EVAL_SAMPLE_SIZE = 5000
EVAL_SEED = 42
val_eval = train_set.shuffle(seed=EVAL_SEED).select(
    range(min(EVAL_SAMPLE_SIZE, len(train_set)))
)

references, zs_preds, cot_preds = [], [], []

for item in val_eval:
    question = item["question"]
    gold_answers = item["answers"]["text"]
    # SQuAD v2 contains unanswerable questions with an empty answer list.
    references.append(gold_answers[0] if gold_answers else "unanswerable")

    # `Chain.run` is deprecated; RetrievalQA takes its input under "query"
    # and returns the generated text under "result".
    zs_out = stateless_chain_zs.invoke({"query": question})["result"]
    cot_out = stateless_chain_cot.invoke({"query": question})["result"]

    zs_preds.append(zs_out.strip())
    cot_preds.append(cot_out.strip())

  zs_out = stateless_chain_zs.run(q)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Score both prompting strategies against the same gold answers.
rouge = load("rouge")

zs_rouge = rouge.compute(references=references, predictions=zs_preds)
print("Zero-Shot Prompting Evaluation", zs_rouge, sep="\n")

cot_rouge = rouge.compute(references=references, predictions=cot_preds)
print("\n Chain-of-Thought Prompting Evaluation", cot_rouge, sep="\n")


Zero-Shot Prompting Evaluation
{'rouge1': np.float64(0.3955183836129723), 'rouge2': np.float64(0.216390096911324), 'rougeL': np.float64(0.39495151876728785), 'rougeLsum': np.float64(0.3950223888670446)}

 Chain-of-Thought Prompting Evaluation
{'rouge1': np.float64(0.3306889086575711), 'rouge2': np.float64(0.1991289753366826), 'rougeL': np.float64(0.32931783550986715), 'rougeLsum': np.float64(0.32974949243576857)}


## Step 7: Interactive Testing with Memory

In [None]:

# Scaffold wrapped around the user's question in Chain-of-Thought mode.
_COT_QUESTION_TEMPLATE = """Question: {question}

Let's think step by step:
1. The question is asking about [identify what the question seeks].
2. The most relevant parts of the context appear to be [cite relevant text snippets].
3. These suggest the answer might be [preliminary hypothesis] because [reasoning].
4. After checking, this makes sense because [supporting evidence].
5. Alternative possibilities like [other options] are less likely because [reasons].

Final Answer: [concise answer directly from context]"""


def chat_with_model(chain, mode="Zero-Shot"):
    """Run an interactive question/answer loop against a conversational chain.

    Args:
        chain: Object exposing ``invoke({"question": ...}, return_only_outputs=True)``
            that returns a mapping with "answer" and "source_documents", and
            optionally a ``memory`` attribute with a ``clear()`` method.
        mode: "Chain-of-Thought" wraps each question in the step-by-step
            scaffold; any other value sends the question unchanged.

    The loop ends when the user types "exit" or "quit" (case-insensitive,
    surrounding whitespace ignored). Nothing is returned; answers and their
    source documents are printed.
    """
    print(f"\n[{mode} Mode] Start chatting (type 'exit' to quit):")

    # Reset the history of the chain actually being used, rather than picking
    # a global memory object from the `mode` string (which cleared the wrong
    # memory whenever chain and mode did not match).
    memory = getattr(chain, "memory", None)
    if memory is not None:
        memory.clear()

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() in ("exit", "quit"):
            print("Exiting chat.")
            break

        if mode == "Chain-of-Thought":
            question = _COT_QUESTION_TEMPLATE.format(question=user_input)
        else:
            question = user_input

        # Both modes request outputs only, so the response has the same shape.
        response = chain.invoke({"question": question}, return_only_outputs=True)

        print("Bot:", response["answer"])
        print("Sources:")
        for i, doc in enumerate(response["source_documents"]):
            print(f"Source {i}: {doc.page_content}")



In [None]:
chat_with_model(zero_shot_chain, mode="Zero-Shot")

In [None]:
chat_with_model(cot_chain, mode="Chain-of-Thought")

# Finetuning Experiment:

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from evaluate import load

In [None]:
# Natural Questions (open-domain variant) for the closed-book fine-tuning
# experiment. Note that this rebinds `dataset`, which previously held SQuAD v2.
dataset = load_dataset("nq_open")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

README.md:   0%|          | 0.00/8.77k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87925 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3610 [00:00<?, ? examples/s]

In [None]:
len(eval_dataset)

3610

In [None]:
# Base checkpoint for the fine-tuning experiments; its tokenizer is used by
# `preprocess` below and rebinds the `tokenizer` name defined in Step 3.
model_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess(example):
    """Turn one question/answer record into tokenized seq2seq features.

    Args:
        example: Mapping with a "question" string and an "answer" that is
            either a string or a list of strings. Only the first list entry
            is used; an empty list becomes the literal "unanswerable".

    Returns:
        The tokenizer output for the prompt (padded/truncated to 384 tokens)
        with an added "labels" list of 32 target token ids in which padding
        positions are replaced by -100.
    """
    question = example["question"]

    answer = example["answer"]
    if isinstance(answer, list):
        answer = answer[0] if len(answer) > 0 else "unanswerable"

    prompt = f"Answer the question based on your knowledge.\nQuestion: {question}"

    model_input = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=384,
    )

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager; when only a target is given, the returned encoding's
    # "input_ids" are the target ids.
    target = tokenizer(
        text_target=answer,
        truncation=True,
        padding="max_length",
        max_length=32,
    )

    # -100 is written over padding positions so they are excluded from the loss.
    pad_id = tokenizer.pad_token_id
    model_input["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in target["input_ids"]
    ]

    return model_input




# Tokenize both splits with identical settings. The raw text columns are
# dropped so only model features remain, and the datasets cache is bypassed
# so edits to `preprocess` always take effect.
train_enc, eval_enc = (
    split.map(preprocess, remove_columns=split.column_names, load_from_cache_file=False)
    for split in (train_dataset, eval_dataset)
)


Map:   0%|          | 0/87925 [00:00<?, ? examples/s]



Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

In [None]:
print(train_enc[0]["labels"])

[12225, 23, 15, 28196, 8220, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator shared by both fine-tuning runs. No model is attached, and
# padding=True pads each batch to its longest member.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, model=None)


## Partial Finetuning:

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
# Partial fine-tuning: freeze everything except decoder blocks 10 and 11 and
# the LM head, then train on the encoded NQ-open data.
TRAINABLE_NAME_PARTS = ("decoder.block.10", "decoder.block.11", "lm_head")

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
for name, param in model.named_parameters():
    param.requires_grad = any(part in name for part in TRAINABLE_NAME_PARTS)
    if param.requires_grad:
        print(name)

# fp16 mixed precision is intentionally NOT enabled. The previous run with
# fp16=True logged a training loss of 0.0 at step 500, which is consistent
# with the numerical overflow T5-family checkpoints are known to suffer under
# fp16. Full precision is slower but trains stably; on hardware with bfloat16
# support, `bf16=True` is the safe mixed-precision alternative.
args_partial = Seq2SeqTrainingArguments(
    output_dir="./nq_partial_finetune",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    save_strategy="epoch",
    predict_with_generate=True,
    logging_dir="./logs_partial",
    report_to="none",
    generation_max_length=32,
    generation_num_beams=4,
    fp16=False
)
trainer_partial = Seq2SeqTrainer(
    model=model,
    args=args_partial,
    train_dataset=train_enc,
    eval_dataset=eval_enc,
    tokenizer=tokenizer,
    data_collator=collator,
)

trainer_partial.train()
trainer_partial.save_model("flan_nq_partial")


decoder.block.10.layer.0.SelfAttention.q.weight
decoder.block.10.layer.0.SelfAttention.k.weight
decoder.block.10.layer.0.SelfAttention.v.weight
decoder.block.10.layer.0.SelfAttention.o.weight
decoder.block.10.layer.0.layer_norm.weight
decoder.block.10.layer.1.EncDecAttention.q.weight
decoder.block.10.layer.1.EncDecAttention.k.weight
decoder.block.10.layer.1.EncDecAttention.v.weight
decoder.block.10.layer.1.EncDecAttention.o.weight
decoder.block.10.layer.1.layer_norm.weight
decoder.block.10.layer.2.DenseReluDense.wi_0.weight
decoder.block.10.layer.2.DenseReluDense.wi_1.weight
decoder.block.10.layer.2.DenseReluDense.wo.weight
decoder.block.10.layer.2.layer_norm.weight
decoder.block.11.layer.0.SelfAttention.q.weight
decoder.block.11.layer.0.SelfAttention.k.weight
decoder.block.11.layer.0.SelfAttention.v.weight
decoder.block.11.layer.0.SelfAttention.o.weight
decoder.block.11.layer.0.layer_norm.weight
decoder.block.11.layer.1.EncDecAttention.q.weight
decoder.block.11.layer.1.EncDecAttention

  trainer_partial = Seq2SeqTrainer(


Step,Training Loss
500,0.0


KeyboardInterrupt: 

## Full Finetuning:

In [None]:

# Full fine-tuning: every parameter of a fresh checkpoint stays trainable.
model_full = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# fp16 mixed precision is intentionally NOT enabled: T5-family checkpoints are
# known to overflow under fp16, which shows up as a 0.0 / NaN training loss.
# Full precision is slower but stable; on hardware with bfloat16 support,
# `bf16=True` is the safe mixed-precision alternative.
args_full = Seq2SeqTrainingArguments(
    output_dir="./nq_full_finetune",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    save_strategy="epoch",
    predict_with_generate=True,
    logging_dir="./logs_full",
    report_to="none",
    generation_max_length=32,
    generation_num_beams=4,
    fp16=False
)
trainer_full = Seq2SeqTrainer(
    model=model_full,
    args=args_full,
    train_dataset=train_enc,
    eval_dataset=eval_enc,
    tokenizer=tokenizer,
    data_collator=collator,
)

trainer_full.train()
trainer_full.save_model("flan_nq_full")


## Loading Existing finetuned models:

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
import os

def unzip_model(zip_path, extract_dir):
    """Extract a zip archive into ``extract_dir``, creating it if needed.

    The archive is opened before the target directory is created, so a bad
    ``zip_path`` (missing file, a directory, or not a zip) raises without
    leaving a stray empty ``extract_dir`` behind.

    Args:
        zip_path: Path to the ``.zip`` file.
        extract_dir: Directory to extract into; intermediate directories are
            created as required.

    Raises:
        OSError: If ``zip_path`` cannot be opened as a file.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        os.makedirs(extract_dir, exist_ok=True)
        zip_ref.extractall(extract_dir)

In [None]:
import os
print(os.getcwd())


/content


In [None]:
# Zip archives of the previously fine-tuned checkpoints, stored under the
# mounted Google Drive.
partial_zip = '/content/drive/MyDrive/NLP_Finetuning/nq_partial_finetune.zip'
full_zip = '/content/drive/MyDrive/NLP_Finetuning/nq_full_finetune.zip'

# Local directories the checkpoints are extracted to and later loaded from.
partial_dir = '/content/nq_partial_finetune'
full_dir = '/content/nq_full_finetune'

In [None]:
# Extract each checkpoint archive into its local directory. The calls used to
# pass '/content' (a directory, not a zip file) as the archive path, leaving
# `partial_zip` / `full_zip` unused.
# NOTE(review): assumes the archives hold the model files at their top level;
# if they wrap them in a folder, adjust the load paths accordingly.
unzip_model(partial_zip, partial_dir)
unzip_model(full_zip, full_dir)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the fine-tuned checkpoints from the extracted directories.
# The tokenizer is read from the partial checkpoint only and is used for both
# models in the evaluation below.
tokenizer = AutoTokenizer.from_pretrained(partial_dir)
model_partial = AutoModelForSeq2SeqLM.from_pretrained(partial_dir)
model_full = AutoModelForSeq2SeqLM.from_pretrained(full_dir)


In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
import numpy as np
from evaluate import load
from transformers import Seq2SeqTrainingArguments

# Collator for inference batches, built on the reloaded tokenizer.
collator = DataCollatorForSeq2Seq(tokenizer)

# Evaluation-only arguments: the Trainer is used purely as a batched
# generation harness, hence "dummy". Generation settings match those used
# during fine-tuning (beam search with 4 beams, at most 32 tokens).
dummy_args = Seq2SeqTrainingArguments(
    output_dir="./temp_output",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_num_beams=4,
    generation_max_length=32,
    fp16=True,
    report_to="none",
)


In [None]:
def evaluate_model(model, tokenizer, eval_dataset):
    """Generate predictions for ``eval_dataset`` and decode them to text.

    Args:
        model: Seq2seq model to evaluate.
        tokenizer: Tokenizer used to decode generated and reference ids.
        eval_dataset: Encoded dataset with "labels" (as produced by ``preprocess``).

    Returns:
        ``(decoded_preds, decoded_refs)``: two lists of strings, aligned by example.

    Relies on the module-level ``dummy_args`` and ``collator``.
    """
    trainer = Seq2SeqTrainer(
        model=model,
        args=dummy_args,
        tokenizer=tokenizer,
        data_collator=collator
    )
    output = trainer.predict(eval_dataset)

    pad_id = tokenizer.pad_token_id
    max_id = tokenizer.vocab_size - 1

    def to_decodable(token_ids):
        """Map ignore/padding markers to the pad token and cap ids at the vocab size."""
        ids = np.asarray(token_ids)
        # Negative ids (the -100 ignore index) are not real tokens. Replace
        # them with the tokenizer's own pad id, which skip_special_tokens
        # strips on decode, instead of assuming that pad id is 0.
        ids = np.where(ids < 0, pad_id, ids)
        # Ids at or above the tokenizer vocabulary size are clamped so that
        # decoding cannot fail on them.
        return np.minimum(ids, max_id).astype(int)

    predictions = output.predictions
    if isinstance(predictions, tuple):
        # Some models return extra arrays alongside the generated ids.
        predictions = predictions[0]

    decoded_preds = tokenizer.batch_decode(to_decodable(predictions), skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode(to_decodable(output.label_ids), skip_special_tokens=True)

    return decoded_preds, decoded_refs


In [None]:
decoded_preds_partial, decoded_refs = evaluate_model(model_partial, tokenizer, eval_enc)
decoded_preds_full, _ = evaluate_model(model_full, tokenizer, eval_enc)

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
# Compare both fine-tuned models against the same decoded references.
rouge = load("rouge")

for title, predictions in (
    ("ROUGE (Partial Fine-Tuning):", decoded_preds_partial),
    ("\nROUGE (Full Fine-Tuning):", decoded_preds_full),
):
    print(title)
    print(rouge.compute(predictions=predictions, references=decoded_refs))


ROUGE (Partial Fine-Tuning):
{'rouge1': np.float64(0.09145383766623917), 'rouge2': np.float64(0.03109583168447436), 'rougeL': np.float64(0.09079001795785932), 'rougeLsum': np.float64(0.09069277824090277)}

ROUGE (Full Fine-Tuning):
{'rouge1': np.float64(0.1157261674920953), 'rouge2': np.float64(0.045668667282240706), 'rougeL': np.float64(0.11539691533458823), 'rougeLsum': np.float64(0.11541059586765944)}
