In [1]:
import os
import tensorflow as tf
import PyPDF2
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import TFAutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, pipeline


from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments


from collections import Counter
import re
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [14]:
# 1️⃣ Load PDF & Extract Text
def extract_text_from_pdf(pdf_path, chunk_size=1000, chunk_overlap=200):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    return texts

# 2️⃣ Convert to Vector Embeddings
def create_vector_store(texts, top_k=5):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(texts, embeddings)
    vector_store.save_local("vector_db")
    return vector_store



# 3️⃣ Load Local LLM with TensorFlow Fine-Tuning
def load_fine_tuned_model(model_name="google/flan-t5-large", temperature=0.7, max_length=1024):
    model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)


    # Freeze all layers
    for var in model.trainable_variables:
        var._trainable = False

    # Unfreeze only the output layer (T5 uses lm_head for output)
    for var in model.lm_head.trainable_variables:
        var._trainable = True




    text_generator = pipeline("text2text-generation", model=model,
                              tokenizer=tokenizer,
                              temperature=temperature,
                              max_length=max_length,
                              top_p=0.9,
                              top_k=50)
    llm = HuggingFacePipeline(pipeline=text_generator)
    return llm

# 4️⃣ Create RAG-based QA Chain
def create_qa_pipeline(top_k=10):
    retriever = FAISS.load_local("vector_db", HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"), allow_dangerous_deserialization=True).as_retriever(search_kwargs={"k": 10})
    llm = load_fine_tuned_model(max_length=1024)

    #  Use a keyword-based retriever for better chapter matching

    bm25_retriever = BM25Retriever.from_documents(texts)  # Keyword-based retriever

    # Combine FAISS and BM25 for better results

    combined_retriever = EnsembleRetriever(retrievers=[retriever, bm25_retriever], weights=[0.8, 0.2])

    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=combined_retriever)
    return qa_chain


# Divide big file into small chunks
def iterative_summarization(texts, chunk_size=5):
    """Summarizes document in steps to handle long books."""
    partial_summaries = []
    for i in range(0, len(texts), chunk_size):
        chunk = texts[i:i + chunk_size]
        chunk_text = " ".join([doc.page_content for doc in chunk])  # Combine chunk content

        summary = qa_pipeline.run(f"Summarize: {chunk_text}")
        partial_summaries.append(summary)

    # Final summarization step on all summaries
    final_summary = qa_pipeline.run(f"Summarize: {' '.join(partial_summaries)}")
    return final_summary

def answer_question(query, qa_chain, texts):
    retrieved_docs = qa_chain.retriever.get_relevant_documents(query)  # 🔍 Fetch retrieved chunks

    #print("\n Retrieved Chunks:")
    #for i, doc in enumerate(retrieved_docs[:3]):  # Print top 3 retrieved documents
    #    print(f"\nChunk {i+1}:")
    #    print(doc.page_content[:500])  # Print first 500 characters of each chunk

    # 🔹 Run the query through RAG
    response = qa_chain.run(query)
    return response



# 5️⃣ Run Question-Answering
#def answer_question(query, qa_chain, texts):
#    response = qa_chain.run(query)
#    return response


In [17]:
if __name__ == "__main__":
    pdf_file = "Python for Data Analysis.pdf"

    # Load and process PDF
    texts = extract_text_from_pdf(pdf_file)
    create_vector_store(texts)

    # Create RAG pipeline
    qa_pipeline = create_qa_pipeline()

    # Ask the question
    question = "what is this book all about with example?"
    answer = answer_question(question, qa_pipeline, texts)

    print(f"Answer: {answer}")


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0
Token indices sequence length is longer than the specified maximum sequence length for this model (2975 > 512). Running this sequence through the model will result in indexing errors


Answer: This book is concerned with the nuts and bolts of manipulating, processing, cleaning, and crunching data in Python.


In [121]:
if __name__ == "__main__":
    pdf_file = "Python for Data Analysis.pdf"

    # Extract text from PDF
    texts = extract_text_from_pdf(pdf_file)

    # Create vector store & retrieval pipeline
    create_vector_store(texts)
    qa_pipeline = create_qa_pipeline(top_k=20)  # Retrieve more context

    # Summarization
    question = "Summarize the whole document"
    summary = iterative_summarization(texts)  # Handle long books iteratively

    print(f"Final Summary: {summary}")


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0
Token indices sequence length is longer than the specified maximum sequence length for this model (795 > 512). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 

In [108]:
from itertools import product

# Define hyperparameter ranges
chunk_sizes = [100, 200]
chunk_overlaps = [20, 50]
top_ks = [5, 10]
temperatures = [0.5, 0.7]
max_lengths = [50, 200]

# Generate combinations
hyperparameter_combinations = list(product(chunk_sizes, chunk_overlaps, top_ks, temperatures, max_lengths))

for chunk_size, chunk_overlap, top_k, temp, max_len in hyperparameter_combinations:
    print(f"Testing: chunk_size={chunk_size}, chunk_overlap={chunk_overlap}, top_k={top_k}, temp={temp}, max_len={max_len}")

    texts = extract_text_from_pdf("Grand.pdf", chunk_size, chunk_overlap)
    create_vector_store(texts, top_k)
    qa_pipeline = create_qa_pipeline()

    llm = load_fine_tuned_model(temperature=temp, max_length=max_len)  # ✅ Fixed function name
    answer = answer_question("summarize the document", qa_pipeline, texts)

    print(f"Answer: {answer}\n")

Testing: chunk_size=100, chunk_overlap=20, top_k=5, temp=0.5, max_len=10


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0


Answer: Moong Dal HALWA GULAB JAMUN ROSE KHEER +91-9213695772 +91-9811090706 +91-9911189073 Mob: VEGETARIAN SILVER ASSORTED BREAD (ALL) TANDOORI ROTI BUTTER ROTI BUTTER NAAN LACHHA PARANTHA CHILLY SOYA CHAAP SMILIES (McCain) RICE (BASMATI) JEERA RICE SALAD (ALL) GARDEN GREEN SALAD SPROUTS SALAD PAPAD

Testing: chunk_size=100, chunk_overlap=20, top_k=5, temp=0.5, max_len=20


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0


Answer: Moong Dal HALWA GULAB JAMUN ROSE KHEER +91-9213695772 +91-9811090706 +91-9911189073 Mob: VEGETARIAN SILVER ASSORTED BREAD (ALL) TANDOORI ROTI BUTTER ROTI BUTTER NAAN LACHHA PARANTHA CHILLY SOYA CHAAP SMILIES (McCain) RICE (BASMATI) JEERA RICE SALAD (ALL) GARDEN GREEN SALAD SPROUTS SALAD PAPAD

Testing: chunk_size=100, chunk_overlap=20, top_k=5, temp=0.7, max_len=10


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0


KeyboardInterrupt: 

In [None]:
# 1️⃣ Load Pretrained Model and Tokenizer
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2️⃣ Apply LoRA Configuration for Fine-Tuning
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Task type (e.g., sequence-to-sequence)
    r=8,  # Rank of the low-rank approximation
    lora_alpha=16,  # Scaling factor for LoRA
    lora_dropout=0.1  # Dropout rate for LoRA
)
model = get_peft_model(model, lora_config)

# 3️⃣ Load Dataset for Fine-Tuning
import pandas as pd
dataset = pd.read_csv('usercuisine.csv')
dataset = Dataset.from_pandas(dataset)

# 4️⃣ Tokenize the Dataset
def tokenize_function(examples):
    return tokenizer(examples['Rcuisine'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 5️⃣ Set Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# 6️⃣ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# 7️⃣ Start Fine-Tuning
trainer.train()


def load_fine_tuned_model(model_path="./results"):
    model = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    llm = HuggingFacePipeline(pipeline=text_generator)
    return llm



# New section