In [1]:
!pip install llama-index llama-index-embeddings-huggingface peft optimum bitsandbytes kagglehub transformers accelerate llama-index-vector-stores-faiss faiss-gpu sentence-transformers




In [15]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, Document, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.llms import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.llms import CompletionResponse, CompletionResponseGen
from llama_index.core import StorageContext
from tqdm import tqdm
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss
from accelerate import infer_auto_device_map, init_empty_weights
from sentence_transformers import SentenceTransformer
from llama_index.core.postprocessor import SentenceTransformerRerank

import pandas as pd
import glob
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In order to use a custom LLM, we need to define a class wrapper

In [3]:

from pydantic import PrivateAttr
from llama_index.core.llms import LLMMetadata
from llama_index.core.indices.prompt_helper import PromptHelper
from typing import ClassVar



class LocalLlamaLLM(LLM):
    _model: any = PrivateAttr()
    _tokenizer: any = PrivateAttr()
    _prompt_template: PromptTemplate = PrivateAttr()

    def __init__(self, model, tokenizer):
        super().__init__()
        self._model = model
        self._tokenizer = tokenizer
        self._prompt_template = PromptTemplate(
            "You are a helpful assistant with knowledge of the Harry Potter books. "
            "Answer the question using only the provided context. If the answer is not found, say you don’t know.\n\n"
            "### Context:\n{context}\n\n### Question:\n{query}\n\n### Answer:"
        )

    def complete(self, prompt: str, **kwargs) -> CompletionResponse:
        inputs = self._tokenizer(prompt, return_tensors="pt").to(self._model.device)
        outputs = self._model.generate(**inputs, max_new_tokens=256)
        decoded = self._tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = decoded.split("### Answer:")[-1].strip()
        return CompletionResponse(text=answer)

    def stream_complete(self, prompt: str, **kwargs) -> CompletionResponseGen:
        raise NotImplementedError()

    def chat(self, messages, **kwargs):
        raise NotImplementedError()

    def stream_chat(self, messages, **kwargs):
        raise NotImplementedError()

    async def acomplete(self, prompt: str, **kwargs):
        raise NotImplementedError()

    async def astream_complete(self, prompt: str, **kwargs):
        raise NotImplementedError()

    async def achat(self, messages, **kwargs):
        raise NotImplementedError()

    async def astream_chat(self, messages, **kwargs):
        raise NotImplementedError()

    @property
    def metadata(self) -> LLMMetadata:
        return LLMMetadata(
            context_window=2048,
            num_output=256,
            is_chat_model=False,
            model_name="meta-llama/Llama-3.2-1B-Instruct"
        )


In [17]:
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda")
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [5]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto"
)


Settings.llm = LocalLlamaLLM(model=model, tokenizer=tokenizer)
Settings.chunk_size = 256
Settings.chunk_overlap = 25

In [6]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rupanshukapoor/harry-potter-books")

print("Path to dataset files:", path)
all_hp_files_path = "/root/.cache/kagglehub/datasets/rupanshukapoor/harry-potter-books/versions/1/hp_books1-7/hp_books/"

Path to dataset files: /root/.cache/kagglehub/datasets/rupanshukapoor/harry-potter-books/versions/1


In [17]:
# ls /kaggle/input/harry-potter-books/hp_books1-7/hp_books/

Book1.txt  Book2.txt  Book3.txt  Book4.txt  Book5.txt  Book6.txt  Book7.txt


In [7]:
import os
import re
import pandas as pd
import unicodedata
from difflib import get_close_matches


chapters = {
    "Book1": ["THE BOY WHO LIVED", "THE VANISHING GLASS", "THE LETTERS FROM NO ONE", "THE KEEPER OF THE KEYS", "DIAGON ALLEY", "THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS", "THE SORTING HAT", "THE POTIONS MASTER", "THE MIDNIGHT DUEL", "HALLOWEEN", "QUIDDITCH", "THE MIRROR OF ERISED", "NICOLAS FLAMEL", "NORBERT THE NORWEGIAN RIDGEBACK", "THE FORBIDDEN FOREST", "THROUGH THE TRAPDOOR", "THE MAN WITH TWO FACES"],

    "Book2": ["THE WORST BIRTHDAY", "DOBBY’S WARNING", "THE BURROW", "AT FLOURISH AND BLOTTS", "THE WHOMPING WILLOW", "GILDEROY LOCKHART", "MUDBLOODS AND MURMURS", "THE DEATHDAY PARTY", "THE WRITING ON THE WALL", "THE ROGUE BLUDGER", "THE DUELLING CLUB", "THE POLYJUICE POTION", "THE VERY SECRET DIARY", "CORNELIUS FUDGE", "ARAGOG", "THE CHAMBER OF SECRETS", "THE HEIR OF SLYTHERIN", "DOBBY’S REWARD"],

    "Book3": ["OWL POST", "AUNT MARGE’S BIG MISTAKE", "THE KNIGHT BUS", "THE LEAKY CAULDRON", "THE DEMENTOR", "TALONS AND TEA LEAVES", "THE BOGGART IN THE WARDROBE", "FLIGHT OF THE FAT LADY", "GRIM DEFEAT", "THE MARAUDER’S MAP", "THE FIREBOLT", "THE PATRONUS", "GRYFFINDOR VERSUS RAVENCLAW", "SNAPE’S GRUDGE", "THE QUIDDITCH FINAL", "PROFESSOR TRELAWNEY’S PREDICTION", "CAT, RAT AND DOG", "MOONY, WORMTAIL, PADFOOT AND PRONGS", "THE SERVANT OF LORD VOLDEMORT", "THE DEMENTOR’S KISS", "HERMIONE’S SECRET", "OWL POST AGAIN"],

    "Book4": ["THE RIDDLE HOUSE", "THE SCAR", "THE INVITATION", "BACK TO THE BURROW", "WEASLEYS’ WIZARD WHEEZES", "THE PORTKEY", "BAGMAN AND CROUCH", "THE QUIDDITCH WORLD CUP", "THE DARK MARK", "MAYHEM AT THE MINISTRY", "ABOARD THE HOGWARTS EXPRESS", "THE TRIWIZARD TOURNAMENT", "MAD-EYE MOODY", "THE UNFORGIVABLE CURSES", "BEAUXBATONS AND DURMSTRANG", "THE GOBLET OF FIRE", "THE FOUR CHAMPIONS", "THE WEIGHING OF THE WANDS", "THE HUNGARIAN HORNTAIL", "THE FIRST TASK", "THE HOUSE-ELF LIBERATION FRONT", "THE UNEXPECTED TASK", "THE YULE BALL", "RITA SKEETER’S SCOOP", "THE EGG AND THE EYE", "THE SECOND TASK", "PADFOOT RETURNS", "THE MADNESS OF MR CROUCH", "THE DREAM", "THE PENSIEVE", "THE THIRD TASK", "FLESH, BLOOD AND BONE", "THE DEATH EATERS", "PRIORI INCANTATEM", "VERITASERUM", "THE PARTING OF THE WAYS", "THE BEGINNING"],

    "Book5": ["DUDLEY DEMENTED", "A PECK OF OWLS", "THE ADVANCE GUARD", "NUMBER TWELVE, GRIMMAULD PLACE", "THE ORDER OF THE PHOENIX", "THE NOBLE AND MOST ANCIENT HOUSE OF BLACK", "THE MINISTRY OF MAGIC", "THE HEARING", "THE WOES OF MRS WEASLEY", "LUNA LOVEGOOD", "THE SORTING HAT’S NEW SONG", "PROFESSOR UMBRIDGE", "DETENTION WITH DOLORES", "PERCY AND PADFOOT", "THE HOGWARTS HIGH INQUISITOR", "IN THE HOG’S HEAD", "EDUCATIONAL DECREE NUMBER TWENTY-FOUR", "DUMBLEDORE’S ARMY", "THE LION AND THE SERPENT", "HAGRID’S TALE", "THE EYE OF THE SNAKE", "ST MUNGO’S HOSPITAL FOR MAGICAL MALADIES AND INJURIES", "CHRISTMAS ON THE CLOSED WARD", "OCCLUMENCY", "THE BEETLE AT BAY", "SEEN AND UNFORESEEN", "THE CENTAUR AND THE SNEAK", "SNAPE’S WORST MEMORY", "CAREERS ADVICE", "GRAWP", "O.W.L.S", "OUT OF THE FIRE", "FIGHT AND FLIGHT", "THE DEPARTMENT OF MYSTERIES", "BEYOND THE VEIL", "THE ONLY ONE HE EVER FEARED", "THE LOST PROPHECY", "THE SECOND WAR BEGINS"],

    "Book6": ["THE OTHER MINISTER", "SPINNER’S END", "WILL AND WON’T", "HORACE SLUGHORN", "AN EXCESS OF PHLEGM", "DRACO’S DETOUR", "THE SLUG CLUB", "SNAPE VICTORIOUS", "THE HALF-BLOOD PRINCE", "THE HOUSE OF GAUNT", "HERMIONE’S HELPING HAND", "SILVER AND OPALS", "THE SECRET RIDDLE", "FELIX FELICIS", "THE UNBREAKABLE VOW", "A VERY FROSTY CHRISTMAS", "A SLUGGISH MEMORY", "BIRTHDAY SURPRISES", "ELF TAILS", "LORD VOLDEMORT’S REQUEST", "THE UNKNOWABLE ROOM", "AFTER THE BURIAL", "HORCRUXES", "SECTUMSEMPRA", "THE SEER OVERHEARD", "THE CAVE", "THE LIGHTNING-STRUCK TOWER", "FLIGHT OF THE PRINCE", "THE PHOENIX LAMENT", "THE WHITE TOMB"],

    "Book7": ["THE DARK LORD ASCENDING", "IN MEMORIAM", "THE DURSLEYS DEPARTING", "THE SEVEN POTTERS", "FALLEN WARRIOR", "THE GHOUL IN PYJAMAS", "THE WILL OF ALBUS DUMBLEDORE", "THE WEDDING", "A PLACE TO HIDE", "KREACHER’S TALE", "THE BRIBE", "MAGIC IS MIGHT", "THE MUGGLE-BORN REGISTRATION COMMISSION", "THE THIEF", "THE GOBLIN’S REVENGE", "GODRIC’S HOLLOW", "BATHILDA’S SECRET", "THE LIFE AND LIES OF ALBUS DUMBLEDORE", "THE SILVER DOE", "XENOPHILIUS LOVEGOOD", "THE TALE OF THE THREE BROTHERS", "THE DEATHLY HALLOWS", "MALFOY MANOR", "THE WANDMAKER", "SHELL COTTAGE", "GRINGOTTS", "THE FINAL HIDING PLACE", "THE MISSING MIRROR", "THE LOST DIADEM", "THE SACKING OF SEVERUS SNAPE", "THE BATTLE OF HOGWARTS", "THE ELDER WAND", "KING’S CROSS", "THE FLAW IN THE PLAN"]
}


def normalize(text):
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("’", "'").replace("‘", "'")
    text = text.replace("“", '"').replace("”", '"')
    text = text.replace("–", "-").replace("—", "-")
    return re.sub(r'\s+', ' ', text.strip().upper())

chapter_title_pattern = re.compile(r"\n{1,}([A-Z0-9 ,'’\-]{5,})\n{1,}")

all_data = []

for book_key, official_titles in chapters.items():
    filename = f"{book_key}.txt"
    file_path = os.path.join(all_hp_files_path, filename)

    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    cleaned_text = ' '.join([s.strip() for s in raw_text.split('|') if s.strip()])
    matches = list(re.finditer(chapter_title_pattern, cleaned_text))

    normalized_officials = [normalize(t) for t in official_titles]
    chapters_found = []

    for i, match in enumerate(matches):
        raw_title = match.group(1).strip()
        norm_title = normalize(raw_title)
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(cleaned_text)

        # ✅ Only accept the match if it closely resembles an official chapter
        matched = get_close_matches(norm_title, normalized_officials, n=1, cutoff=0.8)
        if matched:
            matched_title = official_titles[normalized_officials.index(matched[0])]
            text_block = cleaned_text[start:end].strip()

            # Strip duplicate title from the start of the text
            if text_block.upper().startswith(raw_title.upper()):
                text_block = text_block[len(raw_title):].strip()

            chapters_found.append({
                "book_title": book_key,
                "chapter_number": len(chapters_found) + 1,
                "chapter_title": matched_title,
                "text": text_block
            })
        # else:
            # print(f" Ignoring non-matching title in {book_key}: {raw_title}")

    all_data.extend(chapters_found)

# Save to file
df = pd.DataFrame(all_data)
df.to_csv("harry_potter_chapters_filtered.csv", index=False)
df.head()


Unnamed: 0,book_title,chapter_number,chapter_title,text
0,Book1,1,THE BOY WHO LIVED,"Mr. and Mrs. Dursley, of number four, Privet D..."
1,Book1,2,THE VANISHING GLASS,Nearly ten years had passed since the Dursleys...
2,Book1,3,THE LETTERS FROM NO ONE,The escape of the Brazilian boa constrictor ea...
3,Book1,4,THE KEEPER OF THE KEYS,BOOM. They knocked again. Dudley jerked awake....
4,Book1,5,DIAGON ALLEY,Harry woke early the next morning. Although he...


In [8]:
print(len(chapters["Book6"]))
print(len(df[df['book_title']=='Book6']))

30
30


In [9]:
def chunk_text(text, max_words=200, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words - overlap):
        chunk = ' '.join(words[i:i + max_words])
        if chunk:
            chunks.append(chunk)
    return chunks


In [10]:
documents = []

for idx, row in df.iterrows():
    chapter_chunks = chunk_text(row["text"])
    for chunk in chapter_chunks:
        documents.append(
            Document(
                text=chunk,
                metadata={
                    "book_title": row["book_title"],
                    "chapter_number": row["chapter_number"],
                    "chapter_title": row["chapter_title"]
                }
            )
        )

print(documents[0].text)
print(documents[0].metadata)

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursley s had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t Page 2 Harry Potter 

In [11]:
texts = [doc.text for doc in documents]

# Batch embed — super fast on GPU
batch_size = 128
embeddings = []
for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
    batch = texts[i:i + batch_size]
    embs = embedding_model.encode(batch, device="cuda", normalize_embeddings=True)
    embeddings.extend(embs)


Embedding: 100%|██████████| 50/50 [00:07<00:00,  6.25it/s]


In [18]:
import numpy as np

faiss_index = faiss.IndexFlatL2(384)  

faiss.write_index(faiss_index, "./harry_potter_faiss/improved.index")
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
# vector_index = VectorStoreIndex.from_documents(documents)  

In [19]:
vector_index.storage_context.persist(persist_dir="./harry_potter_faiss")

In [20]:
# # Loading Stored Index
# from llama_index.core import load_index_from_storage

# import faiss
# faiss_index = faiss.read_index("./harry_potter_faiss/improved.index")

# # Step 2: Build FaissVectorStore with the loaded index
# vector_store = FaissVectorStore(faiss_index=faiss_index)

# # Step 3: Create storage context with the custom vector store
# storage_context = StorageContext.from_defaults(
#     persist_dir="./harry_potter_faiss",
#     vector_store=vector_store
# )

# # Step 4: Load index
# vector_index = load_index_from_storage(storage_context)


In [21]:
retriever = vector_index.as_retriever(similarity_top_k=5)

In [22]:
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.44)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor]
)

In [23]:
response = query_engine.query("What spell did Harry use against the Dementors?")
print("Answer:", response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Answer: Context information is below.
---------------------
book_title: Book3
chapter_number: 11
chapter_title: THE MARAUDER’S MAP

“Black must have found a way to fight them. I wouldn’t have believed it possible.... Dementors are supposed to drain a wizard of his powers if he is left with them too long. “You made that dementor on the train back off,” said Harry suddenly. “There are — certain defenses one can use,” said Lupin. “But there was only one dementor on the train. The more there are, the more difficult it becomes to resist.” “What defenses?” said Harry at once. “Can you teach me?” “I don’t pretend to be an expert at fighting dementors, Harry... quite the contrary....” “But if the dementors come to another Quidditch match, I need to be able to fight them — ” Lupin looked into Harry’s determined face, hesitated, then said, “Well... all right. I’ll try and help. But it’ll have to wait until next term, I’m afraid. I have a lot to do before the holidays.

book_title: Book5
chapter_

### Next Steps for Further Improvement: 
1. Make Chunking better - so as to pull in better pieces of data to send to the model - like snorkel style adaptive chunking which really helps identify the change of scene, the change of tone - would be really useful here.
2. Improve Embeddings by fine-tuning them on your current dataset to be able to correctly map them in the right vector space.
3. Improve Retriever by setting reranking for the chunks - A reranker can re-score them with semantic understanding.
4. Use Query Augumentation - instead of asking random queries - auto-rephrasing with the LLM to rewrite the query into a more search-optimized form.
   

In [24]:
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",  # fast and effective
    top_n=3  # return only the top N after reranking
)


You are trying to use a model that was created with Sentence Transformers version 4.1.0.dev0, but you're currently using version 4.0.2. This might cause unexpected behavior or errors. In that case, try to update to the latest version.


In [25]:
# postprocessor = SimilarityPostprocessor(similarity_cutoff=0.44)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor,reranker]
)

In [26]:
response = query_engine.query("What spell did Harry use against the Dementors?")
print("Answer:", response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Answer: Context information is below.
---------------------
book_title: Book3
chapter_number: 11
chapter_title: THE MARAUDER’S MAP

“Black must have found a way to fight them. I wouldn’t have believed it possible.... Dementors are supposed to drain a wizard of his powers if he is left with them too long. “You made that dementor on the train back off,” said Harry suddenly. “There are — certain defenses one can use,” said Lupin. “But there was only one dementor on the train. The more there are, the more difficult it becomes to resist.” “What defenses?” said Harry at once. “Can you teach me?” “I don’t pretend to be an expert at fighting dementors, Harry... quite the contrary....” “But if the dementors come to another Quidditch match, I need to be able to fight them — ” Lupin looked into Harry’s determined face, hesitated, then said, “Well... all right. I’ll try and help. But it’ll have to wait until next term, I’m afraid. I have a lot to do before the holidays.

book_title: Book3
chapter_

In [125]:
# def optimize_query(query, model):
#     prompt = f"""
#     You are a query optimizer for a Harry Potter question answering system.
#     Given a user query, rewrite it to be more specific and helpful for a retrieval system.
#     Only return the improved query.
    
#     User Query: "{query}"
#     Improved Query:
#     """
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(**inputs, max_new_tokens=64)
#     optimized_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

#     return optimized_query.split("Improved Query:")[-1].strip()


In [129]:
# optimized = optimize_query(query, model)
# print("Optimized Query:", optimized)

In [130]:
# response = query_engine.query(optimize_query("How many times did Harry fight Voldemort?", model))
# print("Answer:", response)

In [None]:
# custom_prompt = PromptTemplate(
#     "You are a helpful assistant with deep knowledge of the Harry Potter universe.\n"
#     "Use the following context to answer the user's question.\n"
#     "If the answer isn't in the context, say you don't know.\n\n"
#     "### Context:\n{context_str}\n\n### Question:\n{query_str}\n\n### Answer:"
# )
