In [1]:
# Installation and Imports
!pip -q install langchain langchain-community langchain-text-splitters
!pip -q install pypdf pymupdf sentence-transformers
!pip -q install numpy pandas tqdm faiss-cpu
!pip -q install instructor pydantic transformers accelerate torch

# Imports
from pathlib import Path
import json, re, pickle
from tqdm import tqdm
from typing import List, Dict, Any

# LangChain imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# LLM imports
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from instructor import patch

# Pydantic for validation
from pydantic import BaseModel, Field, field_validator

# Optional: when no CUDA device is available, cap the number of threads PyTorch
# uses at 2 so CPU-only work does not saturate the notebook host.
# NOTE(review): 2 is a hard-coded choice; raise it on machines with spare cores.
if not torch.cuda.is_available():
    torch.set_num_threads(2)


[0m

In [1]:
# Configuration: input locations, relative to the notebook's working directory.
BOOK_PDF_PATH = Path("../Sources/book.pdf")
SECTIONS_JSON_PATH = Path("./working/page_to_section.json")

# Fail fast with a readable message if either input is missing.
# NOTE(review): `assert` is skipped when Python runs with -O; fine for a notebook,
# but raise FileNotFoundError instead if this code moves into a script.
assert BOOK_PDF_PATH.exists(), f"PDF not found: {BOOK_PDF_PATH}"
assert SECTIONS_JSON_PATH.exists(), f"Sections JSON not found: {SECTIONS_JSON_PATH}"

# Load the page -> section mapping (looked up by str(logical_page) in process_pdf).
print("Loading sections mapping...")
with open(SECTIONS_JSON_PATH, "r", encoding="utf-8") as f:
    sections_map = json.load(f)

# Normalise keys to strings so lookups by str(page_number) always use the same key type.
sections_map = {str(k): v for k, v in sections_map.items()}
print(f"Loaded {len(sections_map)} page-to-section mappings")
def clean_page_text(text: str) -> str:
    """Lightly clean the extracted text of one page.

    Steps, applied in this order:
      1. drop known boilerplate (OpenStax footer, "LINK TO LEARNING" lines,
         "Watch a brief video" lines, URLs);
      2. normalise line endings to "\\n";
      3. re-join words hyphenated across a line break, strip trailing blanks
         before newlines, collapse runs of spaces/tabs, and collapse blank lines.

    Returns the cleaned text with surrounding whitespace stripped; a falsy
    input yields "".
    """
    if not text:
        return ""

    # Boilerplate patterns; each match is deleted outright.
    boilerplate_patterns = (
        r'Access for free at openstax\.org\.*',
        r'LINK TO LEARNING.*?(?=\n|$)',
        r'Watch a brief video.*?(?=\n|$)',
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    )
    cleaned = text
    for pattern in boilerplate_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Line endings must be normalised before the newline-based rules below run.
    cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")

    # (pattern, replacement) pairs; order matters.
    layout_rules = (
        (r"-\n(\w)", r"\1"),     # de-hyphenate across line breaks
        (r"[ \t]+\n", "\n"),     # trailing blanks before a newline
        (r"[ \t]{2,}", " "),     # runs of spaces/tabs
        (r'\n\s*\n', '\n'),      # blank lines
    )
    for pattern, replacement in layout_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def split_into_word_windows(text: str, size: int, overlap: int) -> List[str]:
    """Split ONE page into overlapping word windows.

    Args:
        text: Page text; any whitespace (including newlines) separates words.
        size: Maximum number of words per window. Must be positive.
        overlap: Number of words shared by consecutive windows.
            Must satisfy ``0 <= overlap < size``.

    Returns:
        The windows as space-joined strings, in reading order. Consecutive
        windows start ``size - overlap`` words apart. Splitting stops as soon
        as a window reaches the last word, so the final window is never a
        fragment that is already fully contained in the previous one.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``; such values would keep the window from advancing.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(f"overlap must satisfy 0 <= overlap < size, got {overlap}")

    words = text.split()
    if not words:
        return []

    step = size - overlap
    chunks = []
    for start_idx in range(0, len(words), step):
        chunks.append(" ".join(words[start_idx:start_idx + size]))
        # This window already covers the tail of the page; another window
        # would only repeat words that are in this one.
        if start_idx + size >= len(words):
            break

    return chunks

def make_chunk_id(physical_page: int, idx: int) -> str:
    """Build a chunk identifier of the form ``p<page>_w<index>``.

    The window index is zero-padded to at least three digits
    (e.g. page 19, window 3 -> ``'p19_w003'``).
    """
    chunk_id = f"p{physical_page}_w{idx:03d}"
    return chunk_id

def process_pdf(pdf_path: str, 
               sections_map: dict,
               start_page: int = 19,    # Skip front matter
               end_page: int = 638,     # Stop before references
               logical_offset: int = 12,
               chunk_size: int = 220,    
               chunk_overlap: int = 40    
               ) -> List[Document]:
    """Load a PDF and turn each selected page into overlapping word-window chunks.

    Windows never span two pages, so every chunk maps to exactly one page.

    Args:
        pdf_path: Location of the PDF (converted with ``str`` before use).
        sections_map: Mapping from ``str(logical_page)`` to a section label.
        start_page: First physical page to keep (inclusive).
        end_page: Last physical page to keep (inclusive).
        logical_offset: Subtracted from the physical page number to obtain the
            logical page number used for the section lookup.
        chunk_size: Words per window.
        chunk_overlap: Words shared between consecutive windows.

    Returns:
        One ``Document`` per window, carrying chunk id, physical/logical page,
        section ("" when the logical page is not in ``sections_map``), source
        path and the chunking parameters as metadata.
    """
    source_label = str(pdf_path)
    every_page = PyPDFLoader(source_label).load()

    def physical_number(page_doc) -> int:
        # Physical page = the loader's "page" metadata (default 0) plus one.
        return int(page_doc.metadata.get("page", 0)) + 1

    selected_pages = [
        page_doc for page_doc in every_page
        if start_page <= physical_number(page_doc) <= end_page
    ]

    chunks = []
    for page_doc in tqdm(selected_pages, desc="Processing pages"):
        physical_page = physical_number(page_doc)
        logical_page = physical_page - logical_offset
        section = sections_map.get(str(logical_page), "")

        windows = split_into_word_windows(
            text=clean_page_text(page_doc.page_content),
            size=chunk_size,
            overlap=chunk_overlap,
        )

        chunks.extend(
            Document(
                page_content=window,
                metadata={
                    "chunk_id": make_chunk_id(physical_page, position),
                    "physical_page": physical_page,
                    "logical_page": logical_page,
                    "section": section,
                    "source": source_label,
                    "chunk_size_words": chunk_size,
                    "chunk_overlap_words": chunk_overlap,
                },
            )
            for position, window in enumerate(windows)
        )

    return chunks

# Usage: chunk the main body of the book.
chunks = process_pdf(
    pdf_path=BOOK_PDF_PATH,
    sections_map=sections_map,
    start_page=19,   # Skip front matter
    end_page=638    # Stop before references
)

print(f"Total chunks created: {len(chunks)}")

# Show the first few chunks as a sanity check. Only a short preview of each
# chunk is printed; the "..." marker is added only when text was actually cut.
preview_chars = 200
for c in chunks[:5]:
    preview = c.page_content[:preview_chars]
    ellipsis = "..." if len(c.page_content) > preview_chars else ""
    print("-" * 60)
    print(f"id     : {c.metadata['chunk_id']}")
    print(f"page   : phys {c.metadata['physical_page']} | logical {c.metadata['logical_page']}")
    print(f"section: {c.metadata['section'] or '(none)'}")
    print(f"text   : {preview}{ellipsis}")

NameError: name 'Path' is not defined

In [3]:
# STEP 3: EMBEDDINGS AND VECTOR STORE
"""
In this step, we create a semantic search index for our text chunks:

1. Create embeddings using HuggingFace's sentence-transformers
   - Using all-MiniLM-L6-v2: Efficient model that works well on CPU
   - Normalized embeddings for better cosine similarity

2. Build FAISS index for fast similarity search
   - FAISS is efficient for large-scale similarity search
   - Works well on CPU for our dataset size

3. Save index for reuse
   - Persists both the FAISS index and metadata
   - Allows quick reloading without recomputing embeddings
"""

# Configuration
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient CPU model
INDEX_SAVE_PATH = Path("./artifacts/faiss_index")                # Folder the index is saved to
INDEX_SAVE_PATH.mkdir(parents=True, exist_ok=True)               # Safe to re-run: no error if it exists

# Initialize embedding model
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cpu"},           # Force CPU - works well for this model
    encode_kwargs={"normalize_embeddings": True}  # Normalized vectors, as intended for cosine similarity
)

# Create FAISS index from our chunks (`chunks` comes from the process_pdf cell above;
# every chunk is embedded here, so this is the expensive step of the cell)
print("\nBuilding FAISS index from chunks...")
vectorstore = FAISS.from_documents(
    documents=chunks,           # Our preprocessed text chunks
    embedding=embeddings       # The embedding model
)

# Save index for reuse
print(f"\nSaving index to {INDEX_SAVE_PATH}...")
vectorstore.save_local(str(INDEX_SAVE_PATH))

# Quick verification of saved index: load it back from disk and query it
print("\nVerifying saved index...")
reloaded_vectorstore = FAISS.load_local(
    folder_path=str(INDEX_SAVE_PATH),
    embeddings=embeddings,
    # NOTE(review): this flag opts in to loading data that LangChain treats as
    # unsafe to deserialize (presumably pickled metadata - confirm in the
    # LangChain docs). Acceptable here only because the folder was written by
    # this notebook a few lines above; never enable it for an index from an
    # untrusted source.
    allow_dangerous_deserialization=True
)

# Test a simple query to verify everything works
test_query = "What is psychology?"
test_results = reloaded_vectorstore.similarity_search(
    query=test_query,
    k=2  # Get top 2 results
)

# Print page, section and a 200-character preview for each hit
print("\nTest Results:")
print(f"Query: {test_query}")
for i, doc in enumerate(test_results, 1):
    print(f"\nResult {i}:")
    print(f"Page: {doc.metadata['physical_page']} (logical: {doc.metadata['logical_page']})")
    print(f"Section: {doc.metadata['section']}")
    print(f"Text: {doc.page_content[:200]}...")

  embeddings = HuggingFaceEmbeddings(


Loading embedding model...

Building FAISS index from chunks...

Saving index to artifacts/faiss_index...

Verifying saved index...

Test Results:
Query: What is psychology?

Result 1:
Page: 20 (logical: 8)
Section: introduction_to_psychology/what_is_psychology
Text: understanding of the mind is so limited, since thoughts, at least as we experience them, are neither matter nor energy. The scientific method is also a form of empiricism. An empirical method for acqu...

Result 2:
Page: 20 (logical: 8)
Section: introduction_to_psychology/what_is_psychology
Text: 2002). Nash was the subject of the 2001 movie A Beautiful Mind. Why did these people have these experiences? How does the human brain work? And what is the connection between the brain’s internal proc...


In [4]:
# STEP 4: PROCESS ALL QUERIES AND GENERATE SUBMISSION.CSV
"""
Process all queries from queries.json and save results to submission.csv with proper format:
- ID: Query identifier
- context: Retrieved context from textbook (top 3 chunks combined)
- answer: Generated answer based on the context
- references: JSON with sections and pages

Filter chunks to only include pages mentioned in page_to_section.json (with offset of 12)
"""

import json
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from tqdm import tqdm

# Load page to section mapping.
# This is the same file the preprocessing cell reads via SECTIONS_JSON_PATH;
# it is read again here under a separate name (`page_to_section`).
page_to_section_path = Path("./working/page_to_section.json")
print(f"Loading page to section mapping from {page_to_section_path}...")

with open(page_to_section_path, "r", encoding="utf-8") as f:
    page_to_section = json.load(f)

# Convert logical page numbers (the JSON keys) to physical page numbers.
# The offset of 12 must stay in sync with process_pdf's `logical_offset` default
# and with the subtraction in get_filtered_chunks_for_query.
valid_physical_pages = set()
for logical_page_str, section in page_to_section.items():
    logical_page = int(logical_page_str)
    physical_page = logical_page + 12  # Add offset
    valid_physical_pages.add(physical_page)

print(f"Loaded mapping for {len(valid_physical_pages)} valid physical pages")

# Load queries from queries.json
# (each item is later read via its "query_id" and "question" keys)
queries_path = Path("../Sources/queries.json")
print(f"Loading queries from {queries_path}...")

with open(queries_path, "r", encoding="utf-8") as f:
    queries_data = json.load(f)

print(f"Loaded {len(queries_data)} queries")

# Reload FAISS index and embedding model.
# The settings must match the ones used when the index was built.
print("\nReloading FAISS index...")
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)

# NOTE(review): allow_dangerous_deserialization opts in to loading data that
# LangChain treats as unsafe; only use it for an index this notebook wrote.
vectorstore = FAISS.load_local(
    folder_path="artifacts/faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)

# Load lightweight model for answer generation.
# NOTE(review): `model` and `tokenizer` are not referenced again in the visible
# part of this notebook - answers are produced by extract_answer_from_context,
# which never calls them. Consider removing this load if that stays true.
print("\nLoading FLAN-T5-small with optimized settings...")
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # Use float32 for stability
    low_cpu_mem_usage=True
)
model.eval()

# Free up memory: clear the CUDA cache when a GPU is present, then run the
# garbage collector.
import gc
torch.cuda.empty_cache() if torch.cuda.is_available() else None
gc.collect()

def get_filtered_chunks_for_query(query: str, k: int = 10) -> List[Dict[str, Any]]:
    """Retrieve, filter and rerank chunks for one query.

    Fetches ``k * 3`` candidates from the module-level ``vectorstore``, keeps
    only those whose physical page is in ``valid_physical_pages``, reranks the
    survivors with ``rerank_chunks_hybrid`` and returns at most ``k`` of them.

    Each returned item is a dict with ``content``, ``score`` (the raw score
    reported by the vector store) and ``metadata`` (physical/logical page,
    section, chunk id), plus the fields added by the reranker.
    """
    # Over-fetch so that enough candidates remain after page filtering.
    candidates = vectorstore.similarity_search_with_score(query=query, k=k * 3)

    kept = []
    for doc, score in candidates:
        physical_page = doc.metadata["physical_page"]
        if physical_page not in valid_physical_pages:
            continue

        # The section is looked up again from page_to_section rather than
        # taken from the chunk's stored metadata.
        logical_page = physical_page - 12  # Remove offset
        kept.append({
            "content": doc.page_content.strip(),
            "score": float(score),
            "metadata": {
                "physical_page": physical_page,
                "logical_page": logical_page,
                "section": page_to_section.get(str(logical_page), ""),
                "chunk_id": doc.metadata.get("chunk_id", ""),
            },
        })

    return rerank_chunks_hybrid(kept, query)[:k]

def rerank_chunks_hybrid(chunks: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
    """Rerank chunks by a weighted mix of semantic score and keyword overlap.

    Args:
        chunks: Candidate chunks; each needs a ``"content"`` string and a
            numeric ``"score"``. The list is modified in place: every chunk
            gains ``"hybrid_score"`` and ``"keyword_overlap"``, and the list
            is sorted best-first.
        query: The user question.

    Returns:
        The same list object, sorted by ``hybrid_score`` in descending order.

    Words are extracted with ``\\w+`` after lowercasing, so punctuation attached
    to a word ("psychology?", "memory,") no longer prevents a keyword match.
    """
    word_pattern = r"\w+"
    query_terms = set(re.findall(word_pattern, query.lower()))
    term_count = max(len(query_terms), 1)  # avoid dividing by zero on an empty query

    for chunk in chunks:
        content_terms = set(re.findall(word_pattern, chunk["content"].lower()))
        keyword_overlap = len(query_terms & content_terms)

        # NOTE(review): treats the stored score as a distance (smaller = closer).
        # Whether 1 - score is a true cosine similarity depends on the metric the
        # index uses - confirm. The ordering it produces is what matters here.
        semantic_score = 1.0 - chunk["score"]
        keyword_score = keyword_overlap / term_count

        # Weighted combination (60% semantic, 40% keyword)
        chunk["hybrid_score"] = (0.6 * semantic_score) + (0.4 * keyword_score)
        chunk["keyword_overlap"] = keyword_overlap

    # Higher hybrid score is better.
    chunks.sort(key=lambda item: item["hybrid_score"], reverse=True)

    return chunks

def format_context(chunks: List[Dict[str, Any]]) -> str:
    """Join the ``content`` of every chunk with single spaces.

    No page numbers or other metadata are included; an empty list yields "".
    """
    return " ".join(chunk["content"] for chunk in chunks)

def format_references(chunks: List[Dict[str, Any]]) -> str:
    """Serialise the sections and pages covered by ``chunks`` as a JSON string.

    The result has two keys:
      * ``"sections"`` - distinct non-empty section labels, sorted;
      * ``"pages"`` - distinct physical page numbers as strings, sorted numerically.
    """
    metadata_items = [chunk["metadata"] for chunk in chunks]

    # Distinct, non-empty section labels in first-seen order.
    distinct_sections = []
    for meta in metadata_items:
        label = meta["section"]
        if label and label not in distinct_sections:
            distinct_sections.append(label)

    # dict.fromkeys de-duplicates the page strings while keeping first-seen order.
    distinct_pages = dict.fromkeys(str(meta["physical_page"]) for meta in metadata_items)

    return json.dumps({
        "sections": sorted(distinct_sections),
        "pages": sorted(distinct_pages, key=int),
    })

# Simplified approach - no complex template needed
def _semantic_similarities(question: str, sentences: List[str]):
    """Cosine similarity of each sentence to the question, or None if unavailable.

    Uses a module-level ``sentence_model`` exposing ``encode(list_of_str)`` when
    one has been defined. If none exists, or encoding fails, a RuntimeWarning
    is emitted and None is returned so the caller can fall back to keyword
    scoring instead of silently scoring every sentence as if nothing happened.
    """
    import math
    import warnings

    encoder = globals().get("sentence_model")
    if encoder is None:
        warnings.warn(
            "sentence_model is not defined; answer extraction is using "
            "keyword scoring only",
            RuntimeWarning,
        )
        return None

    try:
        # One batched call: the question is encoded once, not once per sentence.
        vectors = encoder.encode([question] + list(sentences))
        question_vec = [float(v) for v in vectors[0]]
        question_norm = math.sqrt(sum(v * v for v in question_vec))

        similarities = []
        for row in vectors[1:]:
            sentence_vec = [float(v) for v in row]
            denom = question_norm * math.sqrt(sum(v * v for v in sentence_vec))
            dot = sum(a * b for a, b in zip(question_vec, sentence_vec))
            similarities.append(dot / denom if denom else 0.0)
        return similarities
    except Exception as exc:  # best effort: never let the encoder break answering
        warnings.warn(f"Semantic scoring skipped: {exc}", RuntimeWarning)
        return None


def extract_answer_from_context(question: str, context: str) -> str:
    """Build an extractive answer from the sentences of ``context``.

    Each sentence (fragments of 15 characters or fewer are ignored) is scored by:
      * keyword overlap with the question's non-stopword terms (x2.0);
      * cosine similarity to the question (x3.0), when an encoder is available;
      * +1.5 if it contains a definitional/explanatory phrase;
      * +0.5 if it has more than 10 words.

    Up to three sentences scoring above 0.8 are joined best-first (stopping once
    roughly 350 characters are collected) and truncated to 450 characters. If no
    sentence clears the threshold, the single best sentence is returned.

    Args:
        question: The user question.
        context: Retrieved text to extract the answer from.

    Returns:
        The answer text, or "No relevant information found in the context."
        when the context is empty or has no usable sentence.
    """
    no_answer = "No relevant information found in the context."
    if not context:
        return no_answer

    # Split into sentences and drop short fragments.
    stripped = (part.strip() for part in re.split(r'[.!?]+', context))
    sentences = [sent for sent in stripped if len(sent) > 15]
    if not sentences:
        return no_answer

    stop_words = {
        'what', 'how', 'why', 'when', 'where', 'which', 'who', 'is', 'are', 'was', 'were',
        'do', 'does', 'did', 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
        'for', 'of', 'with', 'by', 'that', 'this', 'can', 'you', 'be', 'it', 'as', 'from'
    }
    question_words = {
        word for word in re.findall(r'\b\w+\b', question.lower())
        if word not in stop_words and len(word) > 2
    }

    definition_phrases = [
        'is defined as', 'refers to', 'means', 'is the', 'involves', 'includes',
        'is a', 'are', 'describe', 'explain', 'definition', 'concept', 'theory'
    ]

    similarities = _semantic_similarities(question, sentences)

    sentence_scores = []
    for i, sentence in enumerate(sentences):
        sentence_lower = sentence.lower()
        # Same tokenisation as the question, so "memory," still matches "memory".
        sentence_words = set(re.findall(r'\b\w+\b', sentence_lower))

        keyword_overlap = len(question_words & sentence_words)
        score = keyword_overlap * 2.0

        if similarities is not None:
            score += similarities[i] * 3.0

        # Boost definitional and explanatory sentences (substring match).
        if any(phrase in sentence_lower for phrase in definition_phrases):
            score += 1.5

        # Prefer sentences with more content.
        if len(sentence.split()) > 10:
            score += 0.5

        sentence_scores.append((score, sentence, i, keyword_overlap))

    # Best first; ties keep document order because the sort is stable.
    sentence_scores.sort(key=lambda item: item[0], reverse=True)

    answer_parts = []
    total_length = 0
    for score, sentence, _idx, _overlap in sentence_scores:
        # Scores are descending, so once one fails the threshold all later ones do.
        if score <= 0.8 or total_length >= 350 or len(answer_parts) >= 3:
            break
        clean_sentence = sentence if sentence.endswith('.') else sentence + '.'
        answer_parts.append(clean_sentence)
        total_length += len(clean_sentence)

    if answer_parts:
        answer = ' '.join(answer_parts)
        # Ensure reasonable length
        if len(answer) > 450:
            answer = answer[:447] + '...'
        return answer

    # Nothing cleared the threshold: fall back to the best single sentence.
    best_sentence = sentence_scores[0][1]
    if not best_sentence.endswith('.'):
        best_sentence += '.'
    return best_sentence

# Alias for backward compatibility
def generate_answer(question: str, context: str) -> str:
    """Return an answer for ``question`` drawn from ``context``.

    Thin alias that delegates to ``extract_answer_from_context``, so callers
    can keep using the ``generate_answer`` name.
    """
    answer = extract_answer_from_context(question, context)
    return answer

# Process all queries: retrieve context, extract an answer and collect exactly
# one submission row per query.
print("\nProcessing all queries...")

submission_data = []

for query_item in tqdm(queries_data, desc="Processing queries"):
    query_id = query_item["query_id"]
    question = query_item["question"]

    try:
        # Get top 3 filtered chunks for this query
        top_chunks = get_filtered_chunks_for_query(question, k=3)

        if not top_chunks:
            tqdm.write(f"Warning: No valid chunks found for query {query_id}")
            # Fallback - take the raw top hits without page filtering or
            # reranking; the section is left empty for these.
            chunks_and_scores = vectorstore.similarity_search_with_score(query=question, k=3)
            top_chunks = [
                {
                    "content": doc.page_content.strip(),
                    "score": float(score),
                    "metadata": {
                        "physical_page": doc.metadata["physical_page"],
                        "logical_page": doc.metadata.get("logical_page", 0),
                        "section": "",
                        "chunk_id": doc.metadata.get("chunk_id", "")
                    }
                }
                for doc, score in chunks_and_scores
            ]

        # Format context (without page numbers in the text)
        context = format_context(top_chunks)

        row = {
            "ID": query_id,
            "context": context,
            "answer": generate_answer(question, context),
            "references": format_references(top_chunks)
        }

    except Exception as e:
        tqdm.write(f"Error processing query {query_id}: {str(e)}")
        # Keep the query in the submission with an explicit error entry.
        row = {
            "ID": query_id,
            "context": "Error retrieving context",
            "answer": f"Error: {str(e)}",
            "references": json.dumps({"sections": [], "pages": []})
        }

    # Appended once, outside the try block: a failure after the answer was built
    # can no longer add a second (error) row for the same query.
    submission_data.append(row)

    # Progress update every tenth numeric ID. Non-numeric IDs are skipped
    # instead of raising. tqdm.write keeps the message from breaking the bar.
    if str(query_id).isdigit() and int(query_id) % 10 == 0:
        tqdm.write(f"Processed query {query_id}")

# Create submission CSV: one row per entry in submission_data, with the columns
# ID, context, answer, references.
print(f"\nCreating submission.csv with {len(submission_data)} results...")
df = pd.DataFrame(submission_data)

# Save to CSV in the current working directory (no index column)
csv_path = Path("submission.csv")
df.to_csv(csv_path, index=False, encoding="utf-8")

print(f"Results saved to: {csv_path}")
print(f"CSV shape: {df.shape}")
print("\nFirst few rows:")
print(df.head(2))

# Display sample of results (first row, long fields cut to 100 characters).
# NOTE(review): df.iloc[0] raises IndexError when there were no queries.
print(f"\nSample result:")
sample = df.iloc[0]
print(f"ID: {sample['ID']}")
print(f"Context: {sample['context'][:100]}...")
print(f"Answer: {sample['answer'][:100]}...")
print(f"References: {sample['references']}")

print("\nProcessing complete!")


Loading page to section mapping from working/page_to_section.json...
Loaded mapping for 494 valid physical pages
Loading queries from ../Sources/queries.json...
Loaded 50 queries

Reloading FAISS index...

Loading FLAN-T5-small with optimized settings...

Processing all queries...


Processing queries:  24%|██▍       | 12/50 [00:00<00:00, 112.89it/s]

Processed query 10
Processed query 20


Processing queries:  50%|█████     | 25/50 [00:00<00:00, 119.25it/s]

Processed query 30


Processing queries: 100%|██████████| 50/50 [00:00<00:00, 116.58it/s]

Processed query 40
Processed query 50

Creating submission.csv with 50 results...
Results saved to: submission.csv
CSV shape: (50, 4)

First few rows:
  ID                                            context  \
0  1  understanding of the mind is so limited, since...   
1  2  the other hand, serve as interconnected inform...   

                                              answer  \
0  Psychology refers to the scientific study of t...   
1  2 Cells of the Nervous System LEARNING OBJECTI...   

                                          references  
0  {"sections": ["introduction_to_psychology/what...  
1  {"sections": ["biopsychology/cells_of_the_nerv...  

Sample result:
ID: 1
Context: understanding of the mind is so limited, since thoughts, at least as we experience them, are neither...
Answer: Psychology refers to the scientific study of the mind and behavior. The scientific method is also a ...
References: {"sections": ["introduction_to_psychology/what_is_psychology"], "pages": ["20"


