In [None]:
file_path=r"D:\PROJECT_LABUBU\ai-backend\IPsec Notes.pdf"

In [3]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [4]:
import chromadb
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [5]:
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection("nsc")

In [6]:
if collection.count() == 0:
    docs = [page.page_content for page in pages]
    ids = [f"page_{i}" for i in range(len(docs))]
    embeddings = embedder.encode(docs).tolist()

    collection.add(
        documents=docs,
        ids=ids,
        embeddings=embeddings
    )
    print(f" Added {len(docs)} pages to Chroma")
else:
    print(" Using existing Chroma collection")

 Added 36 pages to Chroma


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
import chromadb
import torch
import re
import random

# Load models
print("Loading models...")
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Connect to existing ChromaDB
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_collection("history")

# FLAN-T5 setup
model_id = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)

print("Models loaded!")
print(f"Database has {collection.count()} documents")

def generate_quiz_questions_from_docs(num_questions=5):
    """Generate quiz questions directly from all available documents"""
    # Get all documents from the collection
    all_results = collection.get()
    
    if not all_results or not all_results['documents']:
        print("No documents found in the collection.")
        return None
    
    all_documents = all_results['documents']
    print(f"Found {len(all_documents)} documents in database")
    
    # Combine multiple documents for richer context
    combined_context = "\n".join(all_documents)[:2500]
    
    print("Retrieved content from your documents")
    
    # Simplified prompt that works better with FLAN-T5
    prompt = f"""Create a multiple choice question based on this text:

{combined_context}

Question format:
What [question about the content]?
A. [option 1]
B. [option 2] 
C. [option 3]
D. [option 4]
Correct answer: [A, B, C, or D]

Create one question:"""
    
    questions = []
    
    # Generate questions one by one for better results
    for i in range(num_questions):
        # Use a different subset of documents for variety
        start_idx = (i * len(all_documents) // num_questions) % len(all_documents)
        doc_subset = all_documents[start_idx:start_idx + 3]  # Use 3 documents at a time
        if len(doc_subset) < 3:
            doc_subset = all_documents[:3]  # Fallback to first 3
        
        context = "\n".join(doc_subset)[:2000]
        
        current_prompt = f"""Create a multiple choice question based on this text:

{context}

Question format:
What [question about the content]?
A. [option 1]
B. [option 2] 
C. [option 3]
D. [option 4]
Correct answer: [A, B, C, or D]

Create one question:"""
        
        input_ids = tokenizer(current_prompt, return_tensors="pt", max_length=512, truncation=True).input_ids
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=200,
                temperature=0.7,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.eos_token_id
            )
        
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Remove the prompt from the response
        response = response.replace(current_prompt, "").strip()
        
        print(f"Generated question {i+1}: {response[:100]}...")
        
        # Try to parse this single question
        parsed_q = parse_single_question(response, "the documents", context)
        if parsed_q:
            questions.append(parsed_q)
    
    return questions

def parse_single_question(response, topic, context):
    """Parse a single generated question"""
    lines = [line.strip() for line in response.split('\n') if line.strip()]
    
    question_text = None
    options = []
    correct_answer = None
    
    for line in lines:
        # Look for question (usually starts with "What", "Who", "When", "Where", "How", "Which")
        if any(line.startswith(word) for word in ["What", "Who", "When", "Where", "How", "Which", "Why"]) and "?" in line:
            question_text = line
        
        # Look for options A, B, C, D
        elif re.match(r'^[A-D][\.\)]\s*', line, re.IGNORECASE):
            options.append(line)
        
        # Look for correct answer
        elif re.search(r'correct\s*answer\s*:?\s*([A-D])', line, re.IGNORECASE):
            match = re.search(r'correct\s*answer\s*:?\s*([A-D])', line, re.IGNORECASE)
            if match:
                correct_answer = match.group(1).lower()
    
    # If parsing failed, create a simple question from context
    if not question_text or len(options) < 4 or not correct_answer:
        return create_fallback_question(topic, context)
    
    # Convert options to consistent format
    formatted_options = []
    for opt in options[:4]:  # Only take first 4 options
        # Remove the letter prefix and add our own
        clean_opt = re.sub(r'^[A-D][\.\)]\s*', '', opt, flags=re.IGNORECASE)
        formatted_options.append(clean_opt)
    
    if len(formatted_options) == 4:
        return {
            'question': question_text,
            'options': formatted_options,
            'correct_answer': correct_answer
        }
    
    return create_fallback_question(topic, context)

def create_fallback_question(topic, context):
    """Create a simple question when parsing fails"""
    # Extract some key words from context
    words = context.split()[:50]  # First 50 words
    
    # Create a simple question
    correct_option = f"Information about {topic} from historical records"
    wrong_options = [
        "Fictional stories and myths",
        "Modern scientific theories", 
        "Contemporary political events"
    ]
    
    # Randomize option order
    all_options = [correct_option] + wrong_options
    random.shuffle(all_options)
    correct_answer = chr(ord('a') + all_options.index(correct_option))
    
    return {
        'question': f"What type of information is primarily discussed in the text about {topic}?",
        'options': all_options,
        'correct_answer': correct_answer
    }

# Main quiz function
def run_interactive_quiz():
    print("\nüéØ INTERACTIVE QUIZ TIME!")
    print("=" * 50)
    
    print("Generating 5 questions from your documents...")
    print("This may take a moment...")
    
    try:
        questions = generate_quiz_questions_from_docs(5)
        
        if not questions or len(questions) == 0:
            print("‚ùå Sorry, couldn't generate questions from the documents.")
            print("Make sure your ChromaDB has documents with sufficient content.")
            return
        
        print(f"\nüìù Generated {len(questions)} questions! Let's begin:")
        print("=" * 40)
        
        score = 0
        
        # Ask each question
        for i, q in enumerate(questions, 1):
            print(f"\nQuestion {i}: {q['question']}")
            
            # Display options with a), b), c), d) format
            option_letters = ['a', 'b', 'c', 'd']
            for j, option in enumerate(q['options'][:4]):
                print(f"  {option_letters[j]}) {option}")
            
            # Get user answer
            while True:
                answer = input("\nYour answer (a/b/c/d): ").strip().lower()
                if answer in ['a', 'b', 'c', 'd']:
                    if answer == q['correct_answer']:
                        print("‚úÖ Correct!")
                        score += 1
                    else:
                        correct_letter = q['correct_answer']
                        correct_text = q['options'][ord(correct_letter) - ord('a')]
                        print(f"‚ùå Wrong! Correct answer was {correct_letter.upper()}) {correct_text}")
                    break
                else:
                    print("Please enter a, b, c, or d")
        
        # Show final results
        print("\n" + "=" * 50)
        print("üìä FINAL RESULTS")
        print("=" * 50)
        print(f"Your Score: {score}/{len(questions)}")
        percentage = (score / len(questions)) * 100
        print(f"Percentage: {percentage:.1f}%")
        
        # Personalized feedback
        if percentage == 100:
            print("üéâ PERFECT SCORE! Outstanding knowledge!")
        elif percentage >= 80:
            print("üåü Excellent work! You know your stuff!")
        elif percentage >= 60:
            print("üëç Good job! Keep up the learning!")
        elif percentage >= 40:
            print("üìö Not bad! A bit more study will help!")
        else:
            print("üí™ Keep learning! Practice makes perfect!")
        
        print(f"\nThanks for taking the quiz on ! üéØ")
        
    except Exception as e:
        print(f"‚ùå Error during quiz: {e}")
        import traceback
        traceback.print_exc()

# Run the quiz
if __name__ == "__main__":
    run_interactive_quiz()

Loading models...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Models loaded!
Database has 1 documents

üéØ INTERACTIVE QUIZ TIME!
Generating 5 questions from your documents...
This may take a moment...
Found 1 documents in database
Retrieved content from your documents
Generated question 1: [D]...
Generated question 2: [D]...
Generated question 3: [D]...
Generated question 4: [A, B, C, or D]...
Generated question 5: [D]...

üìù Generated 5 questions! Let's begin:

Question 1: What type of information is primarily discussed in the text about the documents?
  a) Modern scientific theories
  b) Fictional stories and myths
  c) Information about the documents from historical records
  d) Contemporary political events
‚úÖ Correct!

Question 2: What type of information is primarily discussed in the text about the documents?
  a) Contemporary political events
  b) Information about the documents from historical records
  c) Fictional stories and myths
  d) Modern scientific theories
‚úÖ Correct!

Question 3: What type of information is primarily discu