<a href="https://colab.research.google.com/github/void191/lumin-20AI/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# =======================================================
# ILE — Full Colab Notebook
# Features:
# - PDF/TXT lesson ingestion
# - Embeddings search (sentence-transformers + FAISS)
# - Summarization (transformers pipeline)
# - T5 Question Generation (answer-aware)
# - Flashcards & cloze generation (NER + noun-chunks)
# - Multiple-choice distractors (semantic + heuristic)
# - Kurdish detection + translation fallback
# - Gradio UI + CSV export
# =======================================================

# Add this at the top of your notebook, before any NLTK calls:
import nltk
nltk.download('punkt_tab')  # download the missing tokenizer

# If you also use the regular sentence tokenizer, you might want:
nltk.download('punkt')


# --- Install required libraries ---
!pip install --quiet transformers sentence-transformers faiss-cpu PyPDF2 gradio langdetect nltk

# --- Imports ---
import os, re, csv, random, math
from pathlib import Path
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import PyPDF2
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM, AutoModel, AutoTokenizer
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize

import sqlite3
import datetime

# Connect to local SQLite database
conn = sqlite3.connect("ile_progress.db")
c = conn.cursor()

# Create table if it doesn't exist
c.execute("""
CREATE TABLE IF NOT EXISTS progress(
    student_id TEXT,
    lesson_index INTEGER,
    action TEXT,        -- 'summary', 'flashcards', 'quiz'
    completed BOOLEAN,
    score INTEGER,
    timestamp TEXT
)
""")
conn.commit()


# --- Utility: safe model loader for seq2seq models ---
def safe_load_seq2seq(model_names):
    """
    Try to load models from a list, return (tokenizer, model, name) for the first that works.
    If none work, return (None, None, None).
    """
    for name in model_names:
        try:
            print(f"Trying to load {name} ...")
            tok = AutoTokenizer.from_pretrained(name)
            mdl = AutoModelForSeq2SeqLM.from_pretrained(name)
            print(f"Loaded {name}")
            return tok, mdl, name
        except Exception as e:
            print(f"Failed to load {name}: {e}")
            continue
    return None, None, None

# --- Config: model preference lists (can be extended) ---
QG_MODELS = [
    "valhalla/t5-small-qa-qg-hl",            # answer-aware qg (hl tokens)
    "mrm8488/t5-base-finetuned-question-generation-ap", # t5 fine-tuned for QG
    "iarfmoose/t5-base-question-generator"
]

TRANSLATE_KU_TO_EN = [
    "lingvanex/kurdish-to-english-translation",
    "abdulhade/fine-tuned-MarianMTKurdish",
]
TRANSLATE_EN_TO_KU = [
    "lingvanex/kurdish-to-english-translation",  # many models are bidirectional
    "abdulhade/fine-tuned-MarianMTKurdish",
]

# --- Load embeddings model (Sentence-Transformers) ---
print("Loading embedding model...")
embed_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # small & fast

# --- Load summarization pipeline (fallback to distilbart) ---
print("Loading summarizer pipeline...")
try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
except Exception as e:
    print("Primary summarizer failed, falling back to facebook/bart-large-cnn:", e)
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# --- Load QG model (tokenizer+model) ---
qg_tokenizer, qg_model, qg_model_name = safe_load_seq2seq(QG_MODELS)
if qg_model is None:
    print("No QG model loaded — question generation will use a simple template fallback.")

# --- Load translation models for Kurdish if available ---
ku_en_tok, ku_en_mdl, ku_en_name = safe_load_seq2seq(TRANSLATE_KU_TO_EN)
en_ku_tok, en_ku_mdl, en_ku_name = safe_load_seq2seq(TRANSLATE_EN_TO_KU)
if ku_en_mdl is None or en_ku_mdl is None:
    print("Kurdish translation model(s) not fully available — will attempt single-model bidirectional use or skip translation if not possible.")

# --- Helpers: PDF/text extraction ---
def extract_pdf_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            ptext = page.extract_text()
            if ptext:
                text += ptext + "\n"
    return text

# --- Load lessons from 'lessons' folder ---
lessons_dir = "lessons"
Path(lessons_dir).mkdir(exist_ok=True)
lesson_titles = []
lesson_texts = []
for file in sorted(os.listdir(lessons_dir)):
    path = os.path.join(lessons_dir, file)
    if file.lower().endswith(".pdf"):
        t = extract_pdf_text(path)
        if t.strip():
            lesson_titles.append(file)
            lesson_texts.append(t)
    elif file.lower().endswith(".txt"):
        with open(path, "r", encoding="utf-8") as fh:
            txt = fh.read()
            if txt.strip():
                lesson_titles.append(file)
                lesson_texts.append(txt)
if len(lesson_texts) == 0:
    print("No lessons found in 'lessons' folder. Add PDFs/TXTs and rerun.")
else:
    print(f"Loaded {len(lesson_texts)} lessons.")

# --- Build embeddings + FAISS index if lessons exist ---
if len(lesson_texts) > 0:
    lesson_embeddings = embed_model.encode(lesson_texts)
    lesson_embeddings = np.array(lesson_embeddings).astype('float32')
    dim = lesson_embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(lesson_embeddings)
    print("FAISS index built.")

# --- Language detection helper (very lightweight) ---
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "en"

# --- Translation helpers (Kurdish <-> English) ---
def translate_text(text, src_to_tgt="ku2en"):
    """
    src_to_tgt: "ku2en" or "en2ku". Uses loaded models if available.
    Falls back to returning original text if no translator available.
    """
    if src_to_tgt == "ku2en":
        if ku_en_mdl is None or ku_en_tok is None:
            return text
        tok = ku_en_tok
        mdl = ku_en_mdl
    else:
        if en_ku_mdl is None or en_ku_tok is None:
            return text
        tok = en_ku_tok
        mdl = en_ku_mdl
    try:
        inputs = tok(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = mdl.generate(**inputs, max_length=1024)
        res = tok.batch_decode(outputs, skip_special_tokens=True)
        return res[0]
    except Exception as e:
        print("Translation failed:", e)
        return text

# --- Summarization utility (chunking for long docs) ---
def summarize_text(text, max_chunk_chars=1000):
    t = re.sub(r"\s+", " ", text).strip()
    if len(t) <= max_chunk_chars:
        out = summarizer(t, max_length=120, min_length=30, do_sample=False)[0]['summary_text']
        return out
    chunks = []
    pos = 0
    while pos < len(t):
        chunk = t[pos: pos + max_chunk_chars]
        if pos + max_chunk_chars < len(t):
            last = chunk.rfind('. ')
            if last != -1:
                chunk = chunk[:last+1]
        chunks.append(chunk.strip())
        pos += len(chunk)
    summaries = [summarizer(c, max_length=120, min_length=30, do_sample=False)[0]['summary_text'] for c in chunks]
    combined = " ".join(summaries)
    if len(combined) > max_chunk_chars:
        combined = summarizer(combined, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
    return combined

# --- Simple NER and noun-chunk extraction using sentence splitting and heuristics (no heavy Kurdish NER unless KuBERT is used) ---
import spacy
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:
    # attempt to download if not present
    import os
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def extract_candidates(text, max_cand=200):
    doc = nlp(text)
    ents = [ent.text.strip() for ent in doc.ents if len(ent.text.strip())>1]
    noun_chunks = [nc.text.strip() for nc in doc.noun_chunks if len(nc.text.strip())>2]
    cand = []
    seen = set()
    for item in ents + noun_chunks:
        low = item.lower()
        if low in seen: continue
        seen.add(low)
        cand.append(item)
        if len(cand) >= max_cand: break
    return cand

# --- QG: generate question for a given answer span and context using loaded QG model if present ---
def generate_question_with_t5(answer, context, max_length=64):
    if qg_model is None or qg_tokenizer is None:
        # fallback simple template
        return f"What is {answer}?"
    model = qg_model
    tok = qg_tokenizer
    # some models expect highlight tokens around the answer: <hl> ... <hl>
    # create input following patterns used by valhalla/mrm8488 (try multiple patterns)
    patterns = [
        f"generate question: {context} </s> answer: {answer}",            # generic
        f"<hl> {answer} <hl> {context}",                                  # valhalla style
        f"answer: {answer} context: {context} </s>"                       # mrm8488 style
    ]
    for inp in patterns:
        try:
            inputs = tok.encode(inp, return_tensors="pt", truncation=True, max_length=512)
            outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
            q = tok.decode(outputs[0], skip_special_tokens=True)
            # simple cleanup
            q = q.strip()
            if len(q) > 5:
                return q
        except Exception:
            continue
    # fallback
    return f"What is {answer}?"

# --- Multiple-choice distractor generation ---
def generate_mcq(question, answer, context_text, lesson_text, num_distractors=3):
    """
    Strategy:
    - Extract candidate distractors from lesson: entities, noun-chunks
    - Filter by length & not overlapping answer
    - Score semantic similarity via sentence-transformer embeddings: choose candidates that are somewhat close to answer (not too close)
    - If not enough, create synthetic distractors by shuffling words or using synonyms heuristic
    """
    candidates = extract_candidates(lesson_text, max_cand=200)
    # filter out those that are identical or subsumed in the answer
    filt = []
    ans_low = answer.lower()
    for c in candidates:
        c_low = c.lower()
        if c_low == ans_low or c_low in ans_low or ans_low in c_low:
            continue
        if len(c) < 2 or len(c) > 60:
            continue
        filt.append(c)
    # If not enough candidates, also try sentence-level chunks
    if len(filt) < num_distractors:
        sents = sent_tokenize(lesson_text)
        for s in sents:
            candidate = " ".join(s.split()[:6])
            if candidate.lower() != ans_low and candidate.lower() not in [f.lower() for f in filt]:
                filt.append(candidate)
            if len(filt) >= num_distractors: break
    # Compute embeddings to pick candidates at medium similarity distance
    try:
        pool = [answer] + filt
        emb = embed_model.encode(pool)
        ans_emb = emb[0]
        cand_embs = emb[1:]
        sims = np.dot(cand_embs, ans_emb) / (np.linalg.norm(cand_embs, axis=1) * (np.linalg.norm(ans_emb)+1e-9))
        # Select distractors with mid-range similarity
        idxs = np.argsort(-sims)  # descending similarity
        chosen = []
        for i in idx:
            if len(chosen) >= num_distractors: break
            # avoid extremely similar (near-duplicates)
            if sims[i] > 0.95: continue
            chosen.append(filt[i])
        # if still short, pad with random candidates
        if len(chosen) < num_distractors:
            extras = [c for c in filt if c not in chosen]
            random.shuffle(extras)
            chosen += extras[:(num_distractors-len(chosen))]
        # Final fallback: generate word-level distractors
        if len(chosen) < num_distractors:
            words = [w for w in answer.split() if len(w)>3]
            for w in words:
                chosen.append("".join(random.sample(w, len(w))))
                if len(chosen) >= num_distractors: break
        # ensure uniqueness & strip
        final = []
        for c in chosen:
            c = c.strip()
            if c.lower() == answer.lower(): continue
            if c not in final:
                final.append(c)
            if len(final) >= num_distractors: break
        return final[:num_distractors]
    except Exception as e:
        # simple random fallback
        random.shuffle(filt)
        return filt[:num_distractors]

# --- Flashcard / QA generation pipeline (combines rule-based + T5 QG) ---
def generate_flashcards_and_mcqs(lesson_text, max_cards=20, use_t5_qg=True, language_hint=None):
    """
    Returns a list of items:
    {
      'type':'short'|'cloze'|'qa',
      'answer': ...,
      'question': ...,
      'distractors': [...],  # optional for MCQ
      'context': ...
    }
    """
    text = lesson_text
    # Detect language if not provided
    lang = language_hint or detect_language(text[:200])
    translated_to_en = False
    if lang.startswith("ku") and (ku_en_mdl is not None):
        # translate to English for processing
        text_en = translate_text(text, src_to_tgt="ku2en")
        translated_to_en = True
    else:
        text_en = text

    # Extract candidates
    candidates = extract_candidates(text_en, max_cand=200)
    cards = []
    seen_answers = set()
    # Use entities first for short-answer Qs (try to create Q via T5)
    for ent in candidates:
        if len(cards) >= max_cards: break
        ans = ent
        if len(ans.strip()) < 2: continue
        low = ans.lower()
        if low in seen_answers: continue
        seen_answers.add(low)
        # find a sentence containing the answer
        sents = sent_tokenize(text_en)
        ctx = next((s for s in sents if ans in s), sents[0] if sents else text_en[:200])
        # generate question by T5 if enabled
        if use_t5_qg and qg_model is not None:
            q = generate_question_with_t5(ans, ctx)
        else:
            q = f"What is {ans}?"
        # generate distractors from lesson
        distractors = generate_mcq(q, ans, ctx, text_en, num_distractors=3)
        # if we translated, translate back question/answer/distractors to Kurdish
        if translated_to_en and en_ku_mdl is not None:
            q_local = translate_text(q, src_to_tgt="en2ku")
            ans_local = translate_text(ans, src_to_tgt="en2ku")
            distractors_local = [translate_text(d, src_to_tgt="en2ku") for d in distractors]
        else:
            q_local, ans_local, distractors_local = q, ans, distractors
        cards.append({
            'type':'qa',
            'question': q_local,
            'answer': ans_local,
            'distractors': distractors_local,
            'context': ctx
        })
    # If not enough, generate cloze cards using noun-chunks
    if len(cards) < max_cards:
        doc = nlp(text_en)
        for nc in doc.noun_chunks:
            if len(cards) >= max_cards: break
            chunk = nc.text.strip()
            low = chunk.lower()
            if low in seen_answers or len(chunk) < 3: continue
            seen_answers.add(low)
            sents = sent_tokenize(text_en)
            ctx = next((s for s in sents if chunk in s), sents[0] if sents else text_en[:200])
            cloze_q = ctx.replace(chunk, "_____")
            # create simple MCQ distractors
            distractors = generate_mcq(cloze_q, chunk, ctx, text_en, num_distractors=3)
            if translated_to_en and en_ku_mdl is not None:
                q_local = translate_text(cloze_q, src_to_tgt="en2ku")
                ans_local = translate_text(chunk, src_to_tgt="en2ku")
                distractors_local = [translate_text(d, src_to_tgt="en2ku") for d in distractors]
            else:
                q_local, ans_local, distractors_local = cloze_q, chunk, distractors
            cards.append({
                'type':'cloze',
                'question': q_local,
                'answer': ans_local,
                'distractors': distractors_local,
                'context': ctx
            })
    # Final fallback: chunk-based explainers
    if len(cards) == 0:
        sents = sent_tokenize(text_en)
        for i, s in enumerate(sents[:max_cards]):
            cards.append({
                'type':'short',
                'question': f"Explain: {s[:80]}...",
                'answer': s,
                'distractors': [],
                'context': s
            })
    return cards[:max_cards]

# --- Export to CSV ---
def export_cards_to_csv(cards, filename=None):
    if filename is None:
        filename = "ile_flashcards_export.csv"
    out_path = os.path.join("/content", filename)
    with open(out_path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(["type", "question", "answer", "distractors", "context"])
        for c in cards:
            writer.writerow([c['type'], c['question'], c['answer'], "||".join(c.get('distractors',[])), c.get('context','')])
    return out_path, len(cards)

# --- Recommendation function (search) ---
def recommend_by_query(query, k=3):
    if len(lesson_texts)==0:
        return []
    q_emb = embed_model.encode([query]).astype('float32')
    D, I = index.search(q_emb, k)
    out = []
    for idx in I[0]:
        out.append({'index': int(idx), 'title': lesson_titles[idx], 'snippet': lesson_texts[idx][:800]})
    return out

# --- Gradio UI functions ---
def ui_recommend(query):
    r = recommend_by_query(query, k=3)
    if not r:
        return "No lessons loaded."
    text = ""
    for item in r:
        text += f"Index: {item['index']}  | Title: {item['title']}\nSnippet:\n{item['snippet']}\n\n---\n"
    return text

def ui_summarize(idx):
    try:
        i = int(idx)
        t = lesson_texts[i]
        lang = detect_language(t[:200])
        if lang.startswith("ku") and ku_en_mdl is not None:
            t_en = translate_text(t, src_to_tgt="ku2en")
            s_en = summarize_text(t_en)
            if en_ku_mdl is not None:
                s_local = translate_text(s_en, src_to_tgt="en2ku")
                return s_local
            else:
                return s_en
        else:
            return summarize_text(t)
    except Exception as e:
        return f"Error: {e}"

def ui_generate(idx, max_cards=10):
    try:
        i = int(idx)
        t = lesson_texts[i]
        lang = detect_language(t[:200])
        cards = generate_flashcards_and_mcqs(t, max_cards=max_cards, use_t5_qg=True, language_hint=lang)
        out = ""
        for n,c in enumerate(cards):
            out += f"{n+1}. [{c['type']}] Q: {c['question']}\n   A: {c['answer']}\n   Distractors: {', '.join(c.get('distractors',[]))}\n\n"
        return out
    except Exception as e:
        return f"Error: {e}"

def ui_export(idx, max_cards=50):
    try:
        i = int(idx)
        t = lesson_texts[i]
        lang = detect_language(t[:200])
        cards = generate_flashcards_and_mcqs(t, max_cards=max_cards, use_t5_qg=True, language_hint=lang)
        path, count = export_cards_to_csv(cards, filename=f"flashcards_lesson_{i}.csv")
        return f"Exported {count} cards to {path}"
    except Exception as e:
        return f"Error: {e}"

# --- Wrapped UI functions with progress logging ---
def ui_summarize_with_progress(idx, student_id="student1"):
    summary = ui_summarize(idx)  # your existing function
    try:
        log_progress(student_id, int(idx), action="summary")
    except Exception as e:
        print(f"Error logging progress for summarize: {e}")
    return summary

def ui_generate_with_progress(idx, max_cards=10, student_id="student1"):
    cards_text = ui_generate(idx, max_cards)  # your existing function
    try:
        log_progress(student_id, int(idx), action="flashcards")
    except Exception as e:
        print(f"Error logging progress for flashcards: {e}")
    return cards_text

def ui_export_with_progress(idx, max_cards=50, student_id="student1"):
    export_status = ui_export(idx, max_cards)  # existing export function
    try:
        log_progress(student_id, int(idx), action="export")
    except Exception as e:
        print(f"Error logging progress for export: {e}")
    return export_status


# --- Build Gradio App ---
with gr.Blocks() as demo:
    gr.Markdown("# ILE — QG + MCQ + Kurdish support")
    with gr.Row():
        student_id = gr.Textbox(label="Student ID", value="student1")
    with gr.Row():
        q = gr.Textbox(label="Search / Ask", placeholder="e.g. 'neural networks intro' or a question")
        btn = gr.Button("Recommend Lessons")
        out = gr.Textbox(label="Recommendations", lines=8)
    with gr.Row():
        idx = gr.Number(value=0, label="Lesson index (0-based)")
        sum_btn = gr.Button("Summarize Lesson")
        sum_out = gr.Textbox(label="Summary", lines=6)
    with gr.Row():
        gen_max = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max cards")
        gen_btn = gr.Button("Generate Q/A + MCQs")
        gen_out = gr.Textbox(label="Generated Cards", lines=12)
    with gr.Row():
        exp_max = gr.Slider(minimum=1, maximum=200, value=50, step=1, label="Export max cards")
        exp_btn = gr.Button("Export to CSV")
        exp_out = gr.Textbox(label="Export status", lines=2)

    btn.click(fn=ui_recommend, inputs=q, outputs=out)
    sum_btn.click(fn=ui_summarize_with_progress, inputs=[idx, student_id], outputs=sum_out)
    gen_btn.click(fn=ui_generate_with_progress, inputs=[idx, gen_max, student_id], outputs=gen_out)
    exp_btn.click(fn=ui_export_with_progress, inputs=[idx, exp_max, student_id], outputs=exp_out)

print("Launching Gradio app...")
demo.launch(share=False)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading embedding model...
Loading summarizer pipeline...


Device set to use cpu


Trying to load valhalla/t5-small-qa-qg-hl ...
Loaded valhalla/t5-small-qa-qg-hl
Trying to load lingvanex/kurdish-to-english-translation ...
Failed to load lingvanex/kurdish-to-english-translation: <class 'transformers.models.deprecated.van.configuration_van.VanConfig'>
Trying to load abdulhade/fine-tuned-MarianMTKurdish ...




Loaded abdulhade/fine-tuned-MarianMTKurdish
Trying to load lingvanex/kurdish-to-english-translation ...
Failed to load lingvanex/kurdish-to-english-translation: <class 'transformers.models.deprecated.van.configuration_van.VanConfig'>
Trying to load abdulhade/fine-tuned-MarianMTKurdish ...
Loaded abdulhade/fine-tuned-MarianMTKurdish
Loaded 1 lessons.
FAISS index built.
Launching Gradio app...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [12]:
import os
import shutil

source_path = "/content/business_basics_1_english_everywhere_by_stephanie_jones_compress.pdf"
lessons_dir = "lessons"
destination_path = os.path.join(lessons_dir, os.path.basename(source_path))

# Create the 'lessons' directory if it doesn't exist
os.makedirs(lessons_dir, exist_ok=True)

# Move the file
shutil.move(source_path, destination_path)

print(f"Moved '{source_path}' to '{destination_path}'")

FileNotFoundError: [Errno 2] No such file or directory: '/content/business_basics_1_english_everywhere_by_stephanie_jones_compress.pdf'

In [None]:
def log_progress(student_id, lesson_index, action, completed=True, score=None):
    timestamp = datetime.datetime.now().isoformat()
    conn = sqlite3.connect("ile_progress.db")
    c = conn.cursor()
    c.execute("INSERT INTO progress VALUES (?, ?, ?, ?, ?, ?)",
              (student_id, lesson_index, action, completed, score if score else 0, timestamp))
    conn.commit()
    conn.close()