In [11]:
import re
import os
import sys
import csv
import time
from docx import Document
import pdfplumber
import ollama

# --- For Windows users... ---
if sys.platform == 'win32':
    try: import win32com.client as win32
    except ImportError: print("Warning: .doc support disabled."); win32 = None
else: win32 = None

# ================================================================================
# PART 0: DEPARTMENT & KNOWLEDGE AREA MAPPINGS
# ================================================================================
# Department name -> subject code(s). A value is either a single code string
# or a list of codes when one department owns several subject prefixes.
DEPARTMENT_MAP = {
    'Africana Studies': 'AFRI',
    'American Studies': ['AMST', 'ETHN', 'NAIS', 'PHUM', 'STS'],
    'Anthropology': 'ANTH',
    'Applied Mathematics': 'APMA',
    'Archaeology and the Ancient World': 'ARCH',
    'Bio-Medical (PLME & MED)': 'MED',
    'Biology': 'BIOL',
    'Brown Arts Institute': 'ARTS',
    'Business, Entrepreneurship, Organizations': 'BEO',
    'Center for Language Studies': ['ARAB', 'EINT', 'HNDI', 'LANG', 'PRSN', 'SIGN', 'TKSH', 'YORU'],
    'Chemistry': 'CHEM',
    'Classics': ['CLAS', 'CREK', 'LATN', 'MGRK', 'SANS'],
    'Cognitive, Linguistic, and Psychological Sciences': ['CLPS', 'LING'],
    'Cognitive and Psychological Sciences': 'CPSY',
    'Cogut Institute for the Humanities': 'HMAN',
    'Comparative Literature': 'COLT',
    'Computer Science': 'CSCI',
    'Data Science Initiative': ['DATA', 'DSIO'],
    'Early Modern World': 'EMOW',
    'Earth, Environmental and Planetary Sciences': 'EEPS',
    'East Asian Studies': ['CHIN', 'EAST', 'JAPN', 'KREA', 'VIET'],
    # 'ECON' is the subject code printed on syllabi. 'ENCO' (a transposition)
    # is kept as an alias because COURSE_CODE_MAP still emits it when it
    # matches the spelled-out department name.
    'Economics': ['ECON', 'ENCO'],
    'Education': 'EDUC',
    'Egyptology and Assyriology': ['ASYR', 'EGYT'],
    'Engineering': 'ENGN',
    'English': 'ENGL',
    'Environmental Studies': 'ENVS',
    'French Studies': 'FREN',
    'German Studies': 'GRMN',
    'Hispanic Studies': 'HISP',
    'History': 'HIST',
    'History of Art and Architecture': 'HIAA',
    'Italian Studies': 'ITAL',
    'Judaic Studies': ['HEBR', 'JUDS'],
    'Linguistics': 'LING',
    'Literary Arts': 'LITR',
    'Mathematics': 'MATH',
    'Medieval Studies': 'MDVL',
    'Middle East Studies': 'MES',
    'Modern Culture and Media': 'MCM',
    'Music': 'MUSC',
    'Neuroscience': 'NEUR',
    # NOTE(review): "Teach and Rearch" looks like a typo for "Teaching and
    # Research", but the same spelling is the key in KNOWLEDGE_AREA_MAP, so it
    # must be changed in both places at once.
    'Pembroke Center Teach and Rearch': 'GNSS',
    'Philosophy': 'PHIL',
    'Physics': 'PHYS',
    'Political Science': 'POLS',
    'Portuguese and Brazilian Studies': 'POBS',
    'Public Health': ['BHDS', 'GPHP', 'HCL', 'PHP'],
    'Religious Studies': ['COST', 'RELS'],
    'Slavic Studies': ['CZCH','PLSH','RUSS', 'SLAV'],
    'Sociology': 'SOC',
    'Theatre Arts and Performance Studies': 'TAPS',
    'Urban Studies': 'URBN',
    'Visual Art': 'VISA',
    'Watson Institute': ['IAPA', 'MPA'],
}

# Reverse lookup: subject code -> department name.
# When a code is listed under two departments (e.g. 'LING'), the department
# that appears later in DEPARTMENT_MAP wins because it overwrites the entry.
CODE_TO_DEPT = {}
for dept, codes in DEPARTMENT_MAP.items():
    for c in (codes if isinstance(codes, list) else [codes]):
        CODE_TO_DEPT[c] = dept
# map each department to a knowledge area

# Department name -> broad knowledge area. Keys must match the department
# names used in DEPARTMENT_MAP exactly; a department missing here is reported
# as 'Unknown' by analyze_syllabus().
KNOWLEDGE_AREA_MAP = {
    'Africana Studies': 'Social Sciences',
    'American Studies': 'Social Sciences',
    'Anthropology': 'Social Sciences',
    'Applied Mathematics': 'Physical Sciences',
    'Archaeology and the Ancient World': 'Humanities',
    'Bio-Medical (PLME & MED)': 'Life Sciences',
    'Biology': 'Life Sciences',
    'Brown Arts Institute': 'Arts',
    'Business, Entrepreneurship, Organizations': 'Social Sciences',
    'Center for Language Studies': 'Humanities',
    'Chemistry': 'Physical Sciences',
    'Classics': 'Humanities',
    # Previously missing, which made every CLPS course resolve to 'Unknown'.
    # Classified like 'Cognitive and Psychological Sciences' below.
    'Cognitive, Linguistic, and Psychological Sciences': 'Life Sciences',
    'Cognitive and Psychological Sciences': 'Life Sciences',
    'Cogut Institute for the Humanities': 'Humanities',
    'Comparative Literature': 'Humanities',
    'Computer Science': 'Physical Sciences',
    'Data Science Initiative': 'Physical Sciences',
    'Early Modern World': 'Humanities',
    'Earth, Environmental and Planetary Sciences': 'Physical Sciences',
    'East Asian Studies': 'Humanities',
    'Economics': 'Social Sciences',
    'Education': 'Social Sciences',
    'Egyptology and Assyriology': 'Humanities',
    'Engineering': 'Physical Sciences',
    'English': 'Humanities',
    'Environmental Studies': 'Physical Sciences',
    'French Studies': 'Humanities',
    'German Studies': 'Humanities',
    'Hispanic Studies': 'Humanities',
    'History': 'Social Sciences',
    'History of Art and Architecture': 'Humanities',
    'Italian Studies': 'Humanities',
    'Judaic Studies': 'Humanities',
    'Linguistics': 'Social Sciences',
    'Literary Arts': 'Humanities',
    'Mathematics': 'Physical Sciences',
    'Medieval Studies': 'Humanities',
    'Middle East Studies': 'Humanities',
    'Modern Culture and Media': 'Humanities',
    'Music': 'Humanities',
    'Neuroscience': 'Life Sciences',
    'Pembroke Center Teach and Rearch': 'Humanities',
    'Philosophy': 'Humanities',
    'Physics': 'Physical Sciences',
    'Political Science': 'Social Sciences',
    'Portuguese and Brazilian Studies': 'Humanities',
    'Public Health': 'Life Sciences',
    'Religious Studies': 'Humanities',
    'Slavic Studies': 'Humanities',
    'Sociology': 'Social Sciences',
    'Theatre Arts and Performance Studies': 'Humanities',
    'Urban Studies': 'Social Sciences',
    'Visual Art': 'Humanities',
    'Watson Institute': 'Social Sciences',
}

# ==============================================================================
# PART 1: DEFINITIONS
# ==============================================================================
# Terms whose presence marks a paragraph as AI-related. Order is preserved
# because the list is joined into a regex alternation further down.
AI_CONTEXT_WORDS = [
    'ai',
    'artificial intelligence',
    'generative',
    'chatgpt',
    'llm',
    'copilot',
    'bard',
    'large language model',
    'gemini',
    'dall-e',
    'gpt',
    'midjourney',
    'stable diffusion',
    'ai tool',
    'ai tools',
]

# Terms that indicate rule/policy language; combined with an AI term they
# upgrade a paragraph from a weak to a strong mention.
POLICY_KEYWORDS = [
    'academic integrity',
    'academic dishonesty',
    'plagiarism',
    'cheating',
    'unauthorized',
    'unauthorized use',
    'unauthorized assistance',
    'citation',
    'cite',
    'attribution',
    'acknowledge',
    'permitted',
    'allowed',
    'prohibited',
    'forbidden',
    'disclosure',
    'ethical use',
    'responsible use',
    'use',
    'using',
    'assistance',
    'help',
    'guidance',
    'policy',
    'rule',
]
# Spelled-out department name -> subject code, used by
# find_course_code_rule_based() to turn text such as "<name> 1901R" into
# "<code> 1901R". The dict is scanned in insertion order, so entry order is
# part of the behaviour.
# Keys with a numeric suffix ("American Studies 1", "Classics 2", ...) carry
# the additional codes of departments that have more than one; presumably the
# suffix only exists to keep the dict keys unique -- TODO confirm.
# NOTE(review): 'ENCO' for Economics looks like a transposition of 'ECON'.
# DEPARTMENT_MAP resolves the value emitted here, so change both together.
COURSE_CODE_MAP = {
    'Africana Studies': 'AFRI',
    'American Studies': 'AMST', 
    'American Studies 1': 'ETHN', 
    'American Studies 2': 'NAIS', 
    'American Studies 3': 'PHUM', 
    'American Studies 4': 'STS',
    'Anthropology': 'ANTH',
    'Applied Mathematics': 'APMA',
    'Archaeology and the Ancient World': 'ARCH',
    'Bio-Medical (PLME & MED)': 'MED',
    'Biology': 'BIOL',
    'Brown Arts Institute': 'ARTS',
    'Business, Entrepreneurship, Organizations': 'BEO',
    'Center for Language Studies': 'ARAB', 
    'Center for Language Studies 1': 'EINT', 
    'Center for Language Studies 2': 'HNDI', 
    'Center for Language Studies 3': 'LANG', 
    'Center for Language Studies 4': 'PRSN', 
    'Center for Language Studies 5': 'SIGN', 
    'Center for Language Studies 6': 'TKSH', 
    'Center for Language Studies 7': 'YORU',
    'Chemistry': 'CHEM',
    'Classics': 'CLAS', 
    'Classics 1': 'CREK', 
    'Classics 2': 'LATN', 
    'Classics 3':'MGRK', 
    'Classics 4':'SANS',
    'Cognitive, Linguistic, and Psychological Sciences': 'CLPS', 
    'Cognitive, Linguistic, and Psychological Sciences 1': 'LING',
    'Cognitive and Psychological Sciences': 'CPSY',
    'Cogut Institute for the Humanities': 'HMAN',
    'Comparative Literature': 'COLT',
    'Computer Science': 'CSCI',
    'Data Science Initiative': 'DATA', 
    'Data Science Initiative 1': 'DSIO',
    'Early Modern World': 'EMOW',
    'Earth, Environmental and Planetary Sciences': 'EEPS',
    'East Asian Studies': 'EAST', 
    'East Asian Studies 1': 'CHIN',
    'East Asian Studies 2': 'JAPN', 
    'East Asian Studies 3': 'KREA', 
    'East Asian Studies 4': 'VIET',
    'Economics': 'ENCO',  # NOTE(review): probably meant 'ECON' (see header comment)
    'Education': 'EDUC',
    'Assyriology': 'ASYR',
    'Egyptology': 'EGYT',
    'Engineering': 'ENGN',
    'English': 'ENGL',
    'Environmental Studies': 'ENVS',
    'French Studies': 'FREN',
    'German Studies': 'GRMN',
    'Hispanic Studies': 'HISP',
    'History': 'HIST',
    'History of Art and Architecture': 'HIAA',
    'Italian Studies': 'ITAL',
    'Judaic Studies': 'JUDS',
    'Judaic Studies 1': 'HEBR',
    'Linguistics': 'LING',
    'Literary Arts': 'LITR',
    'Mathematics': 'MATH',
    'Medieval Studies': 'MDVL',
    'Middle East Studies': 'MES',
    'Modern Culture and Media': 'MCM',
    'Music': 'MUSC',
    'Neuroscience': 'NEUR',
    'Pembroke Center Teach and Rearch': 'GNSS',
    'Philosophy': 'PHIL',
    'Physics': 'PHYS',
    'Political Science': 'POLS',
    'Portuguese and Brazilian Studies': 'POBS',
    'Public Health': 'BHDS', 
    'Public Health 1': 'GPHP', 
    'Public Health 2': 'HCL', 
    'Public Health 3': 'PHP',
    'Religious Studies': 'RELS',
    'Religious Studies 1': 'COST',
    'Slavic Studies': 'SLAV',
    'Slavic Studies 1': 'CZCH',
    'Slavic Studies 2': 'PLSH',
    'Slavic Studies 3': 'RUSS',
    'Sociology': 'SOC',
    'Theatre Arts and Performance Studies': 'TAPS',
    'Urban Studies': 'URBN',
    'Visual Art': 'VISA',
    'Watson Institute': 'IAPA', 
    'Watson Institute 1': 'MPA',
}

# Matches a line that mentions an AI term and the word "policy" in either
# order (e.g. "Generative AI Policy", "Policy on Artificial Intelligence").
# Callers additionally require the line to be short before treating it as a
# section header. Both multi-word AI terms tolerate any amount of whitespace
# between their words (previously "artificial intelligence" allowed exactly
# one whitespace character while "generative ai" allowed several).
HEADER_PATTERN = re.compile(
    r'.*\b('
    r'(ai|artificial\s+intelligence|generative\s+ai)\s+.*\bpolicy'
    r'|'
    r'policy\s+.*\b(ai|artificial\s+intelligence|generative\s+ai)'
    r')\b.*',
    re.IGNORECASE
)

# Whole-word, case-insensitive alternations built from the keyword lists.
# Keywords are passed through re.escape so that an entry containing regex
# metacharacters (".", "+", "(", ...) is matched literally instead of
# silently changing or breaking the pattern. List order is kept as-is.
AI_TRIGGER_PATTERN = re.compile(
    r'\b(' + '|'.join(re.escape(word) for word in AI_CONTEXT_WORDS) + r')\b',
    re.IGNORECASE
)
POLICY_PATTERN = re.compile(
    r'\b(' + '|'.join(re.escape(word) for word in POLICY_KEYWORDS) + r')\b',
    re.IGNORECASE
)

# ==============================================================================
# PART 2: TEXT EXTRACTION 
# ==============================================================================

def extract_paragraphs_from_doc(doc_path):
    """
    Extract the non-empty paragraphs of a legacy .doc file by automating
    Microsoft Word through pywin32.

    Args:
        doc_path: Path to the .doc file; converted to an absolute path before
            it is handed to Word.

    Returns:
        list[str]: Stripped paragraph texts in document order. An empty list
        is returned when pywin32 is unavailable (module-level ``win32`` is
        None) or when Word fails to open/read the file.
    """
    if not win32:
        print(f"Skipping .doc file '{os.path.basename(doc_path)}' as 'pywin32' is not available on this system.")
        return []

    word = None
    doc = None
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(doc_path))
        # Read and strip each paragraph's text once instead of twice.
        stripped = (p.Range.Text.strip() for p in doc.Paragraphs)
        return [text for text in stripped if text]
    except Exception as e:
        print(f"Error processing .doc file with MS Word: {e}")
        return []
    finally:
        # Always try to quit Word, even if closing the document fails;
        # otherwise a hidden Word process is left running for every bad file.
        try:
            if doc is not None:
                doc.Close(False)
        except Exception as e:
            print(f"Warning: could not close .doc file cleanly: {e}")
        finally:
            if word is not None:
                word.Quit()

def extract_paragraphs_from_docx(doc_path):
    """
    Collect the text blocks of a .docx file.

    Body paragraphs come first (unstripped, but only those that contain
    non-whitespace text), followed by the text of every non-blank table cell,
    visited table by table, row by row.

    Returns:
        list[str]: The collected blocks, or an empty list if anything goes
        wrong while opening or reading the file (the error is printed).
    """
    try:
        document = Document(doc_path)

        collected = []
        for para in document.paragraphs:
            if para.text.strip():
                collected.append(para.text)

        cell_texts = (
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        )
        collected.extend(text for text in cell_texts if text.strip())
        return collected
    except Exception as e:
        print(f"Error reading DOCX file {doc_path}: {e}")
        return []

def _reconstruct_paragraphs_from_page(page):
    """
    Rebuild paragraphs from the word boxes of one page.

    Words are grouped into lines by their ``top`` coordinate (rounded to two
    decimals), ordered left to right by ``x0`` within a line, and lines are
    ordered top to bottom. A new paragraph starts whenever the vertical gap
    to the previous line exceeds 1.5x the page's average line-to-line gap
    (12 is assumed as the average when the page has a single line).

    Args:
        page: Object exposing ``extract_words(...)`` that returns dicts with
            at least the keys 'text', 'top' and 'x0'.

    Returns:
        list[str]: Paragraph strings in reading order; empty if the page has
        no words.
    """
    page_words = page.extract_words(keep_blank_chars=False, x_tolerance=2)
    if not page_words:
        return []

    # Bucket words by (rounded) vertical position: one bucket per text line.
    rows = {}
    for w in page_words:
        rows.setdefault(round(w['top'], 2), []).append(w)

    # Lines top-to-bottom, each with its words left-to-right.
    ordered = [
        {
            'top': top,
            'text': " ".join(w['text'] for w in sorted(row, key=lambda w: w['x0'])),
        }
        for top, row in sorted(rows.items(), key=lambda entry: entry[0])
    ]
    if not ordered:
        return []

    # Vertical distance between each pair of consecutive lines.
    gaps = [below['top'] - above['top'] for above, below in zip(ordered, ordered[1:])]
    mean_gap = sum(gaps) / len(gaps) if gaps else 12
    break_limit = mean_gap * 1.5

    paragraphs = []
    buffer = ordered[0]['text']
    for above, below in zip(ordered, ordered[1:]):
        if (below['top'] - above['top']) > break_limit:
            paragraphs.append(buffer)
            buffer = below['text']
        else:
            buffer += " " + below['text']
    paragraphs.append(buffer)

    return paragraphs

def extract_paragraphs_from_pdf(pdf_path):
    """
    Extract paragraphs from every page of a PDF and stitch together
    paragraphs that were split by a page break.

    The last paragraph of a page is held back when it does not end with
    sentence-closing punctuation and is prepended to the first paragraph of
    the next page that yields text. Whatever is still held back after the
    last page is emitted as its own paragraph.

    Returns:
        list[str]: Paragraphs in reading order, or an empty list if opening
        or parsing the PDF raises (the error is printed).
    """
    closing_marks = ('.', '?', '!', '"', "'", ')', ':', ';')
    collected = []
    pending = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_paragraphs = _reconstruct_paragraphs_from_page(page)
                if page_paragraphs:
                    # Re-attach text carried over from the previous page.
                    if pending:
                        page_paragraphs[0] = pending + " " + page_paragraphs[0]
                        pending = ""
                    # An unterminated final paragraph probably continues on
                    # the next page, so hold it back.
                    if not page_paragraphs[-1].strip().endswith(closing_marks):
                        pending = page_paragraphs.pop()
                    collected.extend(page_paragraphs)
        if pending:
            collected.append(pending)
        print("Successfully extracted paragraphs using coordinate-based method.")
        return collected
    except Exception as e:
        print(f"Coordinate-based parsing failed: {e}. No OCR fallback implemented.")
        return []

# ==============================================================================
# PART 3: ANALYSIS LOGIC
# ==============================================================================

def find_course_code_rule_based(paragraphs, search_limit=30, code_map=None):
    """
    Find the course code (e.g. "ENGL 1901R") near the top of a syllabus.

    Two passes are made over the first ``search_limit`` paragraphs:

    1. Spelled-out department names: text such as "English 1901R" is mapped
       to "<code> <number>" via ``code_map``. Paragraphs are scanned in
       document order, so the earliest mention wins; within one paragraph the
       map's insertion order decides.
    2. Fallback: an upper-case 2-4 letter prefix (optionally several joined
       by "/") followed by a 3-4 digit number with an optional letter.

    Args:
        paragraphs: List of paragraph strings in document order.
        search_limit: Number of leading paragraphs to inspect.
        code_map: Mapping of department name -> subject code. Defaults to the
            module-level COURSE_CODE_MAP.

    Returns:
        str | None: "<PREFIX> <NUMBER>" or None if nothing matched.
    """
    if code_map is None:
        code_map = COURSE_CODE_MAP
    head = paragraphs[:search_limit]

    # Compile each name pattern once per call rather than once per paragraph.
    name_patterns = [
        (re.compile(r'\b' + re.escape(long_name) + r'\s*(\d{3,4}[A-Z]?)\b', re.IGNORECASE), short_code)
        for long_name, short_code in code_map.items()
    ]
    # Paragraph-major scan: the course's own code normally appears before
    # cross-references to other courses, so the earliest paragraph must win
    # regardless of where the department sits in the map.
    for para in head:
        for pattern, short_code in name_patterns:
            if match := pattern.search(para):
                # The match is case-insensitive; normalise a suffix like "r".
                return f"{short_code} {match.group(1).upper()}"

    semester_terms = ['FALL', 'SPRING', 'WINTER', 'SUMMER']
    # The lookahead rejects prefixes that start with a semester name so that
    # "FALL 2024" is not mistaken for a course code.
    fallback_pattern = re.compile(
        r'\b((?!' + '|'.join(semester_terms) + r')[A-Z]{2,4}(\s*/\s*[A-Z]{2,4})*)\s*(\d{3,4}[A-Z]?)\b'
    )
    for para in head:
        # Search the original text directly. Searching an upper-cased copy
        # let lower-case words ("in 2024") shadow a genuine code later in the
        # same paragraph, and upper-casing can shift character offsets.
        if match := fallback_pattern.search(para):
            dept_part = "".join(match.group(1).split())
            return f"{dept_part} {match.group(3)}"
    return None

def _normalize_for_comparison(text: str) -> str:
    """
    Reduce text to a canonical form for tolerant comparison: keep only the
    alphanumeric characters and lowercase each of them, dropping whitespace,
    punctuation and every other symbol.
    """
    kept_chars = filter(str.isalnum, text)
    return "".join(map(str.lower, kept_chars))

def refine_policy_with_ollama(context_block, model_name="deepseek-r1:1.5b"):
    """
    Ask a local Ollama model to quote the AI policy contained in
    ``context_block``.

    Up to three attempts are made, each with a more insistent prompt. An
    answer is accepted only if it wraps the quote in the two separators, the
    quote is non-empty, and the quote -- after normalisation to lowercase
    alphanumerics -- occurs verbatim in the normalised context.

    Returns:
        tuple: (result, flags)
            result: one-element list ``[{'text': ..., 'reason': ...}]`` on
                success, otherwise an empty list.
            flags: ``[]`` on success, ``['LLM_FINAL_FAILURE']`` when every
                attempt was rejected, ``['LLM_CONNECTION_ERROR']`` when any
                exception was raised while querying or parsing.
    """
    print(f"\n--- Handing off to Ollama model '{model_name}' to refine the policy block... ---")

    SEPARATOR_START = "[---POLICY_TEXT_START---]"
    SEPARATOR_END = "[---POLICY_TEXT_END---]"
    MAX_RETRIES = 3

    # One prompt per attempt, escalating from "find it if present" to
    # "it is there, extract it".
    prompts = [
        f"""Analyze the syllabus section to find the policy on "Artificial Intelligence".
Instructions:
1. Provide a brief, one-sentence explanation.
2. Output the start separator: {SEPARATOR_START}
3. Quote the complete AI policy verbatim.
4. Output the end separator: {SEPARATOR_END}
If no policy is found, respond ONLY with "None".
Syllabus Section:\n---\n{context_block}\n---""",

        f"""RE-EVALUATION: The previous analysis was likely wrong. The text IS KNOWN to contain a policy on "Artificial Intelligence". Locate and extract it.
Instructions:
1. Explain your corrected finding in one sentence.
2. Output the start separator: {SEPARATOR_START}
3. Quote the policy verbatim. DO NOT summarize.
4. Output the end separator: {SEPARATOR_END}
Do not respond "None". Find the policy.
Syllabus Section:\n---\n{context_block}\n---""",

        f"""FINAL ATTEMPT: You MUST extract the policy about "Artificial Intelligence". It is there. Your task is to EXTRACT it, not to decide if it exists.
Instructions:
1. Find the rules for using AI.
2. Output: {SEPARATOR_START}
3. Copy the paragraph(s) exactly.
4. Output: {SEPARATOR_END}
Extract it now.
Syllabus Section:\n---\n{context_block}\n---"""
    ]

    normalized_context = _normalize_for_comparison(context_block)

    try:
        for attempt in range(MAX_RETRIES):
            attempt_no = attempt + 1
            print(f"INFO: Attempt {attempt_no} of {MAX_RETRIES} to query LLM...")

            response = ollama.chat(
                model=model_name,
                messages=[{'role': 'user', 'content': prompts[attempt]}],
            )
            reply = response['message']['content'].strip()

            # Work out why this reply is unusable; a usable reply returns.
            if reply.upper() == "NONE":
                failure = "LLM responded 'None'."
            elif SEPARATOR_START not in reply or SEPARATOR_END not in reply:
                failure = "LLM did not use required separators."
            else:
                quoted = reply.split(SEPARATOR_START, 1)[1]
                policy_text = quoted.split(SEPARATOR_END, 1)[0].strip()
                if not policy_text:
                    failure = "LLM returned an empty policy."
                elif _normalize_for_comparison(policy_text) not in normalized_context:
                    failure = "Normalized quote not in context (likely a summary)."
                else:
                    print("INFO: LLM refinement successful and passed verification.")
                    result = [{'text': policy_text, 'reason': f'Refined by {model_name} (Attempt {attempt_no})'}]
                    return (result, [])

            print(f"WARN: Attempt {attempt_no} failed. {failure}")
            # Short pause between attempts, but not after the last one.
            if attempt_no < MAX_RETRIES:
                time.sleep(1)

        return ([], ['LLM_FINAL_FAILURE'])

    except Exception as e:
        print(f"--- ERROR: Could not connect to Ollama: {e} ---")
        return ([], ['LLM_CONNECTION_ERROR'])

def analyze_policy_with_clustering(paragraphs):
    """
    Locate the paragraph block most likely to hold the AI policy.

    Paragraphs are tagged as header (weight 10), strong mention (AI term plus
    policy term, weight 3) or weak mention (AI term only, weight 1). Tagged
    paragraphs at most three positions apart form a cluster. Clusters are
    ranked by summed weight plus a 20-point header bonus; ties are broken by
    the share of policy keywords in the cluster text. The winning block is
    extended forward over following paragraphs until one with five words or
    fewer (or the end of the document) is reached.

    Returns:
        tuple: (result, flags)
            result: ``[{'text': ..., 'reason': ...}]`` or ``[]`` when no
                AI-related paragraph exists.
            flags: contains 'MULTIPLE_CLUSTERS' when more than one cluster
                was found.
    """
    flags = []

    # --- Tag every paragraph that mentions AI ---
    triggers = []
    for idx, text in enumerate(paragraphs):
        if HEADER_PATTERN.match(text) and len(text.split()) < 15:
            triggers.append({'index': idx, 'type': 'header', 'weight': 10})
            continue
        if not AI_TRIGGER_PATTERN.search(text):
            continue
        if POLICY_PATTERN.search(text):
            triggers.append({'index': idx, 'type': 'strong_mention', 'weight': 3})
        else:
            triggers.append({'index': idx, 'type': 'weak_mention', 'weight': 1})

    if not triggers:
        return ([], [])

    # --- Group triggers that lie within three paragraphs of each other ---
    clusters = [[triggers[0]]]
    for trig in triggers[1:]:
        if trig['index'] - clusters[-1][-1]['index'] <= 3:
            clusters[-1].append(trig)
        else:
            clusters.append([trig])

    if len(clusters) > 1:
        print(f"INFO: Found {len(clusters)} distinct AI-related clusters. Flagging for review.")
        flags.append('MULTIPLE_CLUSTERS')

    # --- Rank clusters: primary score first, keyword density as tie-breaker ---
    best = {'score': -1, 'policy_density': -1, 'block': []}
    total = len(paragraphs)

    for cluster in clusters:
        positions = [t['index'] for t in cluster]
        first, last = min(positions), max(positions)

        header_inside = any(t['type'] == 'header' for t in cluster)
        has_header = header_inside or (first > 0 and HEADER_PATTERN.match(paragraphs[first - 1]))
        score = sum(t['weight'] for t in cluster) + (20 if has_header else 0)

        # Include the preceding paragraph only when it is the header.
        block_start = first - 1 if has_header and not header_inside else first
        block_end = min(total, last + 1)
        block_text = "\n\n".join(paragraphs[block_start:block_end])
        # Epsilon keeps the division defined for an empty block.
        policy_density = len(POLICY_PATTERN.findall(block_text)) / (len(block_text.split()) + 1e-6)

        if score > best['score']:
            pass
        elif score == best['score'] and policy_density > best['policy_density']:
            print(f"INFO: Tie-breaker activated. New cluster with density {policy_density:.4f} is better than previous {best['policy_density']:.4f}.")
        else:
            continue

        # Extend the block over the paragraphs that follow, stopping at the
        # first one with five words or fewer.
        extended_end = block_end
        while extended_end < total and len(paragraphs[extended_end].split()) > 5:
            extended_end += 1

        best = {
            'score': score,
            'policy_density': policy_density,
            'block': paragraphs[block_start:extended_end],
        }

    if not best['block']:
        return ([], flags)

    final_text = "\n\n".join(best['block'])
    # Score and density are embedded in the reason string to aid debugging.
    reason_str = (
        f"Clustered Policy Block (Score: {best['score']}, "
        f"Density: {best['policy_density']:.4f})"
    )
    return ([{'text': final_text, 'reason': reason_str}], flags)

# ==============================================================================
# PART 4: MAIN CONTROLLER 
# ==============================================================================
def analyze_syllabus(file_path):
    """
    Analyze one syllabus file and summarize its AI policy.

    The file is converted to paragraphs (.doc, .docx or .pdf), the course
    code is looked up and mapped to a department and knowledge area, and the
    best AI-policy block is selected by clustering. Blocks longer than 150
    words are handed to the LLM for refinement; if that fails, no policy text
    is reported and the failure flag is set.

    Returns:
        list: Always six elements,
            [course_code, department, knowledge_area, policy_text,
             flag_multiple_clusters, flag_llm_failure].
        ``course_code`` and ``policy_text`` are None when not found;
        ``department`` and ``knowledge_area`` fall back to 'Unknown'; the two
        flags are 0 or 1. For an unsupported file type or a file without
        extractable text the result is
        [None, 'Unknown', 'Unknown', None, 0, 0].
    """
    print(f"\n{'='*20} Analyzing Syllabus: {os.path.basename(file_path)} {'='*20}")

    # Same shape as the normal result so callers can always unpack six
    # values (the early exits used to return only four).
    empty_result = [None, 'Unknown', 'Unknown', None, 0, 0]

    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.doc':
        paragraphs = extract_paragraphs_from_doc(file_path)
    elif ext == '.docx':
        paragraphs = extract_paragraphs_from_docx(file_path)
    elif ext == '.pdf':
        paragraphs = extract_paragraphs_from_pdf(file_path)
    else:
        print(f"Error: Unsupported file type '{ext}'.")
        return empty_result

    if not paragraphs:
        print("Could not extract any usable text.")
        return empty_result

    print(f"Extracted {len(paragraphs)} distinct paragraphs. Analyzing with clustering engine...")
    course_code = find_course_code_rule_based(paragraphs)

    # Derive department and knowledge area from the code's subject prefix.
    department = 'Unknown'
    knowledge_area = 'Unknown'
    if course_code:
        prefix = course_code.split()[0]
        department = CODE_TO_DEPT.get(prefix, 'Unknown')
        knowledge_area = KNOWLEDGE_AREA_MAP.get(department, 'Unknown')

    ai_policy_sections, cluster_flags = analyze_policy_with_clustering(paragraphs)
    flag_multiple_clusters = 1 if 'MULTIPLE_CLUSTERS' in cluster_flags else 0
    flag_llm_failure = 0

    policy_text = None
    if ai_policy_sections:
        policy_block = ai_policy_sections[0]['text']
        max_word_count = 150
        word_count = len(policy_block.split())

        if word_count > max_word_count:
            print(f"INFO: Policy block is long ({word_count} words). Engaging LLM.")
            refined_sections, llm_flags = refine_policy_with_ollama(policy_block)

            if llm_flags:
                # Any flag from the LLM step means refinement failed; report
                # no policy text rather than the unrefined long block.
                flag_llm_failure = 1
            elif refined_sections:
                policy_text = refined_sections[0]['text']
        else:
            print("INFO: Clustered policy block passed quality checks.")
            policy_text = policy_block

    print("--- Analysis processing complete. ---")
    return [course_code, department, knowledge_area, policy_text, flag_multiple_clusters, flag_llm_failure]

In [13]:
# Batch mode: analyze every file in the input folder and write a summary CSV.
# The folder is 'test' by default; change input_dir to use another one.
if __name__ == '__main__':
    input_dir = 'test'
    results = []
    # os.listdir() returns entries in arbitrary order; sort them so the CSV
    # rows come out in the same order on every run.
    for fname in sorted(os.listdir(input_dir)):
        path = os.path.join(input_dir, fname)
        if not os.path.isfile(path):
            continue  # sub-directories are not syllabi
        res = analyze_syllabus(path)
        if res and res[0]:
            code, dept, area, policy, multi, llm_fail = res
            results.append({
                'Course Code': code,
                'Department': dept,
                'Knowledge Area': area,
                'AI Policy': policy or '',
                'Multiple Clusters': multi,
                'LLM Failure': llm_fail
            })
        else:
            # Rows are keyed by course code; say so instead of dropping the
            # file silently.
            print(f"Skipping '{fname}': no course code could be determined.")
    csv_file = 'syllabus_summary.csv'
    # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
    with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'Course Code', 'Department', 'Knowledge Area', 'AI Policy', 'Multiple Clusters', 'LLM Failure'
        ])
        writer.writeheader()
        writer.writerows(results)
    print(f"Summary CSV written to {csv_file}")


Successfully extracted paragraphs using coordinate-based method.
Extracted 31 distinct paragraphs. Analyzing with clustering engine...
INFO: Clustered policy block passed quality checks.
--- Analysis processing complete. ---

Extracted 101 distinct paragraphs. Analyzing with clustering engine...
--- Analysis processing complete. ---

Extracted 197 distinct paragraphs. Analyzing with clustering engine...
--- Analysis processing complete. ---

Extracted 182 distinct paragraphs. Analyzing with clustering engine...
INFO: Policy block is long (422 words). Engaging LLM.

--- Handing off to Ollama model 'deepseek-r1:1.5b' to refine the policy block... ---
INFO: Attempt 1 of 3 to query LLM...
WARN: Attempt 1 failed. Normalized quote not in context (likely a summary).
INFO: Attempt 2 of 3 to query LLM...
WARN: Attempt 2 failed. LLM did not use required separators.
INFO: Attempt 3 of 3 to query LLM...
WARN: Attempt 3 failed. Normalized quote not in context (likely a summary).
--- Analysis proce

In [5]:
# Explicit-list mode: analyze the files named below and write a summary CSV.
# Edit the list to choose which syllabi are processed.
if __name__ == '__main__':
    columns = [
        'Course Code', 'Department', 'Knowledge Area', 'AI Policy', 'Multiple Clusters', 'LLM Failure'
    ]
    files = [
        '8C2WFwQPrq6vcFU0Yml3vHcL1CYb5HnYcxp5s7A5.docx',
        'UMeRwRyvXAKAH6DwCkTmFyIunq3ti97bJPARlu7C.docx',
        '0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf',
        '7Y6H93I4L6F5bcplrUpPbf5AjQcypbJfrevTSiNf.doc'
    ]

    results = []
    for fname in files:
        res = analyze_syllabus(fname)
        # Only syllabi with a detected course code become CSV rows.
        if not (res and res[0]):
            continue
        code, dept, area, policy, multi, llm_fail = res
        row_values = (code, dept, area, policy or '', multi, llm_fail)
        results.append(dict(zip(columns, row_values)))

    csv_file = 'syllabus_summary.csv'
    with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        for row in results:
            writer.writerow(row)
    print(f"Summary CSV written to {csv_file}")


Extracted 197 distinct paragraphs. Analyzing with clustering engine...
--- Analysis processing complete. ---

Extracted 182 distinct paragraphs. Analyzing with clustering engine...
INFO: Policy block is long (422 words). Engaging LLM.

--- Handing off to Ollama model 'deepseek-r1:1.5b' to refine the policy block... ---
INFO: Attempt 1 of 3 to query LLM...
WARN: Attempt 1 failed. Normalized quote not in context (likely a summary).
INFO: Attempt 2 of 3 to query LLM...
WARN: Attempt 2 failed. Normalized quote not in context (likely a summary).
INFO: Attempt 3 of 3 to query LLM...
WARN: Attempt 3 failed. Normalized quote not in context (likely a summary).
--- Analysis processing complete. ---

Successfully extracted paragraphs using coordinate-based method.
Extracted 31 distinct paragraphs. Analyzing with clustering engine...
INFO: Clustered policy block passed quality checks.
--- Analysis processing complete. ---

Extracted 101 distinct paragraphs. Analyzing with clustering engine...
---

In [6]:
# Single-file run: analyze one PDF syllabus. The path is relative, so it is
# resolved against the current working directory. As the last expression of
# the cell, the returned result list is displayed below.
pdf_file = "0GIzUorcaUlFtsvHBQmWIgLxidFQogW1Q3jaQJlX.pdf" 
analyze_syllabus(pdf_file)


Successfully extracted paragraphs using coordinate-based method.
Extracted 46 distinct paragraphs. Analyzing with clustering engine...
INFO: Found 2 distinct AI-related clusters. Flagging for review.
INFO: Policy block is long (366 words). Engaging LLM.

--- Handing off to Ollama model 'deepseek-r1:1.5b' to refine the policy block... ---
INFO: Attempt 1 of 3 to query LLM...
WARN: Attempt 1 failed. Normalized quote not in context (likely a summary).
INFO: Attempt 2 of 3 to query LLM...
INFO: LLM refinement successful and passed verification.
--- Analysis processing complete. ---


['ENGL 1901R',
 'Unknown',
 'Unknown',
 'Dear Students, As we delve into the complexities of literary theory this semester, I want to emphasize the importance of engaging deeply with the material and developing your own critical insights. While tools like ChatGPT can provide useful information and summaries, they are not a substitute for your own analytical thinking and original interpretation. Literary theory involves nuanced arguments, diverse perspectives, and sophisticated analysis that go beyond surface-level responses. Relying too heavily on AI tools can lead to superficial understanding and prevent you from cultivating your own voice and critical skills. Instead, I encourage you to approach your studies with curiosity and rigor. Read the primary texts, engage with scholarly debates, and participate actively in class discussions. Your unique perspectives and interpretations are what will truly enrich your understanding of the material and contribute to your growth as a scholar. I

In [None]:
# Single-file run: analyze one PDF syllabus (relative path, resolved against
# the current working directory).
pdf_file = "DIoCCJg4XeH5rJ0HuN9MqZiRNenyFUxa121Il04r.pdf" 
analyze_syllabus(pdf_file)

In [None]:
# Single-file run: analyze one PDF syllabus (relative path, resolved against
# the current working directory).
pdf_file = "jnlQIFYUgZhHZIZyILiFAEr02KEuyWJYkquXUvJD.pdf" 
analyze_syllabus(pdf_file)

In [None]:
# Single-file run: analyze one PDF syllabus (relative path, resolved against
# the current working directory).
pdf_file = "PeB4sO9tTTZxAYrxncoLPKoAIOtRa1ew5spr4lCw.pdf" 
analyze_syllabus(pdf_file)

In [None]:
# Single-file run: analyze one .docx syllabus (relative path, resolved
# against the current working directory).
docx_file = "HNPK3m8hfi0bcJ1sSa2LHeMNQ3DAxjJBXkSTtomA.docx" 
analyze_syllabus(docx_file)

In [None]:
# Single-file run: analyze one .docx syllabus (relative path, resolved
# against the current working directory).
docx_file = "T8hj3wlKrzFUjYtz61tDDFxzIIW5B7MFAdgNzy4X.docx" 
analyze_syllabus(docx_file)

In [None]:
def debug_clusters_in_file(file_path):
    """
    Diagnostic helper: extract a file's paragraphs, build every AI-related
    cluster and print each one with its score, highest score first.

    Nothing is returned and the LLM refinement step is never invoked. The
    function prints a message and returns early when the file is missing,
    has an unsupported extension, yields no text, or contains no AI-related
    paragraph.
    """
    bar = '=' * 20
    print(f"\n{bar} Debugging Clusters for: {os.path.basename(file_path)} {bar}")

    # --- Step 1: load the paragraphs ---
    if not os.path.exists(file_path):
        print("Error: File not found.")
        return

    ext = os.path.splitext(file_path)[1].lower()
    readers = {
        '.doc': extract_paragraphs_from_doc,
        '.docx': extract_paragraphs_from_docx,
        '.pdf': extract_paragraphs_from_pdf,
    }
    reader = readers.get(ext)
    if reader is None:
        print(f"Error: Unsupported file type '{ext}'.")
        return

    paragraphs = reader(file_path)
    if not paragraphs:
        print("Could not extract any usable text from the file.")
        return

    print(f"Successfully extracted {len(paragraphs)} paragraphs.")

    # --- Step 2: tag AI-related paragraphs and group them into clusters ---
    triggers = []
    for idx, text in enumerate(paragraphs):
        if HEADER_PATTERN.match(text) and len(text.split()) < 15:
            triggers.append({'index': idx, 'type': 'header', 'weight': 10})
            continue
        if not AI_TRIGGER_PATTERN.search(text):
            continue
        if POLICY_PATTERN.search(text):
            triggers.append({'index': idx, 'type': 'strong_mention', 'weight': 3})
        else:
            triggers.append({'index': idx, 'type': 'weak_mention', 'weight': 1})

    if not triggers:
        print("\n--- No AI-related triggers found in this document. ---")
        return

    # Triggers at most three paragraphs apart belong to the same cluster.
    clusters = [[triggers[0]]]
    for trig in triggers[1:]:
        if trig['index'] - clusters[-1][-1]['index'] <= 3:
            clusters[-1].append(trig)
        else:
            clusters.append([trig])

    if not clusters:
        print("\n--- Could not form any clusters from the triggers. ---")
        return

    # --- Step 3: score each cluster and collect its text ---
    total = len(paragraphs)
    scored = []
    for cluster in clusters:
        positions = [t['index'] for t in cluster]
        first, last = min(positions), max(positions)

        header_inside = any(t['type'] == 'header' for t in cluster)
        has_header = header_inside or (first > 0 and HEADER_PATTERN.match(paragraphs[first - 1]))
        score = sum(t['weight'] for t in cluster) + (20 if has_header else 0)

        block_start = first - 1 if has_header and not header_inside else first
        block_end = min(total, last + 1)
        # Extend over following paragraphs until one has five words or fewer.
        while block_end < total and len(paragraphs[block_end].split()) > 5:
            block_end += 1

        scored.append({'score': score, 'text': "\n\n".join(paragraphs[block_start:block_end])})

    # --- Step 4: print the clusters, best score first ---
    print(f"\n--- Found {len(scored)} AI-related cluster(s). Details below: ---")
    ranked = sorted(scored, key=lambda entry: entry['score'], reverse=True)

    for rank, info in enumerate(ranked, 1):
        print(f"\n--- Cluster #{rank} (Score: {info['score']}) ---")
        print(info['text'])
        # Closing rule sized to the length of the heading line above.
        print("-" * (25 + len(str(rank)) + len(str(info['score']))))


# Print every AI-related cluster found in this PDF, with scores, without
# invoking the LLM refinement step.
pdf_file = "0GIzUorcaUlFtsvHBQmWIgLxidFQogW1Q3jaQJlX.pdf" 
debug_clusters_in_file(pdf_file)

In [None]:
# Print every AI-related cluster found in this PDF, with scores, without
# invoking the LLM refinement step.
pdf_file = "DIoCCJg4XeH5rJ0HuN9MqZiRNenyFUxa121Il04r.pdf" 
debug_clusters_in_file(pdf_file)

In [None]:
# Print every AI-related cluster found in this PDF, with scores, without
# invoking the LLM refinement step.
pdf_file = "jnlQIFYUgZhHZIZyILiFAEr02KEuyWJYkquXUvJD.pdf" 
debug_clusters_in_file(pdf_file)