# In [None]:  (Jupyter cell marker left over from the notebook export)
import re
import os
import sys
import csv
import time
from docx import Document
import pdfplumber
import ollama

# --- Optional MS Word automation (Windows only) ---
# `win32` ends up as the win32com.client module when it can be imported on
# Windows, and as None everywhere else; the .doc extractor checks it.
if sys.platform != 'win32':
    win32 = None
else:
    try:
        import win32com.client as win32
    except ImportError:
        print("Warning: .doc support disabled.")
        win32 = None

# ================================================================================
# PART 0: DEPARTMENT & KNOWLEDGE AREA MAPPINGS
# ================================================================================
# Department display name -> course-code prefix(es).
# A value is either a single prefix string or a list of prefix strings.
# NOTE: 'LING' is listed under both 'Cognitive, Linguistic, and Psychological
# Sciences' and 'Linguistics'.
DEPARTMENT_MAP = {
    'Africana Studies': 'AFRI',
    'American Studies': ['AMST', 'ETHN', 'NAIS', 'PHUM', 'STS'],
    'Anthropology': 'ANTH',
    'Applied Mathematics': 'APMA',
    'Archaeology and the Ancient World': 'ARCH',
    'Bio-Medical (PLME & MED)': 'MED',
    'Biology': 'BIOL',
    'Brown Arts Institute': 'ARTS',
    'Business, Entrepreneurship, Organizations': 'BEO',
    'Center for Language Studies': ['ARAB', 'EINT', 'HNDI', 'LANG', 'PRSN', 'SIGN', 'TKSH', 'YORU'],
    'Chemistry': 'CHEM',
    'Classics': ['CLAS', 'CREK', 'LATN', 'MGRK', 'SANS'],
    'Cognitive, Linguistic, and Psychological Sciences': ['CLPS', 'LING'],
    'Cognitive and Psychological Sciences': 'CPSY',
    'Cogut Institute for the Humanities': 'HMAN',
    'Comparative Literature': 'COLT',
    'Computer Science': 'CSCI',
    'Data Science Initiative': ['DATA', 'DSIO'],
    'Early Modern World': 'EMOW',
    'Earth, Environmental and Planetary Sciences': 'EEPS',
    'East Asian Studies': ['CHIN', 'EAST', 'JAPN', 'KREA', 'VIET'],
    'Economics': 'ECON',
    'Education': 'EDUC',
    'Egyptology and Assyriology': ['ASYR', 'EGYT'],
    'Engineering': 'ENGN',
    'English': 'ENGL',
    'Environmental Studies': 'ENVS',
    'French Studies': 'FREN',
    'German Studies': 'GRMN',
    'Hispanic Studies': 'HISP',
    'History': 'HIST',
    'History of Art and Architecture': 'HIAA',
    'Italian Studies': 'ITAL',
    'Judaic Studies': ['HEBR', 'JUDS'],
    'Linguistics': 'LING',
    'Literary Arts': 'LITR',
    'Mathematics': 'MATH',
    'Medieval Studies': 'MDVL',
    'Middle East Studies': 'MES',
    'Modern Culture and Media': 'MCM',
    'Music': 'MUSC',
    'Neuroscience': 'NEUR',
    'Pembroke Center Teach and Rearch': 'GNSS',
    'Philosophy': 'PHIL',
    'Physics': 'PHYS',
    'Political Science': 'POLS',
    'Portuguese and Brazilian Studies': 'POBS',
    'Public Health': ['BHDS', 'GPHP', 'HCL', 'PHP'],
    'Religious Studies': ['COST', 'RELS'],
    'Slavic Studies': ['CZCH','PLSH','RUSS', 'SLAV'],
    'Sociology': 'SOC',
    'Theatre Arts and Performance Studies': 'TAPS',
    'Urban Studies': 'URBN',
    'Visual Art': 'VISA',
    'Watson Institute': ['IAPA', 'MPA'],
}
# Reverse lookup: course-code prefix -> department name.
# DEPARTMENT_MAP values may be a single prefix or a list of prefixes, so a
# lone string is wrapped in a one-element list before iterating. A prefix
# listed under several departments keeps the department that comes last.
CODE_TO_DEPT = {}
for dept, codes in DEPARTMENT_MAP.items():
    for c in (codes if isinstance(codes, list) else [codes]):
        CODE_TO_DEPT[c] = dept
# map each department to a knowledge area

# Department display name -> broad knowledge area. Keys are the same
# department names used as keys of DEPARTMENT_MAP, so every department that
# can be resolved from a course prefix also resolves to a knowledge area.
KNOWLEDGE_AREA_MAP = {
    'Africana Studies': 'Social Sciences',
    'American Studies': 'Social Sciences',
    'Anthropology': 'Social Sciences',
    'Applied Mathematics': 'Physical Sciences',
    'Archaeology and the Ancient World': 'Humanities',
    'Bio-Medical (PLME & MED)': 'Life Sciences',
    'Biology': 'Life Sciences',
    'Brown Arts Institute': 'Arts',
    'Business, Entrepreneurship, Organizations': 'Social Sciences',
    'Center for Language Studies': 'Humanities',
    'Chemistry': 'Physical Sciences',
    'Classics': 'Humanities',
    # Previously missing: CLPS courses resolved to this department and then
    # fell through to 'Unknown'. Classified like 'Cognitive and Psychological
    # Sciences' below -- NOTE(review): confirm the intended area.
    'Cognitive, Linguistic, and Psychological Sciences': 'Life Sciences',
    'Cognitive and Psychological Sciences': 'Life Sciences',
    'Cogut Institute for the Humanities': 'Humanities',
    'Comparative Literature': 'Humanities',
    'Computer Science': 'Physical Sciences',
    'Data Science Initiative': 'Physical Sciences',
    'Early Modern World': 'Humanities',
    'Earth, Environmental and Planetary Sciences': 'Physical Sciences',
    'East Asian Studies': 'Humanities',
    'Economics': 'Social Sciences',
    'Education': 'Social Sciences',
    'Egyptology and Assyriology': 'Humanities',
    'Engineering': 'Physical Sciences',
    'English': 'Humanities',
    'Environmental Studies': 'Physical Sciences',
    'French Studies': 'Humanities',
    'German Studies': 'Humanities',
    'Hispanic Studies': 'Humanities',
    'History': 'Social Sciences',
    'History of Art and Architecture': 'Humanities',
    'Italian Studies': 'Humanities',
    'Judaic Studies': 'Humanities',
    'Linguistics': 'Social Sciences',
    'Literary Arts': 'Humanities',
    'Mathematics': 'Physical Sciences',
    'Medieval Studies': 'Humanities',
    'Middle East Studies': 'Humanities',
    'Modern Culture and Media': 'Humanities',
    'Music': 'Humanities',
    'Neuroscience': 'Life Sciences',
    'Pembroke Center Teach and Rearch': 'Humanities',
    'Philosophy': 'Humanities',
    'Physics': 'Physical Sciences',
    'Political Science': 'Social Sciences',
    'Portuguese and Brazilian Studies': 'Humanities',
    'Public Health': 'Life Sciences',
    'Religious Studies': 'Humanities',
    'Slavic Studies': 'Humanities',
    'Sociology': 'Social Sciences',
    'Theatre Arts and Performance Studies': 'Humanities',
    'Urban Studies': 'Social Sciences',
    'Visual Art': 'Humanities',
    'Watson Institute': 'Social Sciences',
}

# ==============================================================================
# PART 1: DEFINITIONS
# ==============================================================================
# Words/phrases whose presence marks a paragraph as talking about AI.
# The order is kept as-is because the list is joined into a regex alternation.
AI_CONTEXT_WORDS = [
    'ai',
    'artificial intelligence',
    'generative',
    'chatgpt',
    'llm',
    'copilot',
    'bard',
    'large language model',
    'gemini',
    'dall-e',
    'gpt',
    'midjourney',
    'stable diffusion',
    'ai tool',
    'ai tools',
]

# Words/phrases that suggest a paragraph states a rule or policy.
# The order is kept as-is because the list is joined into a regex alternation.
POLICY_KEYWORDS = [
    'academic integrity',
    'academic dishonesty',
    'plagiarism',
    'cheating',
    'unauthorized',
    'unauthorized use',
    'unauthorized assistance',
    'citation',
    'cite',
    'attribution',
    'acknowledge',
    'permitted',
    'allowed',
    'prohibited',
    'forbidden',
    'disclosure',
    'ethical use',
    'responsible use',
    'use',
    'using',
    'assistance',
    'help',
    'guidance',
    'policy',
    'rule',
]
# Spelled-out subject name -> course-code prefix. The course-code finders use
# it to turn text such as "<subject name> 0150" into "<prefix> 0150".
# Unlike DEPARTMENT_MAP, every value here is a single prefix string.
COURSE_CODE_MAP = {
    'Africana Studies': 'AFRI',
    'American Sign Language': 'SIGN',
    'American Studies': 'AMST',
    'Anthropology': 'ANTH',
    'Applied Mathematics': 'APMA',
    'Arabic': 'ARAB',
    'Archaeology and the Ancient World': 'ARCH',
    'Assyriology': 'ASYR',
    'Behavioral and Social Health Sciences': 'BHDS',
    'Biology': 'BIOL',
    'Bio-Medical (PLME & MED)': 'MED',
    'Brown Arts Institute': 'ARTS',
    'Business, Entrepreneurship, Organizations': 'BEO',
    'Chemistry': 'CHEM',
    'Chinese': 'CHIN',
    'Classical Greek': 'CREK',
    'Classics': 'CLAS',
    'Cognitive and Psychological Sciences': 'CPSY',
    'Cognitive, Linguistic, and Psychological Sciences': 'CLPS',
    'Cogut Institute for the Humanities': 'HMAN',
    'Comparative Literature': 'COLT',
    'Computer Science': 'CSCI',
    'Contemplative Studies': 'COST', 
    'Czech': 'CZCH',
    'Data Science': 'DATA',
    'Early Modern World': 'EMOW',
    'Earth, Environmental and Planetary Sciences': 'EEPS',
    'East Asian Studies': 'EAST',
    'Economics': 'ECON', 
    'Education': 'EDUC',
    'Egyptology': 'EGYT',
    'Engineering': 'ENGN',
    'English': 'ENGL',
    'English for International Students': 'EINT', 
    'Environmental Studies': 'ENVS',
    'Ethnic Studies': 'ETHN',
    'French': 'FREN',
    'Gender and Sexuality Studies': 'GNSS', 
    'German': 'GRMN',
    'Global Public Health': 'GPHP',
    'Health Care Leadership': 'HCL', 
    'Hebrew': 'HEBR',
    'Hindi': 'HNDI',
    'Hispanic Studies': 'HISP',
    'History': 'HIST',
    'History of Art and Architecture': 'HIAA',
    'International and Public Affairs': 'IAPA', 
    'Italian': 'ITAL',
    'Japanese': 'JAPN',
    'Judaic Studies': 'JUDS',
    'Korean': 'KREA',
    'Language Studies': 'LANG', 
    'Latin': 'LATN',
    'Linguistics': 'LING',
    'Literary Arts': 'LITR',
    'Master of Public Affairs': 'MPA', 
    'Mathematics': 'MATH',
    'Medieval Studies': 'MDVL',
    'Middle East Studies': 'MES',
    'Modern Culture and Media': 'MCM',
    'Modern Greek': 'MGRK',
    'Music': 'MUSC',
    'Native American and Indigenous Studies': 'NAIS',
    'Neuroscience': 'NEUR',
    'Persian': 'PRSN',
    'Philosophy': 'PHIL',
    'Physics': 'PHYS',
    'Polish': 'PLSH',
    'Political Science': 'POLS',
    'Portuguese and Brazilian Studies': 'POBS',
    'Public Health': 'PHP',
    'Public Humanities': 'PHUM',
    'Religious Studies': 'RELS',
    'Russian': 'RUSS',
    'Sanskrit': 'SANS',
    'Science, Technology, and Society': 'STS',
    'Slavic Studies': 'SLAV',
    'Sociology': 'SOC',
    'Theatre Arts and Performance Studies': 'TAPS',
    'Turkish': 'TKSH',
    'Urban Studies': 'URBN',
    'Vietnamese': 'VIET',
    'Visual Art': 'VISA',
    'Yoruba': 'YORU'
}

# Matches a line that mentions an AI term and the word "policy" in either
# order ("AI ... policy" or "policy ... AI"), case-insensitively.
HEADER_PATTERN = re.compile(
    r'.*\b((ai|artificial\sintelligence|generative\s+ai)\s+.*\bpolicy|policy\s+.*\b(ai|artificial\sintelligence|generative\s+ai))\b.*',
    flags=re.IGNORECASE,
)

# Whole-word, case-insensitive alternations built from the two keyword lists.
AI_TRIGGER_PATTERN = re.compile(r'\b({})\b'.format('|'.join(AI_CONTEXT_WORDS)), flags=re.IGNORECASE)
POLICY_PATTERN = re.compile(r'\b({})\b'.format('|'.join(POLICY_KEYWORDS)), flags=re.IGNORECASE)

# ==============================================================================
# PART 2: TEXT EXTRACTION 
# ==============================================================================

def extract_paragraphs_from_doc(doc_path, cleanup_wait=5):
    """
    Extract the non-empty paragraphs of a legacy .doc file by driving MS Word.

    Requires the module-level `win32` (win32com.client) to be available; when
    it is not, the file is skipped and an empty list is returned.

    Args:
        doc_path: Path to the .doc file; it is made absolute before Word opens it.
        cleanup_wait: Seconds to sleep after Word has been asked to quit, so the
            Word process can exit before the next file is handled. Increase it
            if consecutive .doc files fail.

    Returns:
        list[str]: Stripped paragraph texts, or [] on any failure.
    """
    if not win32:
        print(f"Skipping .doc file '{os.path.basename(doc_path)}' as 'pywin32' is not available on this system.")
        return []

    word = None
    doc = None

    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(doc_path))
        # Read each paragraph's text once instead of twice (filter + value).
        return [text for p in doc.Paragraphs if (text := p.Range.Text.strip())]

    except Exception as e:
        print(f"Error processing .doc file with MS Word: {e}")
        return []

    finally:
        # Compare against None explicitly rather than relying on the
        # truthiness of the automation objects.
        if doc is not None:
            try:
                doc.Close(False)
            except Exception as e_close:
                print(f"  - Warning: Failed to close document object: {e_close}")

        if word is not None:
            try:
                word.Quit()
            except Exception as e_quit:
                print(f"  - Warning: Failed to quit Word application: {e_quit}")
            # Only pause when a Word instance was actually started.
            wait_time = cleanup_wait
            print(f"  - Pausing for {wait_time} seconds to ensure Word process cleanup...")
            time.sleep(wait_time)
        
def extract_paragraphs_from_docx(doc_path):
    """
    Extract text blocks from a .docx file: body paragraphs first, then table cells.

    Args:
        doc_path: Path to the .docx file.

    Returns:
        list[str]: Non-blank paragraph texts followed by non-blank table-cell
        texts, or [] if the file cannot be read.
    """
    try:
        doc = Document(doc_path)
        blocks = [p.text for p in doc.paragraphs if p.text.strip()]

        # A merged cell is understood to be reported by python-docx once per
        # grid position it spans (confirm against the library docs), which
        # would append the same text several times. De-duplicate on the
        # underlying cell element; fall back to the cell object itself if the
        # private `_tc` attribute is not present.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_key = getattr(cell, '_tc', cell)
                    if cell_key in seen_cells:
                        continue
                    seen_cells.add(cell_key)
                    cell_text = cell.text
                    if cell_text.strip():
                        blocks.append(cell_text)
        return blocks
    except Exception as e:
        print(f"Error reading DOCX file {doc_path}: {e}")
        return []

def _reconstruct_paragraphs_from_page(page):
    words = page.extract_words(keep_blank_chars=False, x_tolerance=2)
    if not words: return []
    
    lines = {}
    for word in words:
        line_top = round(word['top'], 2)
        if line_top not in lines:
            lines[line_top] = []
        lines[line_top].append(word)

    for line_top in lines:
        lines[line_top].sort(key=lambda w: w['x0'])
        
    sorted_lines = sorted(lines.items(), key=lambda item: item[0])
    
    reconstructed_lines = []
    line_heights = []
    last_top = None
    for top, words_in_line in sorted_lines:
        text = " ".join(w['text'] for w in words_in_line)
        reconstructed_lines.append({'top': top, 'text': text})
        if last_top is not None:
            line_heights.append(top - last_top)
        last_top = top

    if not reconstructed_lines: return []

    avg_line_height = sum(line_heights) / len(line_heights) if line_heights else 12
    paragraph_break_threshold = avg_line_height * 1.5
    
    page_paragraphs = []
    current_paragraph = reconstructed_lines[0]['text']
    for i in range(1, len(reconstructed_lines)):
        prev_line, curr_line = reconstructed_lines[i-1], reconstructed_lines[i]
        if (curr_line['top'] - prev_line['top']) > paragraph_break_threshold:
            page_paragraphs.append(current_paragraph)
            current_paragraph = curr_line['text']
        else:
            current_paragraph += " " + curr_line['text']
    page_paragraphs.append(current_paragraph)
    
    return page_paragraphs

def extract_paragraphs_from_pdf(pdf_path):
    """
    Extract paragraphs from a PDF, stitching paragraphs that span a page break.

    Each page is split into paragraphs by `_reconstruct_paragraphs_from_page`.
    If the last paragraph of a page does not end in closing punctuation it is
    held back and prepended to the first paragraph of the next non-empty page.

    Returns the list of paragraphs, or [] if anything raises while parsing.
    """
    closing_marks = ('.', '?', '!', '"', "'", ')', ':', ';')
    collected = []
    pending = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_paragraphs = _reconstruct_paragraphs_from_page(page)
                if not page_paragraphs:
                    continue

                # Glue the unfinished tail of the previous page onto this page's start.
                if pending:
                    page_paragraphs[0] = pending + " " + page_paragraphs[0]
                    pending = ""

                # Hold back a tail that looks unfinished.
                if not page_paragraphs[-1].strip().endswith(closing_marks):
                    pending = page_paragraphs.pop()

                collected.extend(page_paragraphs)

        if pending:
            collected.append(pending)
        print("Successfully extracted paragraphs using coordinate-based method.")
        return collected
    except Exception as e:
        print(f"Coordinate-based parsing failed: {e}. No OCR fallback implemented.")
        return []

# ==============================================================================
# PART 3: ANALYSIS LOGIC
# ==============================================================================

def find_course_code_rule_based(paragraphs, search_limit=30):
    """
    Find a course code in the first `search_limit` paragraphs using tiered rules.

    Tiers, tried in order:
      1. A spelled-out subject name from COURSE_CODE_MAP followed by a number
         (subjects are tried in map order, each against all paragraphs).
      2. Per paragraph, the earliest of a compound code ("ABCD1234/EFGH5678")
         or an ALL-CAPS prefix followed by a number.
      3. A Mixed-case prefix immediately followed by a number ("Abcd1234").

    Args:
        paragraphs: List of paragraph strings.
        search_limit: Number of leading paragraphs to inspect.

    Returns:
        str | None: Code formatted as "PREFIX NUMBER" (slash-joined for
        cross-listings), or None when nothing matches.
    """
    head = paragraphs[:search_limit]

    # Tier 1: "<long subject name> <number>".
    for long_name, short_code in COURSE_CODE_MAP.items():
        pattern = re.compile(
            r'\b' + re.escape(long_name) +
            r'\s+' +
            r'(\d{3,4}[a-zA-Z]?)\b',
            re.IGNORECASE
        )
        for para in head:
            if match := pattern.search(para):
                return f"{short_code} {match.group(1).upper()}"

    EXCLUDE_TERMS = ['FALL', 'SPRING', 'WINTER', 'SUMMER', 'ROOM', 'TO', 'AT', 'THE']
    exclude_pattern = '|'.join(EXCLUDE_TERMS)

    num_pattern_ext = r'(\d{3,4}[a-zA-Z]?(?:\s*\/\s*\d{3,4}[a-zA-Z]?)?)'

    all_caps_pattern = re.compile(
        r'\b' +
        # The alternation must be grouped so the trailing \b applies to every
        # excluded term; ungrouped, it bound only to the last one and any
        # prefix merely *starting* with FALL/ROOM/TO/AT/... was rejected.
        r'(?!(?:' + exclude_pattern + r')\b)' +
        r'([A-Z]{2,4}(?:\s*\/\s*[A-Z]{2,4})*)' +
        r'\s*-?\s*' +
        num_pattern_ext + r'\b'
    )
    compound_pattern = re.compile(
        r'\b(([A-Z]{2,4}\d{3,4}[a-zA-Z]?)(?:\s*/\s*[A-Z]{2,4}\d{3,4}[a-zA-Z]?)+)\b'
    )
    mixed_case_pattern = re.compile(
        r'\b([A-Z][a-z]{1,3})' +
        num_pattern_ext + r'\b'
    )

    # Tier 2: compound / ALL-CAPS codes; the earliest hit in a paragraph wins.
    for para in head:
        found_high_priority = []

        compound_match = compound_pattern.search(para)
        caps_match = all_caps_pattern.search(para)

        if compound_match:
            raw_code = compound_match.group(1).upper()
            parts = [p.strip() for p in raw_code.split('/')]
            # Insert a space between prefix and number; `count` is passed by
            # keyword because the positional form is deprecated.
            formatted_parts = [re.sub(r'([A-Z]+)(\d)', r'\1 \2', p, count=1) for p in parts]
            found_high_priority.append({'start': compound_match.start(), 'code': '/'.join(formatted_parts)})

        # Skip the caps match when it is just the first half of the compound match.
        if caps_match and (not compound_match or caps_match.start() != compound_match.start()):
            dept = re.sub(r'\s*\/\s*', '/', caps_match.group(1))
            if dept.upper() not in EXCLUDE_TERMS:
                num = re.sub(r'\s*\/\s*', '/', caps_match.group(2).upper())
                found_high_priority.append({'start': caps_match.start(), 'code': f"{dept} {num}"})

        if found_high_priority:
            earliest_match = min(found_high_priority, key=lambda x: x['start'])
            return earliest_match['code']

    # Tier 3: Mixed-case prefix glued to the number.
    for para in head:
        if match := mixed_case_pattern.search(para):
            dept = match.group(1).upper()
            num = re.sub(r'\s*\/\s*', '/', match.group(2).upper())
            return f"{dept} {num}"

    return None

def find_course_code_position_based(paragraphs, search_limit=30):
    """
    Find a course code by position: the earliest match in the first matching paragraph.

    All pattern families (compound, spelled-out subject name, ALL-CAPS,
    Mixed-case) are run over each of the first `search_limit` paragraphs.
    Matches strictly contained in a longer match are dropped, and of the
    remainder the one that starts earliest is formatted and returned.

    Args:
        paragraphs: List of paragraph strings.
        search_limit: Number of leading paragraphs to inspect.

    Returns:
        str | None: Code formatted as "PREFIX NUMBER" (slash-joined for
        cross-listings), or None when nothing matches.
    """
    patterns = []

    patterns.append({
        'name': 'compound',
        'pattern': re.compile(r'\b(([A-Z]{2,4}\d{3,4}[a-zA-Z]?)(?:\s*/\s*[A-Z]{2,4}\d{3,4}[a-zA-Z]?)+)\b')
    })

    for long_name, short_code in COURSE_CODE_MAP.items():
        patterns.append({
            'name': 'long_name',
            'pattern': re.compile(r'\b' + re.escape(long_name) + r'\s+(\d{3,4}[a-zA-Z]?)\b', re.IGNORECASE),
            'short_code': short_code
        })

    EXCLUDE_TERMS = ['FALL', 'SPRING', 'WINTER', 'SUMMER', 'ROOM', 'TO', 'AT', 'THE']
    exclude_pattern_str = '|'.join(EXCLUDE_TERMS)
    num_pattern_ext = r'(\d{3,4}[a-zA-Z]?(?:\s*\/\s*\d{3,4}[a-zA-Z]?)?)'

    patterns.append({
        'name': 'all_caps',
        'pattern': re.compile(
            # The alternation must be grouped so the trailing \b applies to
            # every excluded term; ungrouped, it bound only to the last one
            # and any prefix merely *starting* with FALL/ROOM/TO/AT/... was
            # rejected.
            r'\b(?!(?:' + exclude_pattern_str + r')\b)'
            r'([A-Z]{2,4}(?:\s*\/\s*[A-Z]{2,4})*)'
            r'\s*-?\s*' + num_pattern_ext + r'\b'
        )
    })

    patterns.append({
        'name': 'mixed_case',
        'pattern': re.compile(r'\b([A-Z][a-z]{1,3})' + num_pattern_ext + r'\b')
    })

    for para in paragraphs[:search_limit]:
        # Collect every match of every pattern in this paragraph.
        found_matches = []
        for p_info in patterns:
            for match in p_info['pattern'].finditer(para):
                found_matches.append({
                    'start': match.start(), 'end': match.end(),
                    'pattern_name': p_info['name'], 'match_obj': match,
                    'short_code': p_info.get('short_code')
                })

        if not found_matches:
            continue

        # Drop matches that lie inside a strictly longer match.
        final_candidates = []
        for i, m1 in enumerate(found_matches):
            is_submatch = False
            for j, m2 in enumerate(found_matches):
                if i == j:
                    continue
                if m2['start'] <= m1['start'] and m2['end'] >= m1['end'] and (m2['end']-m2['start'] > m1['end']-m1['start']):
                    is_submatch = True
                    break
            if not is_submatch:
                final_candidates.append(m1)

        if not final_candidates:
            continue

        earliest_match_info = min(final_candidates, key=lambda x: x['start'])
        match = earliest_match_info['match_obj']
        name = earliest_match_info['pattern_name']

        if name == 'compound':
            raw_code = match.group(1).upper()
            parts = [p.strip() for p in raw_code.split('/')]
            # `count` is passed by keyword because the positional form is deprecated.
            formatted_parts = [re.sub(r'([A-Z]+)(\d)', r'\1 \2', p, count=1) for p in parts]
            return '/'.join(formatted_parts)
        elif name == 'long_name':
            return f"{earliest_match_info['short_code']} {match.group(1).upper()}"
        elif name == 'all_caps':
            dept = re.sub(r'\s*\/\s*', '/', match.group(1))
            num = re.sub(r'\s*\/\s*', '/', match.group(2).upper())
            return f"{dept} {num}"
        elif name == 'mixed_case':
            dept = match.group(1).upper()
            num = re.sub(r'\s*\/\s*', '/', match.group(2).upper())
            return f"{dept} {num}"

    return None



def _normalize_for_comparison(text: str) -> str:
    """
    Converts text to a 'canonical' form for robust comparison by lowercasing
    and removing all non-alphanumeric characters.
    """
    return "".join(char.lower() for char in text if char.isalnum())

def refine_policy_with_ollama(context_block, model_name="deepseek-r1:14b"):
    """
    Asks the LLM to extract the core AI policy from `context_block`.

    One prompt per attempt is sent, each more insistent than the last. An
    answer is accepted only if it contains a non-empty quote between the two
    separators and that quote, after `_normalize_for_comparison`, is a
    substring of the normalized context (i.e. it is a verbatim quote rather
    than a summary).

    Args:
        context_block: Candidate policy text taken from the syllabus.
        model_name: Name of the Ollama model to query.

    Returns:
        tuple: (result, flags)
            result: [{'text': ..., 'reason': ...}] on success, else [].
            flags:  [] on success, ['LLM_FINAL_FAILURE'] when every attempt
                    was rejected, ['LLM_CONNECTION_ERROR'] when the call raised.
    """
    print(f"\n--- Handing off to Ollama model '{model_name}' to refine the policy block... ---")
    
    SEPARATOR_START = "[---POLICY_TEXT_START---]"
    SEPARATOR_END = "[---POLICY_TEXT_END---]"

    prompts = [
        f"""Analyze the syllabus section to find the policy on "Artificial Intelligence".
Instructions:
1. Provide a brief, one-sentence explanation.
2. Output the start separator: {SEPARATOR_START}
3. Quote the complete AI policy verbatim.
4. Output the end separator: {SEPARATOR_END}
If no policy is found, respond ONLY with "None".
Syllabus Section:\n---\n{context_block}\n---""",

        f"""RE-EVALUATION: The previous analysis was likely wrong. The text IS KNOWN to contain a policy on "Artificial Intelligence". Locate and extract it.
Instructions:
1. Explain your corrected finding in one sentence.
2. Output the start separator: {SEPARATOR_START}
3. Quote the policy verbatim. DO NOT summarize.
4. Output the end separator: {SEPARATOR_END}
Do not respond "None". Find the policy.
Syllabus Section:\n---\n{context_block}\n---""",

        f"""FINAL ATTEMPT: You MUST extract the policy about "Artificial Intelligence". It is there. Your task is to EXTRACT it, not to decide if it exists.
Instructions:
1. Find the rules for using AI.
2. Output: {SEPARATOR_START}
3. Copy the paragraph(s) exactly.
4. Output: {SEPARATOR_END}
Extract it now.
Syllabus Section:\n---\n{context_block}\n---"""
    ]

    # One attempt per prompt, so the retry count can never run past the list.
    MAX_RETRIES = len(prompts)

    # Reasoning models (the default deepseek-r1 among them) may put their
    # chain of thought in <think>...</think> inside the reply. That text would
    # defeat the exact "NONE" check and may itself mention the separators, so
    # it is removed before the reply is parsed.
    think_block = re.compile(r'<think>.*?</think>', re.DOTALL | re.IGNORECASE)

    normalized_context = _normalize_for_comparison(context_block)

    try:
        for attempt, current_prompt in enumerate(prompts, start=1):
            print(f"INFO: Attempt {attempt} of {MAX_RETRIES} to query LLM...")

            response = ollama.chat(model=model_name, messages=[{'role': 'user', 'content': current_prompt}])
            llm_response = think_block.sub('', response['message']['content']).strip()

            # Work out why this attempt is rejected (None means accepted).
            policy_text = ""
            if llm_response.upper() == "NONE":
                failure = "LLM responded 'None'."
            elif SEPARATOR_START not in llm_response or SEPARATOR_END not in llm_response:
                failure = "LLM did not use required separators."
            else:
                after_start = llm_response.split(SEPARATOR_START, 1)[1]
                policy_text = after_start.split(SEPARATOR_END, 1)[0].strip()
                if not policy_text:
                    failure = "LLM returned an empty policy."
                elif _normalize_for_comparison(policy_text) not in normalized_context:
                    failure = "Normalized quote not in context (likely a summary)."
                else:
                    failure = None

            if failure is None:
                print("INFO: LLM refinement successful and passed verification.")
                result = [{'text': policy_text, 'reason': f'Refined by {model_name} (Attempt {attempt})'}]
                return (result, [])

            print(f"WARN: Attempt {attempt} failed. {failure}")
            # Brief pause before the next, more insistent prompt.
            if attempt < MAX_RETRIES:
                time.sleep(1)

        return ([], ['LLM_FINAL_FAILURE'])

    except Exception as e:
        print(f"--- ERROR: Could not connect to Ollama: {e} ---")
        return ([], ['LLM_CONNECTION_ERROR'])

def analyze_policy_with_clustering(paragraphs):
    """
    Locate the most policy-like group of AI-related paragraphs.

    Paragraphs are tagged as header / strong mention / weak mention, tagged
    paragraphs at most 3 positions apart are grouped, and each group is scored
    by summed weights plus a header bonus. Equal scores are separated by the
    share of policy keywords in the group's text. The winning group is then
    extended downwards while the following paragraphs have more than 5 words.

    Returns:
        tuple: (result, flags)
            result: [{'text': ..., 'reason': ...}] or [].
            flags:  contains 'MULTIPLE_CLUSTERS' when more than one group exists.
    """
    # --- Step 1: tag paragraphs ---
    hits = []
    for position, paragraph in enumerate(paragraphs):
        if HEADER_PATTERN.match(paragraph) and len(paragraph.split()) < 15:
            hits.append({'index': position, 'type': 'header', 'weight': 10})
            continue
        if not AI_TRIGGER_PATTERN.search(paragraph):
            continue
        if POLICY_PATTERN.search(paragraph):
            hits.append({'index': position, 'type': 'strong_mention', 'weight': 3})
        else:
            hits.append({'index': position, 'type': 'weak_mention', 'weight': 1})

    if not hits:
        return ([], [])

    # --- Step 2: group hits that sit close together ---
    groups = [[hits[0]]]
    for hit in hits[1:]:
        if hit['index'] - groups[-1][-1]['index'] <= 3:
            groups[-1].append(hit)
        else:
            groups.append([hit])

    flags = []
    if len(groups) > 1:
        print(f"INFO: Found {len(groups)} distinct AI-related clusters. Flagging for review.")
        flags.append('MULTIPLE_CLUSTERS')

    # --- Step 3: score every group, keep the best ---
    total = len(paragraphs)
    best = {'score': -1, 'policy_density': -1, 'block': []}

    for group in groups:
        positions = [hit['index'] for hit in group]
        first, last = min(positions), max(positions)

        # A header counts whether it is part of the group or sits directly above it.
        header_inside = any(hit['type'] == 'header' for hit in group)
        header_above = first > 0 and HEADER_PATTERN.match(paragraphs[first - 1])
        has_header = header_inside or header_above

        score = sum(hit['weight'] for hit in group)
        if has_header:
            score += 20

        # Text span used for the tie-breaker; pulls in a header found just above.
        span_start = first - 1 if (header_above and not header_inside) else first
        span_end = min(total, last + 1)
        span_text = "\n\n".join(paragraphs[span_start:span_end])
        keyword_hits = len(POLICY_PATTERN.findall(span_text))
        policy_density = keyword_hits / (len(span_text.split()) + 1e-6)  # epsilon guards against division by zero

        if score > best['score']:
            pass
        elif score == best['score'] and policy_density > best['policy_density']:
            print(f"INFO: Tie-breaker activated. New cluster with density {policy_density:.4f} is better than previous {best['policy_density']:.4f}.")
        else:
            continue

        # Extend downwards until a short paragraph (5 words or fewer) or the end.
        stop = span_end
        while stop < total and len(paragraphs[stop].split()) > 5:
            stop += 1

        best = {
            'score': score,
            'policy_density': policy_density,
            'block': paragraphs[span_start:stop],
        }

    if not best['block']:
        return ([], flags)

    reason_str = (
        f"Clustered Policy Block (Score: {best['score']}, "
        f"Density: {best['policy_density']:.4f})"
    )
    return ([{'text': "\n\n".join(best['block']), 'reason': reason_str}], flags)

# ==============================================================================
# PART 4: MAIN CONTROLLER 
# ==============================================================================
def analyze_syllabus(file_path, rule_based=True):
    """
    Analyzes a syllabus and returns a structured list with the result and binary flags.

    Args:
        file_path: Path to the syllabus. The lower-cased extension selects the
            paragraph extractor; only '.doc', '.docx' and '.pdf' are handled.
        rule_based: When true (the default) the course code is looked up with
            find_course_code_rule_based, otherwise with
            find_course_code_position_based.

    Returns:
        list: [course_code, department, knowledge_area, policy_text,
               flag_multiple_clusters, flag_llm_failure]

        - department / knowledge_area fall back to 'Unknown' when no course
          code is found or its prefix is not in the lookup tables.
        - policy_text is None when no policy block was found, or when the LLM
          refinement reported a failure or returned no sections.
        - flag_multiple_clusters is 1 when the clustering step reports
          'MULTIPLE_CLUSTERS', else 0.
        - flag_llm_failure is 1 when the LLM refinement returns any flags, else 0.

        For an unsupported extension, or when no paragraphs could be
        extracted, the result is [None, 'Unknown', 'Unknown', None, 0, 0].
    """
    print(f"\n{'='*20} Analyzing Syllabus: {os.path.basename(file_path)} {'='*20}")

    # --- 1. Extract paragraphs with the extractor matching the file type ---
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.doc':
        paragraphs = extract_paragraphs_from_doc(file_path)
    elif ext == '.docx':
        paragraphs = extract_paragraphs_from_docx(file_path)
    elif ext == '.pdf':
        paragraphs = extract_paragraphs_from_pdf(file_path)
    else:
        print(f"Error: Unsupported file type '{ext}'.")
        return [None, 'Unknown', 'Unknown', None, 0, 0]

    if not paragraphs:
        print("Could not extract any usable text.")
        return [None, 'Unknown', 'Unknown', None, 0, 0]

    print(f"Extracted {len(paragraphs)} distinct paragraphs. Analyzing with clustering engine...")

    # --- 2. Course code -> department -> knowledge area ---
    if rule_based:
        course_code = find_course_code_rule_based(paragraphs)
    else:
        course_code = find_course_code_position_based(paragraphs)

    department = 'Unknown'
    knowledge_area = 'Unknown'
    # split() yields [] for a whitespace-only code; guard so that case falls
    # back to 'Unknown' instead of raising IndexError.
    code_parts = course_code.split() if course_code else []
    if code_parts:
        department = CODE_TO_DEPT.get(code_parts[0], 'Unknown')
        knowledge_area = KNOWLEDGE_AREA_MAP.get(department, 'Unknown')

    # --- 3. Locate the policy block ---
    ai_policy_sections, cluster_flags = analyze_policy_with_clustering(paragraphs)
    flag_multiple_clusters = 1 if 'MULTIPLE_CLUSTERS' in cluster_flags else 0
    flag_llm_failure = 0

    policy_text = None
    if ai_policy_sections:
        policy_block = ai_policy_sections[0]['text']
        max_word_count = 150
        word_count = len(policy_block.split())

        if word_count > max_word_count:
            # Long blocks are handed to the LLM to be narrowed down.
            print(f"INFO: Policy block is long ({word_count} words). Engaging LLM.")
            refined_sections, llm_flags = refine_policy_with_ollama(policy_block)

            if llm_flags:
                # A non-empty flag list signals an LLM failure; leave policy_text as None.
                flag_llm_failure = 1
            elif refined_sections:
                policy_text = refined_sections[0]['text']
        else:
            print("INFO: Clustered policy block passed quality checks.")
            policy_text = policy_block

    print("--- Analysis processing complete. ---")
    return [course_code, department, knowledge_area, policy_text, flag_multiple_clusters, flag_llm_failure]

In [None]:
if __name__ == '__main__':
    # Batch run writing syllabus_summary_1.csv: analyze every entry of
    # 'AY 2023-2024/part 1' .. 'part 15' with the default (rule-based)
    # course-code finder and write one summary CSV per part folder.
    base_dir = 'AY 2023-2024'
    fieldnames = [
        'File', 'Course Code', 'Department', 'Knowledge Area',
        'AI Policy', 'Multiple Clusters', 'LLM Failure'
    ]
    for i in range(1, 16):
        folder_name = f'part {i}'
        # Named part_dir (not `dir`) so the builtin dir() is not shadowed.
        part_dir = os.path.join(base_dir, folder_name)
        results = []
        # os.listdir returns entries in arbitrary order; sort them so the CSV
        # rows come out in the same order on every run.
        for fname in sorted(os.listdir(part_dir)):
            path = os.path.join(part_dir, fname)
            code, dept, area, policy, multi, llm_fail = analyze_syllabus(path)
            results.append({
                'File': fname,
                'Course Code': code,
                'Department': dept,
                'Knowledge Area': area,
                'AI Policy': policy or '',  # None becomes an empty cell
                'Multiple Clusters': multi,
                'LLM Failure': llm_fail
            })

        output_dir = f'Policy text/{folder_name}'
        os.makedirs(output_dir, exist_ok=True)
        csv_file = f'{output_dir}/syllabus_summary_1.csv'
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
        print(f"Summary CSV written to {csv_file}")

In [None]:
if __name__ == '__main__':
    # Batch run writing syllabus_summary_2.csv: analyze every entry of
    # 'AY 2023-2024/part 1' .. 'part 15' with the position-based course-code
    # finder (rule_based=False) and write one summary CSV per part folder.
    base_dir = 'AY 2023-2024'
    fieldnames = [
        'File', 'Course Code', 'Department', 'Knowledge Area',
        'AI Policy', 'Multiple Clusters', 'LLM Failure'
    ]
    for i in range(1, 16):
        folder_name = f'part {i}'
        # Named part_dir (not `dir`) so the builtin dir() is not shadowed.
        part_dir = os.path.join(base_dir, folder_name)
        results = []
        # os.listdir returns entries in arbitrary order; sort them so the CSV
        # rows come out in the same order on every run.
        for fname in sorted(os.listdir(part_dir)):
            path = os.path.join(part_dir, fname)
            # this line is different from other 4 attempts
            code, dept, area, policy, multi, llm_fail = analyze_syllabus(path, rule_based=False)
            results.append({
                'File': fname,
                'Course Code': code,
                'Department': dept,
                'Knowledge Area': area,
                'AI Policy': policy or '',  # None becomes an empty cell
                'Multiple Clusters': multi,
                'LLM Failure': llm_fail
            })

        output_dir = f'Policy text/{folder_name}'
        os.makedirs(output_dir, exist_ok=True)
        csv_file = f'{output_dir}/syllabus_summary_2.csv'
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
        print(f"Summary CSV written to {csv_file}")

In [None]:
if __name__ == '__main__':
    # Batch run writing syllabus_summary_3.csv: analyze every entry of
    # 'AY 2023-2024/part 1' .. 'part 15' with the default (rule-based)
    # course-code finder and write one summary CSV per part folder.
    base_dir = 'AY 2023-2024'
    fieldnames = [
        'File', 'Course Code', 'Department', 'Knowledge Area',
        'AI Policy', 'Multiple Clusters', 'LLM Failure'
    ]
    for i in range(1, 16):
        folder_name = f'part {i}'
        # Named part_dir (not `dir`) so the builtin dir() is not shadowed.
        part_dir = os.path.join(base_dir, folder_name)
        results = []
        # os.listdir returns entries in arbitrary order; sort them so the CSV
        # rows come out in the same order on every run.
        for fname in sorted(os.listdir(part_dir)):
            path = os.path.join(part_dir, fname)
            code, dept, area, policy, multi, llm_fail = analyze_syllabus(path)
            results.append({
                'File': fname,
                'Course Code': code,
                'Department': dept,
                'Knowledge Area': area,
                'AI Policy': policy or '',  # None becomes an empty cell
                'Multiple Clusters': multi,
                'LLM Failure': llm_fail
            })

        output_dir = f'Policy text/{folder_name}'
        os.makedirs(output_dir, exist_ok=True)
        csv_file = f'{output_dir}/syllabus_summary_3.csv'
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
        print(f"Summary CSV written to {csv_file}")

In [None]:
if __name__ == '__main__':
    # Batch run writing syllabus_summary_4.csv: analyze every entry of
    # 'AY 2023-2024/part 1' .. 'part 15' with the default (rule-based)
    # course-code finder and write one summary CSV per part folder.
    base_dir = 'AY 2023-2024'
    fieldnames = [
        'File', 'Course Code', 'Department', 'Knowledge Area',
        'AI Policy', 'Multiple Clusters', 'LLM Failure'
    ]
    for i in range(1, 16):
        folder_name = f'part {i}'
        # Named part_dir (not `dir`) so the builtin dir() is not shadowed.
        part_dir = os.path.join(base_dir, folder_name)
        results = []
        # os.listdir returns entries in arbitrary order; sort them so the CSV
        # rows come out in the same order on every run.
        for fname in sorted(os.listdir(part_dir)):
            path = os.path.join(part_dir, fname)
            code, dept, area, policy, multi, llm_fail = analyze_syllabus(path)
            results.append({
                'File': fname,
                'Course Code': code,
                'Department': dept,
                'Knowledge Area': area,
                'AI Policy': policy or '',  # None becomes an empty cell
                'Multiple Clusters': multi,
                'LLM Failure': llm_fail
            })

        output_dir = f'Policy text/{folder_name}'
        os.makedirs(output_dir, exist_ok=True)
        csv_file = f'{output_dir}/syllabus_summary_4.csv'
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
        print(f"Summary CSV written to {csv_file}")

In [None]:
if __name__ == '__main__':
    # Batch run writing syllabus_summary_5.csv: analyze every entry of
    # 'AY 2023-2024/part 1' .. 'part 15' with the default (rule-based)
    # course-code finder and write one summary CSV per part folder.
    base_dir = 'AY 2023-2024'
    fieldnames = [
        'File', 'Course Code', 'Department', 'Knowledge Area',
        'AI Policy', 'Multiple Clusters', 'LLM Failure'
    ]
    for i in range(1, 16):
        folder_name = f'part {i}'
        # Named part_dir (not `dir`) so the builtin dir() is not shadowed.
        part_dir = os.path.join(base_dir, folder_name)
        results = []
        # os.listdir returns entries in arbitrary order; sort them so the CSV
        # rows come out in the same order on every run.
        for fname in sorted(os.listdir(part_dir)):
            path = os.path.join(part_dir, fname)
            code, dept, area, policy, multi, llm_fail = analyze_syllabus(path)
            results.append({
                'File': fname,
                'Course Code': code,
                'Department': dept,
                'Knowledge Area': area,
                'AI Policy': policy or '',  # None becomes an empty cell
                'Multiple Clusters': multi,
                'LLM Failure': llm_fail
            })

        output_dir = f'Policy text/{folder_name}'
        os.makedirs(output_dir, exist_ok=True)
        csv_file = f'{output_dir}/syllabus_summary_5.csv'
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
        print(f"Summary CSV written to {csv_file}")

In [None]:
# Create a CSV for one particular part folder only (disabled; uncomment the code below to run it).

# if __name__ == '__main__':
#     base_dir = 'AY 2023-2024'
#     folder_name = 'part 14' 

#     dir = os.path.join(base_dir, folder_name)
#     results = []
    
#     for fname in os.listdir(dir):
#         path = os.path.join(dir, fname)
#         res = analyze_syllabus(path)
#         code, dept, area, policy, multi, llm_fail = res
#         results.append({
#             'File': os.path.basename(fname),
#             'Course Code': code,
#             'Department': dept,
#             'Knowledge Area': area,
#             'AI Policy': policy or '',
#             'Multiple Clusters': multi,
#             'LLM Failure': llm_fail
#         })
            
#     output_dir = f'Policy text/{folder_name}'
#     os.makedirs(output_dir, exist_ok=True)  
#     csv_file = f'Policy text/{folder_name}/syllabus_summary_1.csv'
#     with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
#         writer = csv.DictWriter(f, fieldnames=[
#             'File', 'Course Code', 'Department', 'Knowledge Area', 'AI Policy', 'Multiple Clusters', 'LLM Failure'
#         ])
#         writer.writeheader()
#         writer.writerows(results)
#     print(f"Summary CSV written to {csv_file}")