In [None]:
# --- Imports ---
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

# --- Load Term List CSV ---
# Term table used by the screening code in this file, which reads the columns
# 'Primary Term', 'Secondary Terms', 'Tertiary Terms', 'Category',
# 'Flag Reason' and 'Executive Orders'. The path is relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer='communication_screen_terms_v2.csv')

# --- Constants & Helpers ---
def clean_terms(term_string):
    """Split a comma-separated cell into a list of trimmed, non-empty terms.

    Args:
        term_string: Raw cell value such as ``"alpha, beta , ,gamma"``.
            Anything that is not a string (``None``, or the float NaN that
            pandas produces for an empty CSV cell) is treated as "no terms".

    Returns:
        list[str]: The terms in their original order, each stripped of
        surrounding whitespace, with empty pieces dropped.
    """
    if not isinstance(term_string, str):
        # An empty CSV cell is loaded as float NaN, which has no .split().
        return []
    stripped = (piece.strip() for piece in term_string.split(','))
    return [term for term in stripped if term]

# --- Screening Function (deduplicated + grouped info) ---
def screen_text(input_text):
    """Find every occurrence of a listed term in *input_text*.

    Each row of the module-level ``df`` contributes its primary term plus its
    comma-separated secondary and tertiary terms. Terms are matched literally
    (regex-escaped), case-insensitively, and as plain substrings (there is no
    word-boundary check).

    Hits that share the same matched text (ignoring case) and the same start
    offset are merged into one result. The merged result takes its match
    type, flag reason and executive orders from the highest-priority hit
    (Primary before Secondary before Tertiary; the first hit wins a tie) and
    lists every distinct primary term and category among the merged hits.

    Missing cells (NaN) and blank primary terms are ignored rather than
    raising or matching everywhere.

    Args:
        input_text: The text to screen.

    Returns:
        list[dict]: One dict per distinct (matched text, position), sorted by
        ``'Position'``, with keys ``'Flagged Term'``, ``'Match Type'``,
        ``'Flag Reason'``, ``'Executive Orders'``, ``'Position'``,
        ``'Primary Terms'`` and ``'Categories'``.
    """
    def usable_terms(cell):
        # pandas loads an empty CSV cell as float NaN; treat it as "no terms".
        return clean_terms(cell) if isinstance(cell, str) else []

    results_by_pos = defaultdict(list)
    for _, row in df.iterrows():
        primary = row['Primary Term']
        # A blank primary term would compile to an empty pattern that
        # "matches" at every character position, so it is skipped; a NaN one
        # would make re.escape raise.
        has_primary = isinstance(primary, str) and bool(primary.strip())
        term_map = {
            'Primary': [primary] if has_primary else [],
            'Secondary': usable_terms(row['Secondary Terms']),
            'Tertiary': usable_terms(row['Tertiary Terms']),
        }
        for match_type, terms in term_map.items():
            for term in terms:
                for match in re.finditer(re.escape(term), input_text, re.IGNORECASE):
                    key = (match.group(0).lower(), match.start())
                    results_by_pos[key].append({
                        'Flagged Term': match.group(0),
                        'Match Type': match_type,
                        'Primary Term': row['Primary Term'],
                        'Category': row['Category'],
                        'Flag Reason': row['Flag Reason'],
                        'Executive Orders': row['Executive Orders'],
                        'Position': match.start()
                    })

    priority = {'Primary': 0, 'Secondary': 1, 'Tertiary': 2}
    grouped_results = []
    for matches in results_by_pos.values():
        # min() returns the first minimal element, i.e. the same hit a stable
        # sort by priority would put first.
        best = min(matches, key=lambda m: priority[m['Match Type']])
        grouped_results.append({
            'Flagged Term': best['Flagged Term'],
            'Match Type': best['Match Type'],
            'Flag Reason': best['Flag Reason'],
            'Executive Orders': best['Executive Orders'],
            'Position': best['Position'],
            # Missing values are dropped: NaN cannot be ordered against
            # strings, so sorting a mixed set would raise TypeError.
            'Primary Terms': sorted(
                {m['Primary Term'] for m in matches if pd.notna(m['Primary Term'])}
            ),
            'Categories': sorted(
                {m['Category'] for m in matches if pd.notna(m['Category'])}
            ),
        })
    return sorted(grouped_results, key=lambda x: x['Position'])

# --- Highlighting Function ---
def highlight_text(input_text, flagged):
    """Render *input_text* as HTML with flagged terms wrapped in ``<mark>``.

    Every distinct flagged term is searched for (literally and
    case-insensitively) across the whole text. Matches are taken left to
    right; a match that starts inside an already accepted match is dropped,
    so highlights never overlap. Primary and Secondary matches are orange,
    all others yellow. Each highlight carries a hover tooltip with the flag
    reason, primary term(s), category(ies) and source.

    The text and the tooltip are HTML-escaped before being embedded, so
    markup or quotes in the screened text or in the term list cannot break
    or inject into the rendered output.

    Args:
        input_text: The text that was screened.
        flagged: Iterable of dicts with the keys ``'Flagged Term'``,
            ``'Match Type'``, ``'Flag Reason'``, ``'Executive Orders'``,
            ``'Primary Terms'`` and ``'Categories'``.

    Returns:
        IPython.display.HTML: A monospace, whitespace-preserving ``<div>``
        containing the highlighted text.
    """
    # Local import: only this function needs it, and the file's import block
    # does not bring in the html module.
    from html import escape

    matches = []
    searched = set()
    for entry in flagged:
        term = entry['Flagged Term']
        term_key = term.lower()
        # Each entry searches the whole text, so a term flagged at several
        # positions only needs to be searched once; the first entry for a
        # term is the one whose details were used anyway. Empty terms would
        # match at every position and are skipped.
        if not term or term_key in searched:
            continue
        searched.add(term_key)
        pattern = re.compile(re.escape(term), re.IGNORECASE)
        for match in pattern.finditer(input_text):
            matches.append({
                'start': match.start(),
                'end': match.end(),
                'text': match.group(0),
                'match_type': entry['Match Type'],
                'reason': entry['Flag Reason'],
                'eo': entry['Executive Orders'],
                'primary_terms': entry['Primary Terms'],
                'categories': entry['Categories']
            })
    # Stable sort: among matches with the same start, the earlier entry wins.
    matches.sort(key=lambda x: x['start'])

    final_matches = []
    last_end = -1
    for match in matches:
        if match['start'] >= last_end:
            final_matches.append(match)
            last_end = match['end']

    pieces = []
    last_index = 0
    for match in final_matches:
        pieces.append(escape(input_text[last_index:match['start']]))
        color = '#FFA500' if match['match_type'] in ['Primary', 'Secondary'] else '#FFFF00'
        tooltip = (
            f"Reason for flagging: {match['reason']}\n"
            f"Primary Term(s): {', '.join(map(str, match['primary_terms']))}\n"
            f"Category(ies): {', '.join(map(str, match['categories']))}\n"
            f"Source: {match['eo']}"
        )
        # quote=True also escapes quotes so the tooltip cannot terminate the
        # title attribute early.
        safe_tooltip = escape(tooltip, quote=True)
        safe_text = escape(match['text'])
        pieces.append(
            f'<mark style="background-color:{color}" title="{safe_tooltip}">{safe_text}</mark>'
        )
        last_index = match['end']
    pieces.append(escape(input_text[last_index:]))

    result = "".join(pieces)
    return HTML(f"<div style='white-space: pre-wrap; font-family: monospace;'>{result}</div>")

# --- Webpage Scraper ---
def fetch_url_text(url):
    """Download a web page and return the text of its ``<p>`` elements.

    ``script``, ``style``, ``header``, ``footer``, ``nav`` and ``aside``
    elements are removed before the paragraphs are collected. Empty
    paragraphs are dropped and the rest are joined with blank lines.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.

    Returns:
        str: The extracted paragraph text, or a string starting with
        ``"Error fetching URL:"`` if the request fails, the server answers
        with an HTTP error status, or parsing raises. Progress and errors
        are also printed.
    """
    print("Fetching URL:", url)
    try:
        response = requests.get(url.strip(), timeout=10)
        # Without this check, a 404/500 error page would be parsed and
        # screened as if it were the requested content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            tag.decompose()
        paragraphs = soup.find_all('p')
        cleaned_paragraphs = [p.get_text(separator=' ', strip=True) for p in paragraphs if p.get_text(strip=True)]
        body_text = "\n\n".join(cleaned_paragraphs)
        print("Successfully fetched and parsed. Final text length:", len(body_text))
        return body_text
    except Exception as e:
        # Deliberately broad: this runs inside a notebook UI callback, and
        # the caller recognises failures by the returned message prefix.
        print("Error:", e)
        return f"Error fetching URL: {e}"

# --- Widgets ---
# Free-text box for pasting the text to screen.
text_input = widgets.Textarea(
    layout=widgets.Layout(width='100%', height='150px'),
    placeholder='Paste your text here...',
)

# Single-line box for the address of a page to fetch and screen instead.
url_input = widgets.Text(
    layout=widgets.Layout(width='100%'),
    placeholder='Or paste a URL to scan...',
)

# Two separate output panes: one for the summary, one for the highlighted text.
output_area, highlight_area = widgets.Output(), widgets.Output()

# --- Unified Handler ---
def handle_screening(input_value, get_text_func, label):
    """Screen one input and show the results in the two output panes.

    Both panes are cleared first. The summary pane then receives either a
    notice (blank input, fetch error, or no flags) or the text length, the
    flag count and a 1-indexed table of the flags. When there is at least one
    flag, the highlight pane receives a heading and the highlighted text.

    Args:
        input_value: Raw value taken from the input widget.
        get_text_func: Callable turning ``input_value`` into the text to
            screen; a result starting with ``"Error fetching URL:"`` is
            printed and treated as a failure.
        label: Short name of the input kind, used in the messages.
    """
    for pane in (output_area, highlight_area):
        pane.clear_output()

    hits = []
    with output_area:
        if not input_value.strip():
            print(f'No {label} provided.')
            return

        text = get_text_func(input_value)
        if text.startswith("Error fetching URL:"):
            print(text)
            return

        print("Text length:", len(text))
        hits = screen_text(text)
        print("Flags found:", len(hits))

        if not hits:
            print("No flagged terms found.")
        else:
            summary_columns = [
                'Flagged Term', 'Match Type', 'Primary Terms', 'Categories', 'Executive Orders'
            ]
            summary = pd.DataFrame(hits)[summary_columns]
            summary.index += 1  # number the rows from 1 instead of 0
            display(summary)

    if not hits:
        return

    with highlight_area:
        heading = f"<b>Highlighted {label.capitalize()} (hover for rationale and source):</b>"
        display(HTML(heading))
        display(highlight_text(text, hits))

# --- Buttons and Display ---
def _on_screen_text(_button):
    """Screen the current contents of the text box as-is."""
    handle_screening(text_input.value, lambda raw: raw, 'text')


def _on_screen_url(_button):
    """Fetch the page named in the URL box and screen its text."""
    handle_screening(url_input.value, fetch_url_text, 'URL')


text_button = widgets.Button(button_style='success', description='Screen Text')
text_button.on_click(_on_screen_text)

url_button = widgets.Button(button_style='info', description='Screen URL')
url_button.on_click(_on_screen_url)

# Stack the inputs, their buttons and both output panes top to bottom.
display(widgets.VBox([
    text_input,
    text_button,
    url_input,
    url_button,
    output_area,
    highlight_area,
]))