<a href="https://colab.research.google.com/github/yuugiouduele/AImodel/blob/main/Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Google Colab-ready: Paper parsing & precision-reading assistant pipeline (DEBUGGED)
# -----------------------------------------------------------------
# Goal:
# - Load a PDF (user-uploaded) or use a built-in dummy PDF
# - Extract text, equations, figure/table captions, and page/section boundaries
# - Produce: frequent-term ranking, term-definition dictionary (with contexts),
#   symbol list (variables, math tokens), and a heatmap of term occurrences by section/page
# - Provide helper functions to call the OpenAI (ChatGPT) API to fetch definitions/explanations
# - This notebook is intended as a runnable Colab script; it is robust to missing plotting
#   libraries (matplotlib) and will fall back to plotly or CSV outputs when necessary.

# USAGE in Colab:
# 1. Upload your PDF using the file upload widget or mount Google Drive
# 2. Run each cell in order. The pipeline will create CSV/PNG/HTML outputs and a small report.
# 3. Replace the OpenAI API key placeholder or set it as an environment variable if you want
#    automated LLM-based definitions (recommended: review them manually).

# -----------------------------------------------------------------
# Install dependencies (best-effort). If you run in a locked environment, you can skip this cell
# and ensure the listed packages exist in your environment.
!pip install --quiet PyMuPDF pdfplumber spacy scikit-learn wordcloud openai python-dateutil matplotlib plotly kaleido reportlab
!python -m spacy download en_core_web_sm

# -----------------------------------------------------------------
# Imports (deferred plotting imports are handled in helper functions)
import os
import re
import io
import json
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

import pdfplumber
import fitz  # PyMuPDF
import pandas as pd
import numpy as np

# WordCloud and spaCy
from wordcloud import WordCloud
import spacy

# Optional OpenAI client
try:
    import openai
except Exception:
    openai = None

# Load spaCy model (downloaded above)
nlp = spacy.load("en_core_web_sm")

# -----------------------------------------------------------------
# Utilities: robust plotting helpers that gracefully handle missing matplotlib

def _detect_plotting_backend():
    """Probe for an importable plotting library.

    Returns:
        'matplotlib' if ``matplotlib.pyplot`` imports; otherwise 'plotly' if
        both ``plotly.express`` and ``plotly.graph_objects`` import;
        otherwise ``None``.
    """
    # First choice: matplotlib.
    try:
        import matplotlib.pyplot as _plt  # type: ignore
    except Exception:
        pass
    else:
        return 'matplotlib'

    # Second choice: plotly (both submodules must be importable).
    try:
        import plotly.express as _px  # type: ignore
        import plotly.graph_objects as _go  # type: ignore
    except Exception:
        return None
    return 'plotly'


# Resolved once when the cell runs; read by the plotting helpers below.
PLOTTING_BACKEND = _detect_plotting_backend()


def save_bar_chart(labels: List[str], values: List[int], filepath: str):
    """Save a bar chart of ``values`` labelled by ``labels``.

    The output depends on ``PLOTTING_BACKEND``:
      * 'matplotlib': an image is written to ``filepath``.
      * 'plotly': an image is written to ``filepath``; if image export fails,
        an interactive HTML file is written next to it instead.
      * otherwise: the data is written as a CSV next to ``filepath``.

    Args:
        labels: One label per bar, in display order.
        values: Bar heights, parallel to ``labels``.
        filepath: Target image path. Fallback files reuse its base name with
            a ``.html`` or ``.csv`` extension.
    """
    backend = PLOTTING_BACKEND
    # Swap only the final extension; str.replace('.png', ...) would leave
    # non-.png targets unchanged and could rewrite '.png' inside a directory.
    base_path = os.path.splitext(filepath)[0]
    if backend == 'matplotlib':
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(figsize=(max(6, len(labels)*0.4), 4))
        # Plot against explicit positions and pin the ticks before labelling:
        # calling set_xticklabels without fixed ticks emits a UserWarning.
        positions = list(range(len(labels)))
        ax.bar(positions, values)
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.set_ylabel('Frequency')
        fig.tight_layout()
        fig.savefig(filepath)
        plt.close(fig)  # release the figure so repeated calls do not accumulate
        print(f'Saved bar chart as {filepath} using matplotlib')
    elif backend == 'plotly':
        import plotly.express as px
        fig = px.bar(x=labels, y=values, labels={'x':'Word','y':'Frequency'})
        # Prefer image export, fall back to HTML
        try:
            fig.write_image(filepath)
            print(f'Saved bar chart as {filepath} using plotly (image)')
        except Exception:
            html_path = base_path + '.html'
            fig.write_html(html_path)
            print(f'Plotly image export not available. Saved interactive HTML at {html_path}')
    else:
        # No plotting library available
        df = pd.DataFrame({'label': labels, 'value': values})
        csv_path = base_path + '.csv'
        df.to_csv(csv_path, index=False)
        print(f'No plotting library available. Saved CSV at {csv_path}')


def save_heatmap(matrix: np.ndarray, row_labels: List[str], col_labels: List[str], filepath: str):
    """Save ``matrix`` as a heatmap, or as a CSV when no plotting backend exists.

    The output depends on ``PLOTTING_BACKEND``:
      * 'matplotlib': an image is written to ``filepath``.
      * 'plotly': an image is written to ``filepath``; if image export fails,
        an interactive HTML file is written next to it instead.
      * otherwise: the labelled matrix is written as a CSV next to ``filepath``.

    Args:
        matrix: 2-D array of counts; rows follow ``row_labels`` and columns
            follow ``col_labels``.
        row_labels: Labels for the y axis (terms).
        col_labels: Labels for the x axis (pages).
        filepath: Target image path. Fallback files reuse its base name with
            a ``.html`` or ``.csv`` extension.
    """
    backend = PLOTTING_BACKEND
    # Swap only the final extension; str.replace('.png', ...) would leave
    # non-.png targets unchanged and could rewrite '.png' inside a directory.
    base_path = os.path.splitext(filepath)[0]
    if backend == 'matplotlib':
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(figsize=(max(6, len(col_labels)*0.6), max(4, len(row_labels)*0.3)))
        cax = ax.imshow(matrix, aspect='auto', cmap='YlGnBu')
        fig.colorbar(cax, ax=ax, label='count')
        ax.set_yticks(range(len(row_labels)))
        ax.set_yticklabels(row_labels)
        ax.set_xticks(range(len(col_labels)))
        ax.set_xticklabels(col_labels, rotation=90)
        ax.set_xlabel('Page')
        ax.set_ylabel('Term')  # same axis titles as the plotly branch
        fig.tight_layout()
        fig.savefig(filepath)
        plt.close(fig)  # release the figure so repeated calls do not accumulate
        print(f'Saved heatmap as {filepath} using matplotlib')
    elif backend == 'plotly':
        import plotly.express as px
        fig = px.imshow(matrix, labels=dict(x='Page', y='Term', color='count'), x=col_labels, y=row_labels)
        try:
            fig.write_image(filepath)
            print(f'Saved heatmap as {filepath} using plotly (image)')
        except Exception:
            html_path = base_path + '.html'
            fig.write_html(html_path)
            print(f'Plotly image export not available. Saved interactive HTML at {html_path}')
    else:
        # fallback to CSV
        df = pd.DataFrame(matrix, index=row_labels, columns=col_labels)
        csv_path = base_path + '.csv'
        df.to_csv(csv_path)
        print(f'No plotting library available. Saved matrix CSV at {csv_path}')


def save_wordcloud_from_frequencies(freq_series: pd.Series, filepath: str):
    """Render a word cloud from a word -> frequency series and save it.

    If building or saving the image fails for any reason, the frequencies are
    written as a CSV next to ``filepath`` instead (best-effort behaviour).

    Args:
        freq_series: Series indexed by word whose values are frequencies.
        filepath: Target image path. The CSV fallback reuses its base name
            with a ``.csv`` extension.
    """
    try:
        wc = WordCloud(width=1200, height=600)
        wc.generate_from_frequencies(freq_series.to_dict())
        wc.to_file(filepath)
        print(f'Saved wordcloud image to {filepath}')
    except Exception as e:
        # Swap only the final extension so a non-.png target never receives
        # CSV data under the image file name.
        csv_path = os.path.splitext(filepath)[0] + '.csv'
        freq_series.to_frame('frequency').to_csv(csv_path)
        print(f'Could not save wordcloud image ({e}). Saved frequencies CSV at {csv_path}')

# -----------------------------------------------------------------
# Helper: create a small dummy PDF if none is provided (safe for commercial use)

def create_dummy_pdf(path: str):
    """Write a small synthetic "paper" PDF to ``path`` using reportlab.

    The document holds a title, five headed sections (Abstract, Introduction,
    Method, Results, Discussion) and a results table placed between Results
    and Discussion.

    Raises:
        RuntimeError: if the reportlab imports fail.
    """
    try:
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import A4
        from reportlab.lib.styles import getSampleStyleSheet
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
    except Exception:
        raise RuntimeError('reportlab not available to create dummy PDF. Please upload a PDF.')

    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(path, pagesize=A4)

    def gap():
        # Fresh vertical spacer between consecutive story elements.
        return Spacer(1, 12)

    def section(heading, body):
        # A heading paragraph followed by its body paragraph.
        return [Paragraph(heading, styles['Heading2']), Paragraph(body, styles['BodyText'])]

    results_table = Table([
        ['Model', 'RMSE', 'AUC'],
        ['Markov Chain', '3.45', '0.81'],
        ['Kalman Filter', '3.12', '0.85'],
        ['Proposed Bayesian Filter', '2.94', '0.92'],
    ])
    results_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))

    # Sections that come before the table, in document order.
    leading_sections = [
        ('<b>Abstract</b>',
         'This is a synthetic dummy paper generated for testing the parsing pipeline. It contains repeated keywords and a few math-like tokens.'),
        ('<b>1. Introduction</b>',
         'Intro with keywords: Bayesian, temporal, latent, filter, weather, synthetic, Markov, Gaussian, model, variable.'),
        ('<b>2. Method</b>',
         'We define X_t, z_t and assume P(Y_t | X_t) = N(X_t, sigma^2) and P(X_t | X_{t-1}) = GP(X_{t-1}, k(.,.))'),
        ('<b>3. Results</b>',
         'Results mention RMSE, AUC, ROC and comparison to Kalman Filter and Markov Chain.'),
    ]

    story = [
        Paragraph('<b>Bayesian Spatio-Temporal Forecasting for Synthetic Weather Patterns</b>', styles['Title']),
        gap(),
    ]
    for heading, body in leading_sections:
        story.extend(section(heading, body))
        story.append(gap())
    story.append(results_table)
    story.append(gap())
    story.extend(section(
        '<b>4. Discussion</b>',
        'Discussion with keywords: hierarchical, forecasting, atmospheric, modeling, latent.',
    ))

    doc.build(story)
    print(f'Created dummy PDF at {path}')

# -----------------------------------------------------------------
# PDF input: upload in Colab or use local path

# Resolve pdf_path: a file uploaded through the Colab widget when available,
# otherwise a generated dummy paper.
try:
    from google.colab import files  # raises outside Colab -> handled below
    print('Upload a PDF file (or cancel to use built-in dummy)')
    uploaded = files.upload()
    if not uploaded:
        # Nothing uploaded: fall back to the dummy paper under /content.
        pdf_path = '/content/dummy_paper.pdf'
        if not os.path.exists(pdf_path):
            create_dummy_pdf(pdf_path)
    else:
        # Use the first uploaded file name.
        pdf_path = next(iter(uploaded.keys()))
except Exception:
    # Not running in Colab (or the upload path failed): use or create a
    # dummy paper in the current working directory.
    pdf_path = 'dummy_paper.pdf'
    if not os.path.exists(pdf_path):
        create_dummy_pdf(pdf_path)

print(f'Using PDF: {pdf_path}')

# -----------------------------------------------------------------
# Function: extract raw text on a per-page basis using pdfplumber (better layout) and fallback to fitz

def extract_text_per_page(path: str) -> List[str]:
    """Return the text of each page of the PDF at ``path``, in page order.

    pdfplumber is tried first. If it raises at any point, the whole document
    is re-read with PyMuPDF (``fitz``) so the result never mixes a partial
    pdfplumber pass with a full PyMuPDF pass.

    Args:
        path: Path to the PDF file.

    Returns:
        One string per page. With pdfplumber, pages without extractable text
        yield ``""``.

    Raises:
        Whatever ``fitz`` raises if the fallback also fails.
    """
    try:
        with pdfplumber.open(path) as pdf:
            return [page.extract_text() or "" for page in pdf.pages]
    except Exception:
        # Fallback to PyMuPDF. The context manager closes the document, and
        # the list is built from scratch so no pages are duplicated.
        with fitz.open(path) as doc:
            return [page.get_text() for page in doc]

# Per-page text of the selected PDF; every later step works from this list.
pages = extract_text_per_page(pdf_path)
print(f'Extracted {len(pages)} pages')

# -----------------------------------------------------------------
# Section splitter (heuristic): one section per page; the page's first non-empty line is used as the section title when it looks like a standard heading

def split_into_sections(pages: List[str]) -> List[Dict]:
    """Turn per-page text into one section record per page.

    The first non-empty line of a page becomes the section title when it
    starts with a standard heading word (abstract, introduction, method,
    results, discussion, references), optionally preceded by a section number
    such as ``1.``, ``2)`` or ``3.1``. Matching is case-insensitive.

    Args:
        pages: Page texts in page order; falsy entries are treated as empty.

    Returns:
        A list of dicts with keys ``page`` (1-based), ``section_title``
        (the heading line or ``None``) and ``text``.
    """
    # Optional numeric prefix so numbered headings ("1. Introduction") are
    # recognised as well as bare ones ("Introduction").
    heading_re = re.compile(
        r'^(?:\d+(?:\.\d+)*[.)]?\s*)?'
        r'(?:abstract|introduction|method|results|discussion|references)',
        flags=re.IGNORECASE,
    )
    sections = []
    for page_no, text in enumerate(pages, start=1):
        if not text:
            sections.append({'page': page_no, 'section_title': None, 'text': ''})
            continue
        # Only the first non-blank line is considered as a heading candidate.
        first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), None)
        title = first_line if first_line and heading_re.match(first_line) else None
        sections.append({'page': page_no, 'section_title': title, 'text': text})
    return sections

# One record per page: {'page', 'section_title', 'text'}.
sections = split_into_sections(pages)

# -----------------------------------------------------------------
# Tokenization & cleanup utilities
# Common English function words dropped from the token stream.
stopwords = set(
    "the and for of to a in on is we our was by this with at an as be are "
    "that it from both which these have has or".split()
)

# A token starts with a letter and is at least three characters long;
# digits, underscores and hyphens are allowed after the first character.
word_re = re.compile(r"[A-Za-z][A-Za-z0-9_\-]{2,}")

def tokenize(text: str) -> List[str]:
    """Split ``text`` into lower-cased tokens with stopwords removed.

    Every character other than ASCII letters, digits, ``_`` and ``-`` is
    treated as a separator before ``word_re`` is applied.
    """
    cleaned = re.sub(r"[^A-Za-z0-9_\-]", " ", text)
    lowered = (match.lower() for match in word_re.findall(cleaned))
    return [tok for tok in lowered if tok not in stopwords]

# -----------------------------------------------------------------
# 1) Frequent-term ranking (global / per-section)

def compute_frequencies(sections: List[Dict], topk: int=50) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
    """Count token frequencies over all sections and per page.

    Args:
        sections: Records with at least ``'page'`` and ``'text'`` keys.
        topk: Number of rows kept in the global ranking.

    Returns:
        A pair ``(global_df, per_page)``: ``global_df`` has columns
        ``word``/``frequency`` sorted by descending frequency and truncated
        to ``topk`` rows; ``per_page`` maps each page number to a frame with
        columns ``word``/``freq`` sorted by descending frequency.
    """
    totals = Counter()
    per_page_counters = {}
    for section in sections:
        page_counts = Counter(tokenize(section['text'] or ''))
        page_df = pd.DataFrame(page_counts.items(), columns=['word','freq'])
        per_page_counters[section['page']] = page_df.sort_values('freq', ascending=False)
        totals.update(page_counts)

    ranked = pd.DataFrame(totals.items(), columns=['word','frequency'])
    ranked = ranked.sort_values('frequency', ascending=False).reset_index(drop=True)
    return ranked.head(topk), per_page_counters

# Global ranking (top 200 words) plus the per-page frequency tables.
global_freq_df, per_page = compute_frequencies(sections, topk=200)

# Save CSV for inspection
global_freq_df.to_csv('global_frequency.csv', index=False)
print('Saved global_frequency.csv')

# -----------------------------------------------------------------
# 2) Term dictionary (definitions pulled from local context + optional external API)

def extract_contexts_for_term(term: str, sections: List[Dict], window_chars: int=200, freq_df: pd.DataFrame=None) -> Dict:
    """Collect the text snippets and pages where ``term`` occurs as a whole word.

    Args:
        term: Term to search for (case-insensitive).
        sections: Records with at least ``'page'`` and ``'text'`` keys.
        window_chars: Number of characters kept on each side of a match.
        freq_df: Frequency table with ``word``/``frequency`` columns used to
            look up the reported frequency. Defaults to the module-level
            ``global_freq_df``.

    Returns:
        ``{'term', 'freq', 'pages', 'contexts'}``: ``pages`` is a sorted list
        of page numbers with at least one match, and ``contexts`` holds one
        snippet per match with newlines replaced by spaces. ``freq`` is 0
        when the term is absent from the frequency table.
    """
    contexts = []
    pages_set = set()
    # Whole-word match. Lookarounds behave like \b for terms that start and
    # end with a word character, and also work for terms ending in '-'.
    # (The raw string r"\\b" would match a literal backslash followed by
    # 'b', so no term would ever be found.)
    pattern = re.compile(r"(?<!\w)" + re.escape(term) + r"(?!\w)", flags=re.IGNORECASE)
    for s in sections:
        text = s['text'] or ''
        for m in pattern.finditer(text):
            start = max(0, m.start()-window_chars)
            end = min(len(text), m.end()+window_chars)
            contexts.append(text[start:end].replace('\n', ' '))
            pages_set.add(s['page'])
    table = global_freq_df if freq_df is None else freq_df
    hits = table.loc[table['word'] == term, 'frequency']
    freq_val = int(hits.iloc[0]) if len(hits) else 0
    return {'term': term, 'freq': freq_val, 'pages': sorted(pages_set), 'contexts': contexts}

# Build the preliminary term dictionary: one context record per top-K term.
TOP_K = 40
term_dict = [
    extract_contexts_for_term(term, sections)
    for term in global_freq_df['word'].head(TOP_K).tolist()
]

# Full records (including context snippets) go to JSON.
with open('term_dictionary.json', 'w') as f:
    json.dump(term_dict, f, indent=2)

# The CSV summary keeps only the counts, not the snippets themselves.
summary_rows = []
for entry in term_dict:
    summary_rows.append({
        'term': entry['term'],
        'freq': entry['freq'],
        'pages': entry['pages'],
        'num_contexts': len(entry['contexts']),
    })
term_summary = pd.DataFrame(summary_rows)
term_summary.to_csv('term_summary.csv', index=False)
print('Saved term_dictionary.json and term_summary.csv')

# -----------------------------------------------------------------
# 3) Symbol / math token extraction

def extract_symbols_from_text(text: str) -> List[str]:
    """Return the sorted, de-duplicated math-like tokens found in ``text``.

    Sources, all merged into one set:
      * tokens inside ``$...$`` spans (letters or backslashes with optional
        ``_sub`` and ``^sup`` parts),
      * subscripted names such as ``X_t``,
      * numeric superscripts such as ``sigma^2``,
      * the fixed names GP, N, RMSE, AUC and ROC,
      * any standalone single letter.
    """
    found = set()

    # LaTeX inline math: tokenise the contents of each $...$ span.
    latex_token = r"[A-Za-z\\]+(?:_[a-zA-Z0-9]+)?(?:\^\{?[0-9a-zA-Z]+\}?)?"
    for inline in re.findall(r"\$([^$]{1,200})\$", text):
        found.update(re.findall(latex_token, inline))

    # Patterns applied to the whole text.
    whole_text_patterns = (
        r"[A-Za-z]+_[A-Za-z0-9]+",                       # subscripts like X_t, z_t
        r"[A-Za-z]+\^[0-9]+",                            # superscripts like sigma^2
        r"\bGP\b|\bN\b|\bRMSE\b|\bAUC\b|\bROC\b",        # common math objects
        r"\b[a-zA-Z]\b",                                 # single-letter variables
    )
    for pattern in whole_text_patterns:
        found.update(re.findall(pattern, text))

    return sorted(sym for sym in found if sym)

# Union of the symbols found in every section, sorted for stable output.
symbols = sorted({
    sym
    for s in sections
    for sym in extract_symbols_from_text(s['text'] or '')
})
with open('symbol_list.json', 'w') as f:
    json.dump(symbols, f, indent=2)
print('Saved symbol_list.json')

# -----------------------------------------------------------------
# 4) Heatmap: term occurrence by page/section
# We'll build a matrix: rows=top-N terms, cols=page numbers, entries=count

TOP_N = 30
terms = global_freq_df['word'].head(TOP_N).tolist()
num_pages = len(sections)
# mat[i, j] = number of whole-word, case-insensitive occurrences of terms[i]
# in the text of sections[j].
mat = np.zeros((len(terms), num_pages), dtype=int)
for i, t in enumerate(terms):
    # Whole-word match. The raw string r"\\b" would match a literal backslash
    # followed by 'b', which never occurs, leaving the matrix all zeros.
    pat = re.compile(r"(?<!\w)" + re.escape(t) + r"(?!\w)", flags=re.IGNORECASE)
    for j, s in enumerate(sections):
        mat[i, j] = len(pat.findall(s['text'] or ''))

heat_df = pd.DataFrame(mat, index=terms, columns=[f'page_{p+1}' for p in range(num_pages)])
heat_df.to_csv('term_page_heatmap.csv')

# Plot / save heatmap using robust helper
save_heatmap(mat, row_labels=terms, col_labels=[str(i+1) for i in range(num_pages)], filepath='term_heatmap.png')
print('Saved term_page_heatmap.csv')

# -----------------------------------------------------------------
# Frequency bar chart for top 15
# Cap the chart at 15 bars, or fewer when the ranking is shorter.
TOP_SHOW = min(15, len(global_freq_df))
top15 = global_freq_df.head(TOP_SHOW)
labels = top15['word'].tolist()
values = top15['frequency'].astype(int).tolist()
save_bar_chart(labels, values, 'top15_frequency.png')

# -----------------------------------------------------------------
# Wordcloud (save to file via WordCloud.to_file)
# The helper receives a word-indexed Series of frequencies.
save_wordcloud_from_frequencies(global_freq_df.set_index('word')['frequency'], 'term_frequency_wordcloud.png')

# -----------------------------------------------------------------
# 5) Candidate method/algorithm passages: collect pages that mention method-related keywords, for manual pseudocode extraction
# Pages whose text mentions any method-related keyword (case-insensitive
# substring match) are kept as candidates for manual review.
method_keyword_re = re.compile(
    r'algorithm|we propose|we define|procedure|update|posterior|prior|likelihood',
    flags=re.IGNORECASE,
)
ps_sections = []
for s in sections:
    text = s['text'] or ''
    if method_keyword_re.search(text):
        ps_sections.append({'page': s['page'], 'text': text})

# Explicit UTF-8: extracted PDF text may contain non-ASCII characters that a
# platform-default encoding cannot represent.
with open('candidate_method_paragraphs.txt', 'w', encoding='utf-8') as f:
    for p in ps_sections:
        f.write(f"--- PAGE {p['page']} ---\n")
        # Cap each page at 4000 characters to keep the review file small.
        f.write(p['text'][:4000] + '\n\n')

print('Saved candidate_method_paragraphs.txt (for manual review)')

# -----------------------------------------------------------------
# 6) Optional: query OpenAI for term definitions (user must review)
# NOTE: This uses the OpenAI API. You must set OPENAI_API_KEY in the environment or pass it below.

def query_openai_definitions(terms: List[str], api_key: str=None, model: str='gpt-4o-mini') -> Dict[str, str]:
    """Ask the OpenAI chat API for a short definition of each term.

    Works with both openai>=1.0 (client object API) and the legacy
    ``openai.ChatCompletion`` interface of older releases.

    Args:
        terms: Terms to define; one request is sent per term.
        api_key: API key; falls back to the ``OPENAI_API_KEY`` environment
            variable when ``None``.
        model: Chat model name.

    Returns:
        Mapping from term to the raw model reply, or to ``'ERROR: ...'`` when
        the request for that term failed (failures do not stop the loop).

    Raises:
        RuntimeError: if the openai package is missing or no API key is found.
    """
    if openai is None:
        raise RuntimeError('openai package not installed')
    if api_key is None:
        api_key = os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError('OpenAI API key not found. Set OPENAI_API_KEY environment variable or pass api_key.')
    openai.api_key = api_key
    # openai>=1.0 removed openai.ChatCompletion; requests go through a client
    # object instead. Older releases do not expose openai.OpenAI.
    client = openai.OpenAI(api_key=api_key) if hasattr(openai, 'OpenAI') else None
    results = {}
    for term in terms:
        prompt = f"Provide a concise (<= 60 words) technical definition of the term '{term}' as used in machine learning / statistical modeling. Include one-sentence note about typical pitfalls or assumptions. Respond in JSON with keys: term, definition, caveat."
        messages = [{"role":"user","content":prompt}]
        try:
            if client is not None:
                resp = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=200,
                    temperature=0.0,
                )
                out = resp.choices[0].message.content
            else:
                resp = openai.ChatCompletion.create(
                    model=model,
                    messages=messages,
                    max_tokens=200,
                    temperature=0.0,
                )
                out = resp['choices'][0]['message']['content']
            results[term] = out
        except Exception as e:
            # Best-effort: record the failure for this term and continue.
            results[term] = f'ERROR: {e}'
    return results

# Example usage (commented):
# defs = query_openai_definitions([t['term'] for t in term_dict[:20]], api_key='YOUR_API_KEY_HERE')
# with open('llm_term_definitions.json','w') as f:
#     json.dump(defs, f, indent=2)

# -----------------------------------------------------------------
# 7) Small utility: generate a markdown report that summarizes findings and links files
# Lines of the markdown report; joined with newlines when written.
report_md = [
    '# Auto-generated parsing report',
    '\n',
    'Files saved in workspace:',
    '- global_frequency.csv',
    '- term_summary.csv',
    '- term_dictionary.json',
    '- symbol_list.json',
    '- term_page_heatmap.csv',
    '- term_heatmap.png (or fallback CSV/HTML)',
    '- top15_frequency.png (or fallback CSV/HTML)',
    '- term_frequency_wordcloud.png (or fallback CSV)',
]

with open('parsing_report.md','w') as f:
    f.write('\n'.join(report_md))
print('Saved parsing_report.md')

print('All done. Inspect CSV/JSON outputs and review candidate method paragraphs before using LLM to generate definitions.')

# -----------------------------------------------------------------
# NEXT STEPS (manual review + interactive workflow):
# - Open term_summary.csv and term_dictionary.json to review context snippets
# - Edit/curate the list of terms you want definitions for
# - Use query_openai_definitions(...) to request definitions from the ChatGPT API
# - Save the LLM outputs and manually accept/reject/modify definitions before using them in your notes

# End of Colab-ready script (debugged to handle missing matplotlib)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.5/51.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saving 2012_JSLIS_Spring_webir122.pdf to 2012_JSLIS_Spring_webir122.pdf
Using PDF: 2012_JSLIS_Spring_webir122.pdf
Extracted 4 pages
Saved global_frequency.csv
Saved term_dictionary.json and term_summary.csv
Saved symbol_list.json
Saved heatmap as term_heatmap.png using matplotlib
Saved term_page_heatmap.csv
Saved bar chart as top15_frequency.png using matplotlib


  ax.set_xticklabels(labels, rotation=45, ha='right')


Saved wordcloud image to term_frequency_wordcloud.png
Saved candidate_method_paragraphs.txt (for manual review)
Saved parsing_report.md
All done. Inspect CSV/JSON outputs and review candidate method paragraphs before using LLM to generate definitions.
