In [1]:
import os
import openai
import pandas as pd
from io import StringIO
from langchain_openai import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
# If you want to chunk PDF text, you can also import TextSplitter utilities:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
from dotenv import load_dotenv
import os

## 1. Configure OpenAI Keys

In [2]:
# Load variables from a local .env file (if any) into the process environment,
# then read the OpenAI key from it. The value is None when the variable is unset.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# When using the `openai` client directly, the key can be set like this:
# openai.api_key = os.getenv("OPENAI_API_KEY")

## 2. Helper Functions

In [4]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read a PDF with PyPDF2 and return the text of all its pages.

    Parameters:
        pdf_path (str): Path to the PDF file on disk.

    Returns:
        str: The extracted text of every page, in page order, joined by newlines.
    """
    with open(pdf_path, 'rb') as pdf_file:
        # Extraction happens while the file handle is still open because the
        # reader pulls page data from it.
        pages = PyPDF2.PdfReader(pdf_file).pages
        page_texts = [page.extract_text() for page in pages]
    return "\n".join(page_texts)

In [5]:
import pandas as pd
import json

def _json_safe_number(value):
    """Return `value` as a built-in float, or None when it is missing/NaN."""
    # NaN would be written by json.dumps as a bare `NaN` token (not valid JSON),
    # and numpy scalars are not always serializable, so normalize both here.
    if pd.isna(value):
        return None
    return float(value)


def _summarize_column(name, series):
    """Build the summary dictionary for a single column of a sheet."""
    column_summary = {"variable_name": str(name)}

    # Boolean columns satisfy is_numeric_dtype, but describe() reports
    # count/unique/top/freq for them instead of mean/std/quantiles, so they are
    # summarized as categorical (value counts) rather than numeric.
    is_bool = pd.api.types.is_bool_dtype(series)

    if pd.api.types.is_numeric_dtype(series) and not is_bool:
        column_summary["variable_type"] = "numeric"
        stats = series.describe()
        column_summary["summary_statistics"] = {
            key: _json_safe_number(stats.get(key))
            for key in ("count", "mean", "std", "min", "25%", "50%", "75%", "max")
        }
    elif pd.api.types.is_datetime64_any_dtype(series):
        column_summary["variable_type"] = "datetime"
        # Time span covered by the non-missing timestamps.
        times = series.dropna()
        if not times.empty:
            column_summary["time_span"] = {
                "start": str(times.min()),
                "end": str(times.max())
            }
        else:
            column_summary["time_span"] = {"start": None, "end": None}
    else:
        column_summary["variable_type"] = "categorical"
        # Count every distinct value; missing values are kept and reported
        # under the string "NaN".
        counts = series.value_counts(dropna=False)
        column_summary["unique_values"] = [
            {"value": "NaN" if pd.isna(value) else str(value), "count": int(count)}
            for value, count in counts.items()
        ]

    return column_summary


def summarize_excel_to_json(file_path):
    """
    Reads an Excel (.xlsx) file and creates a JSON summary of each sheet.
    For each column it provides:
      1. The variable (column) name.
      2. The inferred variable type (numeric, datetime, categorical).
      3. For numeric columns: standard summary statistics
         (count, mean, std, min, quartiles, max). Statistics that cannot be
         computed (e.g. std of a single value) are reported as null.
      4. For categorical columns (including boolean columns): unique values
         and their counts, with missing values reported as "NaN".
      5. For datetime columns: the time span (min and max dates).

    Parameters:
        file_path (str): The path to the Excel file.

    Returns:
        str: A JSON string mapping each sheet name to a list of column summaries.
    """
    workbook_summary = {}

    # The context manager closes the workbook handle even if a sheet fails to parse.
    with pd.ExcelFile(file_path) as xls:
        for sheet in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet)
            workbook_summary[sheet] = [
                _summarize_column(col, df[col]) for col in df.columns
            ]

    # Convert the dictionary to a formatted JSON string
    return json.dumps(workbook_summary, indent=2)

In [7]:
# Relative path of the Excel workbook that accompanies the paper; it is the
# input to summarize_excel_to_json.
xlsx_path = "../data/poc_example_data/lazaro_et_al_2021.xlsx"

In [24]:
# JSON string describing every sheet and column of the workbook; it is later
# interpolated into the planner prompt as {dataset_summary}.
dataset_summary = summarize_excel_to_json(xlsx_path)

## 3. LangChain LLM Setup

In [10]:
# Model used to extract the methodology summary from the paper text;
# temperature 0 keeps the extraction as repeatable as possible.
summarization_llm = ChatOpenAI(
    temperature=0.0,
    model_name="gpt-4o"
)

# Model used to turn the methodology and dataset summaries into an analysis plan.
# o1-series models only accept the default temperature of 1.
planner_llm = ChatOpenAI(
    temperature=1,
    model_name="o1-mini"
)

# Model used to turn the plan into R scripts. It is the same o1-mini model as
# the planner, so it must also use temperature 1: any other value is rejected
# by the API for o1-series models.
executor_llm = ChatOpenAI(
    temperature=1,
    model_name="o1-mini"
)

In [14]:
# Prompt for the summarization model: extract hypotheses, collected variables
# and statistical methods from the paper text passed in as {full_text}.
methods_summary_prompt = PromptTemplate(
    template="\n".join(
        [
            "You are a model specialized in ecological data analysis.",
            "Read the following text delimited by triple backticks and list all information on the dataset and statistical analyses used.",
            "Focus ONLY on:",
            "- Extracting all of the tested hypotheses.",
            "- Listing all variables that were collected.",
            "- Providing example values for categorical variables, and ranges for continuous variables (if available).",
            "- Listing all statistical methods with every detail of the performed analysis (specifically which functions, settings, and variables were used).",
            "Return only the report.",
            # The paper text is wrapped in a triple-backtick fence.
            "```\n{full_text}\n```",
        ]
    ),
    input_variables=["full_text"],
)


# Prompt for the planner model: combine the methodology summary and the dataset
# summary into a step-by-step analysis plan returned as XML.
planner_prompt = PromptTemplate(
    input_variables=["methodology_summary", "dataset_summary"],
    template=(
        "You are a planning model. Based on the methodology summary and dataset summary, "
        "plan a step-by-step routine (as programmatic pseudocode or structured steps) "
        "to execute the identified statistical analyses on the full dataset.\n"
        # Spelling corrected ("indyvidual" -> "individual"): this text is sent to the model verbatim.
        "Think how each analysis helps to answer individual hypotheses.\n\n"
        "Methodology Summary:\n{methodology_summary}\n\n"
        "Dataset Summary:\n{dataset_summary}\n\n"
        "Provide your plan in XML format, with each <Step> containing a structured explanation "
        "of how to implement it programmatically, and which hypothesis it addresses."
    )
)


# Prompt for the executor model: turn the XML analysis plan into R scripts,
# using the dataset summary for variable names and type checks.
executor_prompt = PromptTemplate(
    # Both placeholders used in the template must be declared here; the
    # template references {dataset_summary} as well as {analysis_plan_xml}.
    input_variables=["analysis_plan_xml", "dataset_summary"],
    template=(
        "You are an executor model specialized in generating R scripts for each step. "
        "Given the plan in XML, do the following:\n"
        "1. Generate separate R scripts for each analysis step.\n"
        "1.1. Use variable names from the data summary.\n"
        "1.2. Check whether an analysis is suitable for the variable types.\n"
        # Spelling corrected ("alternativees" -> "alternative"): this text is sent to the model verbatim.
        "1.3. Provide suggestions in comments on alternative analyses.\n"
        "2. Generate a single master R script that runs them all in a structured manner.\n"
        "Output your results clearly, indicating how the scripts should be saved.\n\n"
        "Plan XML:\n{analysis_plan_xml}.\n"
        "Dataset summary:\n{dataset_summary}."
    )
)

In [15]:
# --- Step 1: Read PDF and extract methodology ---
# Relative path of the paper's PDF; its full text becomes the {full_text}
# input of the methods-summary prompt.
pdf_path = "../data/poc_example_data/lazaro_et_al_2021_accessible.pdf"
full_text = extract_text_from_pdf(pdf_path)

In [18]:
# Pipe the methods-summary prompt into the summarization model and run it on
# the text extracted from the PDF.
chain = methods_summary_prompt | summarization_llm
methods_summary_result = chain.invoke({"full_text": full_text})

In [22]:
# Plan statistical analyses based on the methodology summary and the dataset summary
planner_chain = planner_prompt | planner_llm
plan_analyses_xml = planner_chain.invoke({
    # The summarization chain returns a chat message object. Pass only its
    # text: interpolating the object itself would put its full repr
    # (including metadata) into the prompt.
    "methodology_summary": methods_summary_result.content,
    "dataset_summary": dataset_summary
})

In [23]:
# Show the planner's raw response (a chat message whose content holds the XML plan).
plan_analyses_xml

AIMessage(content='```xml\n<AnalysisPlan>\n    <Step number="1">\n        <Description>Load Required Libraries</Description>\n        <Details>\n            <Instruction>Ensure all necessary R packages are installed and loaded. This includes packages for statistical modeling, data manipulation, and network analysis.</Instruction>\n            <Code>\n                <![CDATA[\n                # Install packages if not already installed\n                required_packages <- c("lme4", "MuMIn", "car", "r2glmm", "piecewiseSEM", \n                                       "bipartite", "dplyr", "tidyr", "emmeans", "corrr")\n                installed_packages <- rownames(installed.packages())\n                for(pkg in required_packages){\n                    if(!pkg %in% installed_packages){\n                        install.packages(pkg)\n                    }\n                }\n\n                # Load libraries\n                library(lme4)\n                library(MuMIn)\n                

In [None]:
# Generate the R scripts for the plan: pipe the executor prompt into the
# executor model (not the planner prompt/model) and invoke that chain.
executor_chain = executor_prompt | executor_llm
executed_plan_steps = executor_chain.invoke({
    # The planner returns a chat message object; only its text (the XML plan)
    # belongs in the prompt.
    "analysis_plan_xml": plan_analyses_xml.content,
    "dataset_summary": dataset_summary
})