In [1]:
import os
import openai
import pandas as pd
import json
from io import StringIO
from langchain_openai import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
# If you want to chunk PDF text, you can also import TextSplitter utilities:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
OPENAI_API_KEY= os.getenv('OPENAI_API_KEY')
# In code, you might do:
# openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all text from a PDF file using PyPDF2.
    """
    text_content = []
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text_content.append(page.extract_text())
    return "\n".join(text_content)


def summarize_excel_folder_to_json(folder_path):
    """
    Reads all Excel (.xlsx) files in the specified folder and creates a JSON summary
    that includes each file and its sheets. For each sheet, the summary for every column includes:
      1. The variable (column) name.
      2. The inferred variable type (numeric, datetime, categorical).
      3. For numeric columns: standard summary statistics.
      4. For categorical columns: unique values and their counts.
      5. For datetime columns: the time span (min and max dates).

    The final JSON structure is organized with file names as top-level keys, and each file contains
    a dictionary of its sheet names and their summaries.

    Parameters:
        folder_path (str): The path to the folder containing Excel files.

    Returns:
        str: A JSON string summarizing all the Excel files and their sheets.
    """
    folder_summary = {}

    # Loop through each file in the provided folder
    for file_name in os.listdir(folder_path):
        # Only process files with a .xlsx extension
        if file_name.endswith('.xlsx'):
            file_path = os.path.join(folder_path, file_name)
            try:
                xls = pd.ExcelFile(file_path)
            except Exception as e:
                # If there is an error reading the file, record the error and continue
                folder_summary[file_name] = {"error": f"Failed to read file: {e}"}
                continue

            file_summary = {}
            # Process each sheet in the workbook
            for sheet in xls.sheet_names:
                try:
                    df = pd.read_excel(xls, sheet_name=sheet)
                except Exception as e:
                    file_summary[sheet] = {"error": f"Failed to read sheet: {e}"}
                    continue

                sheet_summary = []
                # Process each column in the sheet
                for col in df.columns:
                    column_summary = {"variable_name": str(col)}
                    series = df[col]

                    # Determine the type of the column: numeric, datetime, or categorical
                    if pd.api.types.is_numeric_dtype(series):
                        column_summary["variable_type"] = "numeric"
                        stats = series.describe()
                        column_summary["summary_statistics"] = {
                            "count": stats.get("count"),
                            "mean": stats.get("mean"),
                            "std": stats.get("std"),
                            "min": stats.get("min"),
                            "25%": stats.get("25%"),
                            "50%": stats.get("50%"),
                            "75%": stats.get("75%"),
                            "max": stats.get("max")
                        }
                    elif pd.api.types.is_datetime64_any_dtype(series):
                        column_summary["variable_type"] = "datetime"
                        times = series.dropna()
                        if not times.empty:
                            column_summary["time_span"] = {
                                "start": str(times.min()),
                                "end": str(times.max())
                            }
                        else:
                            column_summary["time_span"] = {"start": None, "end": None}
                    else:
                        column_summary["variable_type"] = "categorical"
                        uniques = series.value_counts(dropna=False)
                        unique_values = []
                        for value, count in uniques.items():
                            # Represent NaN values as the string "NaN"
                            value_str = "NaN" if pd.isna(value) else str(value)
                            unique_values.append({"value": value_str, "count": int(count)})
                        column_summary["unique_values"] = unique_values

                    sheet_summary.append(column_summary)
                file_summary[sheet] = sheet_summary

            folder_summary[file_name] = file_summary

    # Convert the complete folder summary into a formatted JSON string
    json_str = json.dumps(folder_summary, indent=2)
    return json_str


In [11]:
xlsx_path = "../data/poc_data_and_similar_paper"
dataset_summary = summarize_excel_folder_to_json(xlsx_path)

In [12]:
from pprint import pprint
pprint(dataset_summary)

('{\n'
 '  "fourth_corner_functional_diversity.xlsx": {\n'
 '    "wild_bee_traits": [\n'
 '      {\n'
 '        "variable_name": "Ecological traits of wild bees",\n'
 '        "variable_type": "categorical",\n'
 '        "unique_values": [\n'
 '          {\n'
 '            "value": "NaN",\n'
 '            "count": 1\n'
 '          },\n'
 '          {\n'
 '            "value": "las_qua",\n'
 '            "count": 1\n'
 '          },\n'
 '          {\n'
 '            "value": "las_maj",\n'
 '            "count": 1\n'
 '          },\n'
 '          {\n'
 '            "value": "las_mal",\n'
 '            "count": 1\n'
 '          },\n'
 '          {\n'
 '            "value": "las_min",\n'
 '            "count": 1\n'
 '          },\n'
 '          {\n'
 '            "value": "las_mor",\n'
 '            "count": 1\n'
 '          },\n'
 '          {\n'
 '            "value": "las_nit",\n'
 '            "count": 1\n'
 '          },\n'
 '          {\n'
 '            "value": "las_nid",\n'
 '     

In [9]:
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

tokens = encoding.encode(dataset_summary)
print(len(tokens))

# To decode tokens back to text:
decoded_text = encoding.decode(tokens)

205128
