In [31]:
import os
import json
import openai
# from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv

In [32]:
# Load variables from a local .env file (if one exists) into the process
# environment, so the API key never has to be written into the notebook.
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail fast with an actionable message instead of letting a missing key
# surface later as an authentication error in the middle of the request loop.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env file."
    )

# Hand the key to the openai client library
openai.api_key = OPENAI_API_KEY


In [33]:
# Persona text: who the model should act as. One sentence per literal;
# the sentences are glued together with single spaces.
ROLE = " ".join((
    "You are an expert in statistical analysis of ecological data.",
    "You are an expert in R Statistical Software.",
    "Users provide you with an analysis plan and a data summary.",
    "You need to provide the user with R code implementations.",
))

# Task description: what the model has to produce for a single plan step.
INSTRUCTIONS = " ".join((
    "Provide implementations of a provided step of the data analysis plan.",
    "You need to iteratively write an R code for the provided step.",
    "If provided, consider also instructions that were performed in the previous step to keep the code consistent between the steps.",
    "Use the data summary for reliable implementations of the code.",
    "Think through every step of the analysis, such that the code can be implemented by the user without any significant changes.",
    "Write only the code, you can comment on the code, but do not provide detailed explanations.",
    "Add short explanation what was done in that step.",
))

# Output-format request appended to every prompt.
RESPONSE = "Return the step explanation and code chunk in JSON format"

In [34]:
# Hand-written description of the dataset that is embedded in every prompt
# (build_prompt places it under the "data_summary" key).
#
# Layout:
#   "dataset_path"    - path string of the CSV file the summary refers to.
#   "dataset_summary" - list with one dict per column. Every entry carries
#                       "column_name", "type" and "description"; entries typed
#                       "continuous" add "median"/"min"/"max", entries typed
#                       "categorical" add "unique_values".
#
# NOTE(review): "Latitude.(N)" and "Longitude.(E)" are typed "categorical"
#   (22 unique values each) - presumably one coordinate pair per site; confirm
#   this is intended before the model is asked for spatial analyses.
# NOTE(review): several column names are spelled "bufffer" (three f's) and
#   "Perimeter–area.ratio" contains an en dash rather than an ASCII hyphen -
#   presumably mirroring the CSV header verbatim; verify against the file.
DATA_SUMMARY = {
    "dataset_path": "data/clean_data/parki_dataset_full_encoded.csv",
  "dataset_summary": [
    {
      "column_name": "INDEX_OF_INDIVIDUALS",
      "type": "continuous",
      "median": 2882,
      "min": 1,
      "max": 5763,
      "description": "Unique row identifier. Does not indicate the number of bees recorded per sample. Rows represents individual bees."
    },
    {
      "column_name": "Bee.species",
      "type": "categorical",
      "unique_values": 188,
      "description": "Lists the bee species by name, showcasing high diversity with 188 unique species."
    },
    {
      "column_name": "Species.code",
      "type": "categorical",
      "unique_values": 188,
      "description": "Provides a unique code for each bee species, matching the 188 species identified."
    },
    {
      "column_name": "Sex",
      "type": "categorical",
      "unique_values": 2,
      "description": "Indicates the sex of the bees, typically distinguishing between two categories (e.g., male and female)."
    },
    {
      "column_name": "Site.number",
      "type": "categorical",
      "unique_values": 22,
      "description": "Denotes the study site identifier, with 22 distinct sites included in the research."
    },
    {
      "column_name": "Year",
      "type": "continuous",
      "median": 2019,
      "min": 2018,
      "max": 2019,
      "description": "Records the year of observation, with data collected over 2018 and 2019."
    },
    {
      "column_name": "Month",
      "type": "categorical",
      "unique_values": 5,
      "description": "Specifies the month during which sampling occurred, with five distinct months represented."
    },
    {
      "column_name": "Day",
      "type": "continuous",
      "median": 7,
      "min": 1,
      "max": 28,
      "description": "Indicates the day of the month when sampling took place, ranging from the 1st to the 28th."
    },
    {
      "column_name": "Family",
      "type": "categorical",
      "unique_values": 6,
      "description": "Categorizes bees into six different families based on taxonomic classification."
    },
    {
      "column_name": "Social.behavior",
      "type": "categorical",
      "unique_values": 3,
      "description": "Classifies bees according to their social behavior (e.g., solitary, social, or eusocial)."
    },
    {
      "column_name": "Nesting.place",
      "type": "categorical",
      "unique_values": 4,
      "description": "Describes the nesting locations or preferences of bees, divided into four distinct types."
    },
    {
      "column_name": "Floral.specificity",
      "type": "categorical",
      "unique_values": 3,
      "description": "Indicates the level of floral specialization, with three distinct categories of specificity."
    },
    {
      "column_name": "Flight.beginning.period",
      "type": "categorical",
      "unique_values": 3,
      "description": "Represents the period when bee flight activity begins, grouped into three distinct periods."
    },
    {
      "column_name": "End.of.flight.period",
      "type": "categorical",
      "unique_values": 3,
      "description": "Represents the period when bee flight activity concludes, also divided into three groups."
    },
    {
      "column_name": "Lifespan.[month]",
      "type": "continuous",
      "median": 5,
      "min": 1,
      "max": 8,
      "description": "Estimates the lifespan of bees in months, with a typical value of 5 months."
    },
    {
      "column_name": "Voltinism",
      "type": "categorical",
      "unique_values": 2,
      "description": "Indicates the number of generations per year, such as univoltine versus multivoltine patterns."
    },
    {
      "column_name": "Pollen.carrying-structure",
      "type": "categorical",
      "unique_values": 5,
      "description": "Describes the type of pollen-carrying structure present in bees, divided into five categories."
    },
    {
      "column_name": "Mean.body.size",
      "type": "categorical",
      "unique_values": 3,
      "description": "Classifies bees based on their average body size into three size categories."
    },
    {
      "column_name": "Rarity",
      "type": "categorical",
      "unique_values": 3,
      "description": "Categorizes species based on their occurrence frequency, potentially as common, uncommon, or rare."
    },
    {
      "column_name": "Landscape.type",
      "type": "categorical",
      "unique_values": 2,
      "description": "Denotes the type of landscape (e.g., urban vs. rural) where the bees were observed."
    },
    {
      "column_name": "Latitude.(N)",
      "type": "categorical",
      "unique_values": 22,
      "description": "Specifies the latitude coordinates of the study sites, with 22 unique values indicating different locations."
    },
    {
      "column_name": "Longitude.(E)",
      "type": "categorical",
      "unique_values": 22,
      "description": "Specifies the longitude coordinates of the study sites, with 22 distinct values corresponding to site positions."
    },
    {
      "column_name": "Shortest.distance.between.sites.[m]",
      "type": "continuous",
      "median": 1547.610137,
      "min": 527.7230492,
      "max": 20068.51655,
      "description": "Measures the shortest distance in meters between sites, reflecting the spatial separation in the study."
    },
    {
      "column_name": "Year.of.research",
      "type": "continuous",
      "median": 2019,
      "min": 2018,
      "max": 2019,
      "description": "Denotes the year during which the research was conducted, mirroring the Year variable."
    },
    {
      "column_name": "Coverage.of.bee.food.plant.species.[%]",
      "type": "continuous",
      "median": 40,
      "min": 17,
      "max": 65,
      "description": "Represents the percentage coverage of bee food plant species in the study area, indicating resource availability."
    },
    {
      "column_name": "Floral.richness",
      "type": "continuous",
      "median": 67,
      "min": 32,
      "max": 130,
      "description": "Counts the number of floral species in the area, serving as an index of plant diversity."
    },
    {
      "column_name": "Alien.floral.richness.[%]",
      "type": "continuous",
      "median": 30.3030303,
      "min": 21.95121951,
      "max": 39.70588235,
      "description": "Indicates the percentage of non-native (alien) floral species, reflecting exotic plant influence."
    },
    {
      "column_name": "Native.floral.richness.[%]",
      "type": "continuous",
      "median": 53.07692308,
      "min": 32.22222222,
      "max": 75.75757576,
      "description": "Shows the percentage of native floral species, which is important for assessing ecosystem integrity."
    },
    {
      "column_name": "Spontaneous.floral.richness.[%]",
      "type": "continuous",
      "median": 86.76470588,
      "min": 56.09756098,
      "max": 100,
      "description": "Reflects the percentage of spontaneously occurring floral species, indicating natural regeneration or disturbance."
    },
    {
      "column_name": "Ornamental.floral.richness.[%]",
      "type": "continuous",
      "median": 13.23529412,
      "min": 0,
      "max": 43.90243902,
      "description": "Represents the percentage of ornamental floral species, likely influenced by human landscaping."
    },
    {
      "column_name": "Age.[years]",
      "type": "continuous",
      "median": 119,
      "min": 18,
      "max": 403,
      "description": "Represents the age of the study sites or habitat elements in years, indicating maturity and development stage."
    },
    {
      "column_name": "Area.size.[m2]",
      "type": "continuous",
      "median": 45067.99344,
      "min": 5969.242905,
      "max": 344309.0159,
      "description": "Indicates the area size in square meters of the study sites, relevant for spatial analyses."
    },
    {
      "column_name": "Bare.ground.[%]",
      "type": "continuous",
      "median": 28,
      "min": 5,
      "max": 36,
      "description": "Shows the percentage of bare ground, which can influence habitat structure and resource availability."
    },
    {
      "column_name": "Perimeter–area.ratio",
      "type": "continuous",
      "median": 0.031025927,
      "min": 0.011625223,
      "max": 0.062479583,
      "description": "Measures the shape complexity of sites by comparing perimeter to area; lower values indicate simpler shapes."
    },
    {
      "column_name": "Isolation.[100.m.buffer]",
      "type": "continuous",
      "median": 6,
      "min": 2,
      "max": 12,
      "description": "Represents an isolation metric within a 100m buffer, suggesting how separated a site is from others."
    },
    {
      "column_name": "Distance.to.the.city.centre.[m]",
      "type": "continuous",
      "median": 10756.87412,
      "min": 163.0170365,
      "max": 21571.26415,
      "description": "Indicates the distance in meters from the study site to the city center, relating to urban influence."
    },
    {
      "column_name": "Trees.and.shrubs.in.buffer.250.m.[%]",
      "type": "continuous",
      "median": 4.292972078,
      "min": 0.147242752,
      "max": 39.76559046,
      "description": "Shows the percentage cover of trees and shrubs within a 250m buffer, reflecting local vegetation structure."
    },
    {
      "column_name": "Grasslands.in.buffer.250.m.[%]",
      "type": "continuous",
      "median": 15.32329709,
      "min": 4.706143642,
      "max": 53.44952191,
      "description": "Indicates the percentage of grassland cover within a 250m buffer, contributing to habitat diversity."
    },
    {
      "column_name": "Trees.and.shrubs.in.bufffer.500.m.[%]",
      "type": "continuous",
      "median": 7.008324127,
      "min": 0.573267487,
      "max": 52.4968665,
      "description": "Measures tree and shrub cover within a 500m buffer, providing insight into broader vegetation patterns."
    },
    {
      "column_name": "Grasslands.in.bufffer.500.m.[%]",
      "type": "continuous",
      "median": 15.77397285,
      "min": 4.313846341,
      "max": 46.2621473,
      "description": "Represents grassland cover within a 500m buffer, an important factor in assessing landscape composition."
    },
    {
      "column_name": "Trees.and.shrubs.in.bufffer.750.m.[%]",
      "type": "continuous",
      "median": 7.828027389,
      "min": 0.51212806,
      "max": 61.11066362,
      "description": "Indicates the percentage of trees and shrubs within a 750m buffer, reflecting mid-scale habitat structure."
    },
    {
      "column_name": "Grasslands.in.bufffer.750.m.[%]",
      "type": "continuous",
      "median": 13.12889008,
      "min": 2.359417764,
      "max": 43.0540798,
      "description": "Represents grassland cover within a 750m buffer, helping to define the natural landscape mosaic."
    },
    {
      "column_name": "Trees.and.shrubs.in.bufffer.1000.m.[%]",
      "type": "continuous",
      "median": 7.952987246,
      "min": 1.516748938,
      "max": 60.95802368,
      "description": "Shows the percentage cover of trees and shrubs within a 1000m buffer, highlighting broader vegetation patterns."
    },
    {
      "column_name": "Grasslands.in.bufffer.1000.m.[%]",
      "type": "continuous",
      "median": 12.06115459,
      "min": 2.192880242,
      "max": 38.11344158,
      "description": "Indicates grassland cover within a 1000m buffer, contributing to an understanding of the regional habitat."
    },
    {
      "column_name": "Trees.and.shrubs.in.bufffer.1500.m.[%]",
      "type": "continuous",
      "median": 12.17479278,
      "min": 2.059115632,
      "max": 58.33609893,
      "description": "Reflects tree and shrub cover within a 1500m buffer, offering a view of landscape structure at a larger scale."
    },
    {
      "column_name": "Grasslands.in.bufffer.1500.m.[%]",
      "type": "continuous",
      "median": 13.7588085,
      "min": 2.228607565,
      "max": 40.93716118,
      "description": "Measures grassland cover within a 1500m buffer, highlighting the availability of open habitat over a broad area."
    },
    {
      "column_name": "Landscape.diversity.in.buffer.250.m",
      "type": "continuous",
      "median": 1.086,
      "min": 0.317,
      "max": 1.39,
      "description": "Quantifies the diversity of landscape elements within a 250m buffer, serving as an indicator of habitat complexity."
    },
    {
      "column_name": "Landscape.diversity.in.buffer.500.m",
      "type": "continuous",
      "median": 0.977,
      "min": 0.325,
      "max": 1.28,
      "description": "Represents landscape diversity within a 500m buffer, reflecting the mix of habitat types nearby."
    },
    {
      "column_name": "Landscape.diversity.in.buffer.750.m",
      "type": "continuous",
      "median": 0.854,
      "min": 0.271,
      "max": 1.332,
      "description": "Indicates the diversity of landscape features within a 750m buffer, useful for assessing habitat heterogeneity."
    },
    {
      "column_name": "Landscape.diversity.in.buffer.1000.m",
      "type": "continuous",
      "median": 0.853,
      "min": 0.402,
      "max": 1.348,
      "description": "Measures landscape diversity within a 1000m buffer, providing insight into regional habitat variability."
    },
    {
      "column_name": "Landscape.diversity.in.buffer.1500.m",
      "type": "continuous",
      "median": 0.876,
      "min": 0.499,
      "max": 1.325,
      "description": "Reflects the diversity of landscape elements within a 1500m buffer, indicating larger scale habitat complexity."
    },
    {
      "column_name": "Impervious.surface.area.in.buffer.250.m.[mean]",
      "type": "continuous",
      "median": 23.83140244,
      "min": 0.506483402,
      "max": 81.18436238,
      "description": "Shows the mean percentage of impervious surface area within a 250m buffer, an indicator of urban development."
    },
    {
      "column_name": "Impervious.surface.area.in.buffer.500.m.[mean]",
      "type": "continuous",
      "median": 11.12595028,
      "min": 0.197337713,
      "max": 73.5324587,
      "description": "Represents the average impervious surface area within a 500m buffer, reflecting urban influence at a medium scale."
    },
    {
      "column_name": "Impervious.surface.area.in.buffer.750.m.[mean]",
      "type": "continuous",
      "median": 7.367754967,
      "min": 0.122678988,
      "max": 67.17070936,
      "description": "Indicates the mean impervious surface area within a 750m buffer, useful for assessing urban encroachment."
    },
    {
      "column_name": "Impervious.surface.area.in.buffer.1000.m.[mean]",
      "type": "continuous",
      "median": 5.570115861,
      "min": 0.262156284,
      "max": 64.88584401,
      "description": "Provides the average impervious surface area in a 1000m buffer, representing urban development at a larger scale."
    },
    {
      "column_name": "Impervious.surface.area.in.buffer.1500.m.[mean]",
      "type": "continuous",
      "median": 4.24962184,
      "min": 0.590699089,
      "max": 61.13071545,
      "description": "Represents the mean percentage of impervious surfaces within a 1500m buffer, highlighting low urban influence at this scale."
    },
    {
      "column_name": "Population.density.in.buffer.250.m",
      "type": "continuous",
      "median": 681.8019142,
      "min": 8.523577325,
      "max": 3089.265503,
      "description": "Measures the population density within a 250m buffer, offering insight into local human settlement intensity."
    },
    {
      "column_name": "Population.density.in.buffer.500.m",
      "type": "continuous",
      "median": 1176.052433,
      "min": 27.97789469,
      "max": 8411.295839,
      "description": "Represents population density within a 500m buffer, indicating the degree of urban or suburban influence."
    },
    {
      "column_name": "Population.density.in.buffer.750.m",
      "type": "continuous",
      "median": 1493.333762,
      "min": 61.44022084,
      "max": 16160.05735,
      "description": "Shows the population density within a 750m buffer, reflecting human activity around the sites."
    },
    {
      "column_name": "Population.density.in.buffer.1000.m",
      "type": "continuous",
      "median": 1695.737196,
      "min": 116.4302356,
      "max": 25562.41434,
      "description": "Indicates the population density within a 1000m buffer, useful for assessing regional urban pressures."
    },
    {
      "column_name": "Population.density.in.buffer.1500.m",
      "type": "continuous",
      "median": 2260.509115,
      "min": 314.6977556,
      "max": 51698.95162,
      "description": "Measures the population density within a 1500m buffer, capturing the broader human settlement context."
    }
  ]
}


In [35]:
# Reserved slot for extra prompt fields; currently an empty tuple.
OTHER_PLACEHOLDERS = tuple()

In [36]:
# Ordered analysis plan: one dict per step, processed sequentially by the
# request loop. Every step carries the same five keys, in this order:
#   "step_number", "step_title", "detailed_instructions",
#   "anticipated_issues_and_modifications", "improvement_suggestions".
# Each string literal is kept on a single physical line: a raw line break
# inside a double-quoted literal is a SyntaxError in Python.
analysis_plan_steps = [
    {
        "step_number": 1,
        "step_title": "Data Preparation and Exploration",
        "detailed_instructions": [
            "1.1 Import the dataset named 'parki_dataset_full' that contains all variables listed in the data summary.",
            "1.2 Remove any potential duplicate rows identified by 'INDEX_OF_INDIVIDUALS' or inconsistent entries. Inspect for missing values in critical columns such as 'Bee.species', 'Landscape.type', 'Site.number', and the key local/landscape variables (e.g., 'Coverage.of.bee.food.plant.species.[%]', 'Impervious.surface.area.in.buffer.500.m.[mean]', etc.).",
            "1.3 If missing data is found, consider data imputation (e.g., mean substitution for continuous variables, or removing rows if missing data is not recoverable). Ensure decisions are documented.",
            "1.4 Convert all categorical variables to factors (e.g., 'Bee.species', 'Sex', 'Landscape.type', 'Site.number') so they can be used in later models.",
        ],
        "anticipated_issues_and_modifications": [
            "Data might have missing or erroneous values for numeric columns such as 'Coverage.of.bee.food.plant.species.[%]'. Consider removing or imputing these rows if they are critical to the analysis.",
            "Categorical variables like 'Bee.species' may have spelling inconsistencies. Standardize their spelling or codes.",
        ],
        "improvement_suggestions": [
            "Before proceeding, run basic summary statistics on each column (e.g., median, range) to confirm alignment with the dataset_summary and detect anomalies early.",
        ],
    },
    {
        "step_number": 2,
        "step_title": "Summaries of Bee Abundance and Species Richness",
        "detailed_instructions": [
            "2.1 Aggregate data by 'Site.number' to get total abundance (count of 'INDEX_OF_INDIVIDUALS') and species richness (count of unique 'Bee.species') at each site.",
            "2.2 Create two new variables for each site: 'Total.Abundance' and 'Observed.Richness'. Store these in a new summarized dataset (e.g., 'site_level_data').",
            "2.3 Merge this summarized dataset back with local/landscape predictors (e.g., 'Coverage.of.bee.food.plant.species.[%]', 'Floral.richness', 'Impervious.surface.area.in.buffer.500.m.[mean]', 'Population.density.in.buffer.500.m', 'Area.size.[m2]', 'Isolation.[100.m.buffer]', 'Landscape.diversity.in.buffer.500.m', etc.), ensuring each row corresponds to a unique site.",
        ],
        "anticipated_issues_and_modifications": [
            "Ensure each site in 'site_level_data' matches exactly with local and landscape variables (potential site numbering mismatches).",
            "If a site has very few observations (e.g., n < 5), consider whether to remove or carefully interpret it in further modeling.",
        ],
        "improvement_suggestions": [
            "Inspect outlier sites (extremely high or low abundance/richness). Decide whether these outliers should be retained or examined separately.",
        ],
    },
    {
        "step_number": 3,
        "step_title": "Species Accumulation and Sampling Effort Curves",
        "detailed_instructions": [
            "3.1 Use the package iNEXT or a similar function to calculate species accumulation curves across pooled samples. The key variables are 'INDEX_OF_INDIVIDUALS' and 'Bee.species'.",
            "3.2 Plot the accumulation curves to check if sampling effort is sufficient, comparing across 'Landscape.type' (urban vs. rural).",
            "3.3 Determine if further subsampling or rarefaction is required. If so, apply rarefaction to standardize sampling effort across sites.",
        ],
        "anticipated_issues_and_modifications": [
            "If curves do not approach an asymptote, consider additional sampling or interpret results with caution.",
            "Data for certain 'Site.number' groups may be under-sampled; rarefy to the lowest sampling intensity if appropriate.",
        ],
        "improvement_suggestions": [
            "Use 84% confidence intervals (following MacGregor-Fors and Payton) to check significant differences between curves, adjusting if a different confidence level is appropriate for your study design.",
        ],
    },
    {
        "step_number": 4,
        "step_title": "Check for Spatial Autocorrelation",
        "detailed_instructions": [
            "4.1 Extract the site coordinates using 'Latitude.(N)' and 'Longitude.(E)' for each 'Site.number'.",
            "4.2 For total abundance ('Total.Abundance') and observed richness ('Observed.Richness'), apply a log transformation (if necessary) to normalize the data.",
            "4.3 Use Moran’s I test (e.g., in the 'ape' or 'spdep' package) to detect spatial autocorrelation among the sites for these response variables.",
        ],
        "anticipated_issues_and_modifications": [
            "If Moran’s I is significant, consider adding spatial random effects or including coordinates in a spatial model (e.g., using a spatial correlation structure).",
            "If site coordinates are coarse or have rounding errors, this may reduce the reliability of Moran’s I results.",
        ],
        "improvement_suggestions": [
            "If spatial autocorrelation is detected, use geostatistical approaches (e.g., autocovariate terms) or spatially explicit mixed models to account for non-independence.",
        ],
    },
    {
        "step_number": 5,
        "step_title": "Modeling Abundance and Species Richness",
        "detailed_instructions": [
            "5.1 Fit two generalized linear mixed models (GLMMs) using the 'lme4' or 'glmmTMB' package: one for 'Total.Abundance' (possibly negative binomial) and one for 'Observed.Richness' (possibly Poisson).",
            "5.2 Include 'Landscape.type' (urban vs. rural) as a fixed predictor to test the hypothesis that rural areas have higher bee abundance/richness.",
            "5.3 Add critical local covariates such as 'Coverage.of.bee.food.plant.species.[%]', 'Floral.richness', 'Area.size.[m2]', 'Isolation.[100.m.buffer]', and 'Bare.ground.[%]'.",
            "5.4 Add selected landscape covariates to capture urbanization effects, e.g., 'Impervious.surface.area.in.buffer.500.m.[mean]' and 'Population.density.in.buffer.500.m'.",
            "5.5 Specify random intercepts for 'Site.number' if multiple measurements per site exist, and random intercept for 'Month' or 'Year' if seasonality/year effects are relevant.",
            "5.6 Compare models with and without each local or landscape predictor to see the effect on 'Landscape.type' significance and to detect the strongest predictors. Use AIC or BIC for model comparison.",
        ],
        "anticipated_issues_and_modifications": [
            "If overdispersion is detected in Poisson models for richness, switch to a negative binomial distribution or add an observation-level random effect.",
            "Check multicollinearity (Variance Inflation Factor) among covariates. If VIF > 5, remove or transform correlated variables.",
        ],
        "improvement_suggestions": [
            "Standardize continuous predictors (e.g., scale and center) to facilitate model convergence and interpretability.",
            "Visualize residuals vs. fitted values and partial residual plots to confirm model fit.",
        ],
    },
    {
        "step_number": 6,
        "step_title": "Beta Diversity and Partitioning of Diversity",
        "detailed_instructions": [
            "6.1 Calculate site-level bee community composition using the presence-absence or abundance data (i.e., number of individuals per 'Bee.species').",
            "6.2 Use the 'betapart' or 'vegan' package to decompose beta diversity into turnover and nestedness components across 'Landscape.type'.",
            "6.3 Compare alpha and beta components among urban vs. rural sites to see their relative contributions to overall gamma diversity. Summarize how much of the difference is due to species turnover vs. nestedness.",
        ],
        "anticipated_issues_and_modifications": [
            "Large differences in sample sizes between sites can bias beta diversity measures. Consider rarefying or weighting by sampling effort if needed.",
            "If certain sites have zero or very few individuals, they might artificially inflate nestedness components.",
        ],
        "improvement_suggestions": [
            "Use visualizations (e.g., NMDS or PCoA ordinations) to detect clustering patterns by 'Landscape.type'.",
            "Partitioning alpha, beta, and gamma diversity helps clarify whether management efforts should focus on local site enhancement vs. landscape connectivity.",
        ],
    },
    {
        "step_number": 7,
        "step_title": "Redundancy Analysis (RDA)",
        "detailed_instructions": [
            "7.1 Create a site-by-species matrix from 'Bee.species' counts or Hellinger-transformed abundances per site. The rows correspond to 'Site.number' and the columns to each unique 'Bee.species'.",
            "7.2 Scale and center the environmental predictors: 'Coverage.of.bee.food.plant.species.[%]', 'Area.size.[m2]', 'Isolation.[100.m.buffer]', 'Bare.ground.[%]', 'Impervious.surface.area.in.buffer.500.m.[mean]', 'Population.density.in.buffer.500.m', 'Landscape.diversity.in.buffer.500.m', and 'Landscape.type' coded as numeric/factor where appropriate.",
            "7.3 Perform RDA using the 'vegan' package, with the species matrix as the response and the scaled predictors as explanatory variables.",
            "7.4 Run a permutation test (e.g., 'anova.cca') to check significance of the model, each axis, and individual predictors.",
        ],
        "anticipated_issues_and_modifications": [
            "High dimensionality with many bee species can complicate RDA interpretation. Consider focusing on the most abundant species or use dimensionality-reduction first.",
            "If 'Landscape.type' is a factor, incorporate it carefully in RDA (e.g., dummy variables) or test it separately in partial RDA.",
        ],
        "improvement_suggestions": [
            "Check for outliers in the site-by-species matrix. Strong outliers can drive RDA axes and distort results.",
            "Plot the RDA biplot to visualize how local and landscape variables relate to major gradients in bee communities.",
        ],
    },
    {
        "step_number": 8,
        "step_title": "Trait-based Analyses (Fourth-Corner or RLQ)",
        "detailed_instructions": [
            "8.1 Create a matrix of species-level traits from columns such as 'Social.behavior', 'Nesting.place', 'Floral.specificity', 'Mean.body.size', 'Voltinism', and 'Pollen.carrying-structure'.",
            "8.2 Aggregate the bee occurrence data at the site level (no monthly division), producing a site-by-species abundance matrix as in Step 7.",
            "8.3 Use the 'traitglm' function (mvabund package) or a fourth-corner approach to relate species traits to environmental predictors (e.g., 'Impervious.surface.area.in.buffer.500.m.[mean]', 'Population.density.in.buffer.500.m', 'Coverage.of.bee.food.plant.species.[%]', etc.). Apply a negative binomial distribution for abundance if necessary.",
            "8.4 Use resampling/permutation tests to check the significance of trait-environment relationships (e.g., specialized traits vs. generalist traits in urban vs. rural sites).",
        ],
        "anticipated_issues_and_modifications": [
            "A large number of traits or correlated traits can reduce interpretability. Consider grouping traits or removing highly correlated ones.",
            "Sparse species-by-site matrices may cause convergence issues in the model; reduce dimension or consider penalized likelihood methods.",
        ],
        "improvement_suggestions": [
            "Apply LASSO or shrinkage methods to handle high dimensional data and to simplify trait-environment relationships.",
            "Include only traits with a clear ecological interpretation relevant to urban/rural gradients (e.g., 'Mean.body.size' may help test the hypothesis of smaller bees dominating urban landscapes).",
        ],
    },
    {
        "step_number": 9,
        "step_title": "Functional Diversity Indices",
        "detailed_instructions": [
            "9.1 Compute site-level functional diversity indices (FEve, FDis, RaoQ, FDiv) using the 'dbFD' function in the FD package. Use the trait matrix (Step 8.1) and the site-by-species abundance matrix (Step 8.2).",
            "9.2 Combine these functional metrics back into 'site_level_data' by matching on 'Site.number'.",
            "9.3 Fit GLMMs or generalized linear models (e.g., beta regression via 'glmmTMB') for each functional index as a response variable, with 'Landscape.type' plus local/landscape predictors (e.g., 'Impervious.surface.area.in.buffer.500.m.[mean]', 'Coverage.of.bee.food.plant.species.[%]') as fixed effects.",
            "9.4 Evaluate whether functional diversity differs significantly between urban and rural sites and identify which environmental factors drive differences.",
        ],
        "anticipated_issues_and_modifications": [
            "Functional indices bounded between 0 and 1 (e.g., FEve, FDiv) may require a beta distribution with a logit link function.",
            "If a site has extremely low species count, the functional diversity metrics might be unstable.",
        ],
        "improvement_suggestions": [
            "Plot each functional diversity measure against selected predictors to identify non-linear relationships. If strong non-linearity is detected, consider GAMs or polynomial terms.",
        ],
    },
    {
        "step_number": 10,
        "step_title": "Interpretation and Final Checks",
        "detailed_instructions": [
            "10.1 Synthesize results from steps 5 through 9 to address the main hypotheses: (a) Are rural sites higher in abundance/richness than urban? (b) Are bee traits (smaller body size, polylectic, social) more prevalent in urban sites? (c) Does impervious surface area or population density negatively correlate with bee diversity? (d) Is beta diversity driven more by turnover or nestedness between urban and rural parks?",
            "10.2 Provide a cohesive discussion on whether local factors (e.g., 'Coverage.of.bee.food.plant.species.[%]') or landscape factors (e.g., 'Impervious.surface.area.in.buffer.500.m.[mean]', 'Population.density.in.buffer.500.m') have stronger effects.",
            "10.3 Summarize limitations encountered (e.g., missing data, small sample size, or potential spatial autocorrelation) and propose future data collection or methodological improvements.",
        ],
        "anticipated_issues_and_modifications": [
            "Inconsistent or ambiguous trait definitions might limit the clarity of trait-environment relationships. Revisit the dataset if necessary.",
            "If results contradict expectations, check model assumptions (overdispersion, zero-inflation, linearity).",
        ],
        "improvement_suggestions": [
            "Conduct sensitivity analyses, such as excluding certain sites or species groups, to verify the robustness of conclusions.",
            "Combine these findings with ecological knowledge to propose targeted conservation or management strategies for urban and rural parks.",
        ],
    },
]

In [37]:
# Two independent, initially empty accumulators:
#   step_results       - one entry per processed step
#   previous_responses - raw reply texts handed to build_prompt
step_results, previous_responses = [], []

In [38]:
def build_prompt(step: dict, previous_responses: list[str]) -> str:
    """Assemble the prompt for one analysis-plan step as a JSON string.

    The payload combines the module-level prompt constants (ROLE,
    INSTRUCTIONS, RESPONSE, DATA_SUMMARY, OTHER_PLACEHOLDERS) with the
    step-specific inputs.

    Args:
        step: The analysis-plan step to implement; embedded unchanged under
            "analysis_plan_step".
        previous_responses: Replies received for earlier steps; embedded as a
            JSON array under "previous_responses".

    Returns:
        The payload serialized with ``json.dumps(..., indent=2)``.

    Raises:
        TypeError: If ``step`` or ``previous_responses`` contains a value that
            ``json.dumps`` cannot serialize.
    """
    prompt_dict = {
        "role": ROLE,
        "instructions": INSTRUCTIONS,
        "analysis_plan_step": step,
        "response": RESPONSE,
        "data_summary": DATA_SUMMARY,
        # Passed as a list, so it is serialized as a JSON array of strings.
        "previous_responses": previous_responses,
        "other_placeholders": OTHER_PLACEHOLDERS,
    }

    return json.dumps(prompt_dict, indent=2)

In [39]:
# Prompt for the first plan step, built from whatever previous_responses
# currently holds.
initial_prompt = build_prompt(
    step=analysis_plan_steps[0],
    previous_responses=previous_responses,
)

In [40]:
def _parse_step_response(response_text: str):
    """Decode a model reply as JSON, tolerating a surrounding Markdown code fence.

    Args:
        response_text: Raw text content of the model reply.

    Returns:
        The decoded JSON value, or ``{"raw_response": response_text}`` when
        the reply is not valid JSON even after removing a code fence.
    """
    candidate = response_text.strip()
    if candidate.startswith("```"):
        # Drop the opening fence line (``` or ```json) and, if present,
        # the closing fence, so only the payload is handed to the parser.
        fenced_lines = candidate.splitlines()[1:]
        if fenced_lines and fenced_lines[-1].strip() == "```":
            fenced_lines = fenced_lines[:-1]
        candidate = "\n".join(fenced_lines)

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Keep the untouched reply so nothing is lost for manual inspection.
        return {"raw_response": response_text}


# Start from empty accumulators so re-running this cell does not append a
# second copy of every result (and feed stale replies into the prompts).
step_results.clear()
previous_responses.clear()

for step in analysis_plan_steps:
    print(step)

    # Each prompt carries every reply collected so far.
    prompt = build_prompt(step, previous_responses)

    # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
    # openai package - confirm the installed version before upgrading.
    response = openai.ChatCompletion.create(
        model="o1-mini",
        messages=[{"role": "user", "content": prompt}],
    )

    response_text = response['choices'][0]['message']['content']

    step_results.append(_parse_step_response(response_text))
    previous_responses.append(response_text)

    print(response_text)

# Print final results
for idx, result in enumerate(step_results, start=1):
    print(f"\n--- Step {idx} Result ---")
    print(result)


{'step_number': 1, 'step_title': 'Data Preparation and Exploration', 'detailed_instructions': ["1.1 Import the dataset named 'parki_dataset_full' that contains all variables listed in the data summary.", "1.2 Remove any potential duplicate rows identified by 'INDEX_OF_INDIVIDUALS' or inconsistent entries. Inspect for missing values in critical columns such as 'Bee.species', 'Landscape.type', 'Site.number', and the key local/landscape variables (e.g., 'Coverage.of.bee.food.plant.species.[%]', 'Impervious.surface.area.in.buffer.500.m.[mean]', etc.).", '1.3 If missing data is found, consider data imputation (e.g., mean substitution for continuous variables, or removing rows if missing data is not recoverable). Ensure decisions are documented.', "1.4 Convert all categorical variables to factors (e.g., 'Bee.species', 'Sex', 'Landscape.type', 'Site.number') so they can be used in later models."], 'anticipated_issues_and_modifications': ["Data might have missing or erroneous values for numeri