Extract analysis pipline from doc strings in the R script file.

In [2]:
# Python script to extract comments (lines starting with '#') from an R script
def extract_comments(input_path, output_path):
    try:
        with open(input_path, "r") as infile, open(output_path, "w") as outfile:
            for line in infile:
                stripped_line = line.lstrip()
                if stripped_line.startswith("#"):  # Identify comment lines
                    outfile.write(line)  # Write comment to output file
        print(f"Extraction successful! Comments saved to {output_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Specify the input R script and output text file
input_path = "Src/urban_wealth_scale.R"  # Replace with the actual path to your R script
output_path = "Src/code_docs_outline.txt"

# Run the extraction
extract_comments(input_path, output_path)

Extraction successful! Comments saved to Src/code_docs_outline.txt


Get all the output filenames for processing.

In [None]:
import os
import csv

# Define the directory and output CSV filename
output_folder = "../Output/"
csv_filename = "output_filenames.csv"

# Get all filenames in the Output folder
file_list = os.listdir(output_folder)

# Write filenames to a CSV file
with open(csv_filename, mode='w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header
    csv_writer.writerow(["Filename"])
    # Write each filename as a row
    for filename in file_list:
        csv_writer.writerow([filename])

print(f"Filenames from '{output_folder}' written to '{csv_filename}'")


Filenames from 'Output/' written to 'Output/output_filenames.csv'


In [8]:
import pandas as pd
import os

# Load the pre-sorted CSV file
file_path = "sorted_filenames_metadata.csv"  # This should be in the Supplement folder
df = pd.read_csv(file_path)

# Function to save individual tables as Markdown
def save_table(df_subset, filename):
    with open(filename, "w") as f:
        # Write the table header
        f.write("| Context | Script Section | Filename |\n")
        f.write("|---------|----------------|----------|\n")
        # Write each row as a markdown table row
        for _, row in df_subset.iterrows():
            f.write(f"| {row['Context']} | {row['Script Section']} | {row['Filename']} |\n")

# Split the data into four tables based on 'Analysis' and 'File Type'
output_dir = "./"  # Ensure this directory exists
os.makedirs(output_dir, exist_ok=True)

main_plot = df[(df['Analysis'] == 'Main') & (df['Type'] == 'Plot')]
main_numeric = df[(df['Analysis'] == 'Main') & (df['Type'] == 'Numeric')]
supplemental_plot = df[(df['Analysis'] == 'Supplemental') & (df['Type'] == 'Plot')]
supplemental_numeric = df[(df['Analysis'] == 'Supplemental') & (df['Type'] == 'Numeric')]

if os.path.exists(os.path.join(output_dir, "main_plot.md")):
    os.remove(os.path.join(output_dir, "main_plot.md"))
    os.remove(os.path.join(output_dir, "main_numeric.md"))
    os.remove(os.path.join(output_dir, "supplemental_plot.md"))
    os.remove(os.path.join(output_dir, "supplemental_numeric.md"))

# Save each subset as a Markdown table
save_table(main_plot, os.path.join(output_dir, "main_plot.md"))
save_table(main_numeric, os.path.join(output_dir, "main_numeric.md"))
save_table(supplemental_plot, os.path.join(output_dir, "supplemental_plot.md"))
save_table(supplemental_numeric, os.path.join(output_dir, "supplemental_numeric.md"))

print("Markdown tables created successfully.")

Markdown tables created successfully.


Everything that follows happens within the Suppelement subfolder of the primary project Repo.

In [1]:
import pandas as pd
import os

# Load the CSV file containing the filenames
file_path = "output_table_file_map.csv"  # Update this path to your actual file location
df = pd.read_csv(file_path)

# Define the desired order for 'Type' and 'Analysis' columns
type_order = ['Summary', 'Plot', 'Numeric']
analysis_order = ['Main', 'Supplemental']
context_order = ['Summary', 'Model Diagnostic', 'MCMC Diagnostic']

# Replace NaNs in 'Type' column with empty string to process them
df['Type'] = df['Type'].fillna('')

# Assign 'Numeric' type to any rows with filenames ending in .csv
df.loc[df['Filename'].str.endswith('.csv'), 'Type'] = 'Numeric'

# Add the 'Context' column and initialize with an empty string
df['Context'] = ''

# Fill 'Context' column based on rules
df.loc[(df['Model'].str.contains('all', case=False, na=False)) | 
       (df['Filename'].str.contains('summary', case=False, na=False)), 'Context'] = 'Summary'

df.loc[df['Filename'].str.contains('tplots|geweke|grrhat', case=False, na=False), 'Context'] = 'MCMC Diagnostic'

df.loc[df['Filename'].str.contains('lppd|loo|resid|outlier', case=False, na=False), 'Context'] = 'Model Diagnostic'

# Convert the columns to categorical with the specified order
df['Analysis'] = pd.Categorical(df['Analysis'], categories=analysis_order, ordered=True)
df['Type'] = pd.Categorical(df['Type'], categories=type_order, ordered=True)
df['Context'] = pd.Categorical(df['Context'], categories=context_order, ordered=True)

# Sort the dataframe by 'Type' and then by 'Analysis'
sorted_df = df.sort_values(by=['Analysis', 'Type', 'Context'])

# Save the sorted dataframe back to a CSV (or modify this to write Markdown if needed)
output_csv_path = "sorted_filenames_metadata.csv"

if os.path.exists(output_csv_path):
    os.remove(output_csv_path)

sorted_df.to_csv(output_csv_path, index=False)

# Generate the markdown table
output_path = "sorted_filenames_metadata.md"

if os.path.exists(output_path):
    os.remove(output_path)

with open(output_path, "w") as f:
    # Write the table header
    f.write("| Analysis | Type | Context | Model | Script Section | Filename |\n")
    f.write("|----------|------|---------|-------|----------------|----------|\n")
    
    # Write each row as a markdown table row
    for _, row in sorted_df.iterrows():
        f.write(f"| {row['Analysis']} | {row['Type']} | {row['Context']} | {row['Model']} | {row['Script Section']} | {row['Filename']} |\n")

sorted_df

KeyboardInterrupt: 

In [None]:
import pandas as pd

# Read the metadata table
metadata_path = "sorted_filenames_metadata.csv"
metadata = pd.read_csv(metadata_path)

# Filter for `tplots_...png` files and maintain order from metadata
tplots_files = metadata[metadata['Filename'].str.contains('tplots_') & metadata['Filename'].str.endswith('.png')]

# Generate the Markdown content
md_content = "# Trace Plots\n\n"
md_content += "This section contains the trace plots (`tplots_...png`) for the MCMC diagnostics.\n\n"

for _, row in tplots_files.iterrows():
    filename = row['Filename']
    relative_path = f"../Output/{filename}"
    md_content += f"### {filename}\n"
    md_content += f"![{filename}]({relative_path})\n\n"

# Write the Markdown content to a new file
output_md_path = "tplots_section.md"

if os.path.exists(output_md_path):
    os.remove(output_md_path)

with open(output_md_path, "w") as md_file:
    md_file.write(md_content)

print(f"Markdown section for tplots created at: {output_md_path}")


Markdown section for tplots created at: tplots_section.md


Create markdown summary table of consolidated model posterior summaries for key scaling variables.

In [1]:
import os
import pandas as pd

# Define the output directory containing the "post_summary..." files
output_dir = "../Output"
markdown_file = "consolidated_post_summary.md"

# Get all the "post_summary..." files
post_summary_files = [
    f for f in os.listdir(output_dir) if f.startswith("post_summary") and f.endswith(".csv")
]

# Prepare a consolidated DataFrame
consolidated_df = pd.DataFrame()

# Loop through each file and extract the relevant rows
for file in post_summary_files:
    file_path = os.path.join(output_dir, file)
    df = pd.read_csv(file_path)
    
    df.rename(columns={df.columns[0]: "param"}, inplace=True)

    # Extract the relevant rows
    filtered_df = df[df['param'].isin(['b0', 'b1', 'intercept', 'scaling'])]
    
    # Add a new column for the model/analysis (taken from the file name)
    filtered_df['model'] = file.replace("post_summary_", "").replace(".csv", "")
    
    # Append to the consolidated DataFrame
    consolidated_df = pd.concat([consolidated_df, filtered_df], ignore_index=True)

# Rearrange the columns
consolidated_df = consolidated_df[['model', 'param', 'lower', 'upper', 'mean', 'stdd']]

# Create the markdown table
markdown_table = consolidated_df.to_markdown(index=False)

# Write the markdown to the file
with open(markdown_file, "w") as f:
    f.write("# Consolidated Parameter Summary\n\n")
    f.write(markdown_table)

print(f"Markdown section written to: {markdown_file}")


Markdown section written to: consolidated_post_summary.md


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['model'] = file.replace("post_summary_", "").replace(".csv", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['model'] = file.replace("post_summary_", "").replace(".csv", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['model'] = file.replace("post_summary_"