In [None]:
import os
import requests
import gzip
import shutil
import pandas as pd

In [None]:
# Source (raw file URL) and local destinations for the gzipped observations table
url = "https://github.com/rijpma/cathedrals/raw/master/dat/fullobs_sp.csv.gz"  # Use the raw file URL
local_folder = "../Data"
local_gz_path = os.path.join(local_folder, "fullobs_sp.csv.gz")
local_csv_path_1 = os.path.join(local_folder, "fullobs_sp.csv")

# open() does not create missing directories, so make sure the target exists
os.makedirs(local_folder, exist_ok=True)

# Step 1: Download the .gz file from the GitHub repository
# The context manager releases the connection even when the body is never read
# (e.g. on a non-200 answer); the timeout keeps a stalled server from hanging the cell.
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(local_gz_path, 'wb') as f:
            # Write chunk by chunk so the whole payload is never held in memory
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        print(f"Downloaded {local_gz_path}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")


In [None]:
# Step 2: Decompress the downloaded archive into a plain CSV file
with gzip.open(local_gz_path, 'rb') as compressed, open(local_csv_path_1, 'wb') as extracted:
    # Stream the decompressed bytes straight into the destination file
    shutil.copyfileobj(compressed, extracted)
print(f"Unpacked to {local_csv_path_1}")

In [None]:
# Source (raw file URL) and local destination for the rural churches table
url = "https://github.com/rijpma/cathedrals/raw/refs/heads/master/dat/rurchurches_eb.csv"  # Use the raw file URL
local_folder = "../Data"
local_csv_path_2 = os.path.join(local_folder, "rurchurches_eb.csv")

# open() does not create missing directories, so make sure the target exists
os.makedirs(local_folder, exist_ok=True)

# Step 1: Download the CSV file from the GitHub repository
# The context manager releases the connection even when the body is never read
# (e.g. on a non-200 answer); the timeout keeps a stalled server from hanging the cell.
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(local_csv_path_2, 'wb') as f:
            # Write chunk by chunk so the whole payload is never held in memory
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        print(f"Downloaded {local_csv_path_2}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

In [None]:
# Source (raw file URL) and local destination for the dynamic observations table
url = "https://github.com/rijpma/cathedrals/raw/refs/heads/master/dat/dynobs.csv"  # Use the raw file URL
local_folder = "../Data"
local_csv_path_3 = os.path.join(local_folder, "dynobs.csv")

# open() does not create missing directories, so make sure the target exists
os.makedirs(local_folder, exist_ok=True)

# Step 1: Download the CSV file from the GitHub repository
# The context manager releases the connection even when the body is never read
# (e.g. on a non-200 answer); the timeout keeps a stalled server from hanging the cell.
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(local_csv_path_3, 'wb') as f:
            # Write chunk by chunk so the whole payload is never held in memory
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        print(f"Downloaded {local_csv_path_3}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

In [None]:
# File paths
fullobs_path = "../Data/fullobs_sp.csv"
dynobs_path = "../Data/dynobs.csv"

# Step 1: Load `fullobs` table and extract relevant columns
# 'osmid' is read as text in BOTH tables so the merge keys share one dtype.
# Casting only one side afterwards either makes the merge raise (numeric vs.
# object keys) or silently miss matches (e.g. "123.0" vs. "123").
fullobs = pd.read_csv(fullobs_path, dtype={'osmid': str}, low_memory=False)
fullobs_subset = fullobs[['osmid', 'osmname', 'category', 'lat', 'lon']].drop_duplicates()

# Step 2: Load `dynobs` table and extract the earliest construction year for each osmid
dynobs = pd.read_csv(dynobs_path, dtype={'osmid': str}, low_memory=False)

# Drop rows with missing 'osmid' or 'year' (essential for the analysis)
dynobs = dynobs.dropna(subset=['osmid', 'year'])

# Group by 'osmid' and find the earliest construction event (minimum year)
earliest_events = (
    dynobs.groupby('osmid', as_index=False)
    .agg(year_zero=('year', 'min'))  # Get the earliest year for each building
)

# Step 3: Merge metadata from `fullobs` with earliest construction years from `dynobs`
consolidated_table = pd.merge(
    fullobs_subset,
    earliest_events,
    on='osmid',
    how='left'  # Ensure all `osmid` in `fullobs` are retained, even if no events in `dynobs`
)

# Step 4: Save the consolidated table to a CSV file
consolidated_table.to_csv("../Data/consolidated_table.csv", index=False)

# Print the first few rows of the result for inspection
print(consolidated_table.head())


In [None]:
# Show only the rows whose 'category' is either 'parish' or 'cathedral'
category_filter = "category == 'parish' or category == 'cathedral'"
consolidated_table.query(category_filter)


In [None]:
import chardet

file_path = "../Data/rurchurches_eb.csv"
# Read the raw bytes once, then let chardet guess the file encoding
with open(file_path, 'rb') as raw_file:
    raw_bytes = raw_file.read()
result = chardet.detect(raw_bytes)
print(result)  # Output will show detected encoding

In [None]:
file_path = "../Data/rurchurches_eb.csv"

# Load the table with the 'mac_roman' encoding, skipping the first line of the file
data = pd.read_csv(file_path, skiprows=1, encoding='mac_roman')

# Show the top rows for a quick sanity check
preview = data.head()
print(preview)

In [None]:
# Column layout: everything before `start_idx` is treated as metadata, everything
# from `start_idx` onwards as year data (6 columns after the 'surface' column).
start_idx = 11
year_columns = data.columns[start_idx:]
metadata_columns = data.columns[:start_idx]

# Step 1: Forward-fill metadata ONLY.
# Filling the year columns as well would copy values from the rows above into
# cells that are genuinely empty, so a building with fewer phases would inherit
# unrelated values and the dropna() below could never remove them.
# NOTE(review): assumes all metadata lives in the first `start_idx` columns — confirm.
data_ff = data.copy()
data_ff[metadata_columns] = data_ff[metadata_columns].ffill()

# Step 2: Filter rows where 'surface' is 'year'
year_rows = data_ff[data_ff['surface'] == 'year'].copy()

# Step 3: Create a DataFrame with osmid and year columns
# (explicit copy so renaming the columns never touches `year_rows`)
osmid_years = year_rows[['osmid'] + list(year_columns)].copy()

# Rename columns for clarity
osmid_years.columns = ['osmid'] + [f'year_{i}' for i in range(1, osmid_years.shape[1])]

# Step 4: Reshape year data into long format
years_long = osmid_years.melt(id_vars='osmid', var_name='phase', value_name='year')

# Convert to numeric BEFORE dropping missing values, so entries that cannot be
# parsed (coerced to NaN) are removed together with the empty cells
years_long['year'] = pd.to_numeric(years_long['year'], errors='coerce')
years_long = years_long.dropna(subset=['year']).reset_index(drop=True)

# Step 5: Merge metadata with year data
metadata_rows = data_ff[['osmid', 'osmname', 'category', 'lat', 'lon']].drop_duplicates(subset=['osmid'])
merged = pd.merge(years_long, metadata_rows, on='osmid', how='left')

# Step 6: Save the processed table
output_path = "../Data/rural_churches_processed.csv"
merged.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")

In [None]:
# Columns shared with the consolidated table, in the same order
year_zero_columns = ['osmid', 'osmname', 'category', 'lat', 'lon', 'year_zero']

# Keep only the first phase of each rural church, expose its year as
# `year_zero`, and trim to the consolidated column layout
rural_year_1 = (
    merged.loc[merged['phase'] == 'year_1']
    .rename(columns={'year': 'year_zero'})
    .loc[:, year_zero_columns]
)

# Stack the rural rows underneath the consolidated table with a fresh index
combined_df = pd.concat([consolidated_table, rural_year_1], ignore_index=True)

# Persist the combined table
output_path = "../Data/churches_combined.csv"
combined_df.to_csv(output_path, index=False)
print(f"Combined data saved to {output_path}")

In [None]:
import geopandas as gpd

# Keep only parish churches and cathedrals from the combined table
filtered_table = combined_df.query("category == 'parish' or category == 'cathedral'")

# Convert to a GeoDataFrame with point geometries built from lon/lat
gdf = gpd.GeoDataFrame(
    filtered_table,
    geometry=gpd.points_from_xy(filtered_table['lon'], filtered_table['lat']),
    crs="EPSG:4326"  # Set the coordinate reference system to WGS 84
)

# Write to a GeoPackage
output_path = "../Output/churches_combined_gis.gpkg"
# The writer does not create missing folders; without this a fresh checkout
# fails here because ../Output does not exist yet
os.makedirs(os.path.dirname(output_path), exist_ok=True)
gdf.to_file(output_path, layer="filtered_table", driver="GPKG")

print(f"GeoPackage written to {output_path}")