# GeoAI buildings extraction
From https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/

In [1]:
from my_nb_utils import (
    get_gpkg_schema,
    compare_gpkg_schemas,
    print_schema_comparison,
    read_and_concat_gpkgs,
    prepare_gdf_for_delta,
    list_gpkg_files,
    get_all_gpkg_data
)

# Get all building footprints from FTP site

In [None]:
# Per the section heading, this fetches all building footprints from the FTP
# site. The helper returns a pair: `df` (used as a DataFrame below, see
# `df.info()`) and `stats`, a dict whose counters and upload date range are
# printed by the summary cell.
df, stats = get_all_gpkg_data()

In [None]:
# Bare expression: evaluates `df` so the notebook can display it as the
# cell's output (presumably the combined footprint table).
df

In [None]:
# Report the counters collected in `stats` by the bulk download.
print("\nProcessing Summary:")
for label, key in (
    ("Expected files", 'expected_files'),
    ("Successfully processed", 'successful_reads'),
    ("Failed reads", 'failed_reads'),
    ("Total features", 'total_features'),
):
    print(f"{label}: {stats[key]}")

# Earliest and latest upload dates recorded in `stats`.
print("\nUpload Date Range:")
for label, key in (("Earliest", 'earliest'), ("Latest", 'latest')):
    print(f"{label}: {stats['upload_date_range'][key]}")

# NOTE(review): if `df` is a pandas DataFrame, info() writes its report
# itself and returns None, so this print also emits a trailing "None".
print("\nDataFrame Info:")
print(df.info())

# Compare schemas

In [2]:
# Three-archive sample list.
# NOTE: when this cell runs top to bottom, this value is immediately replaced
# by the longer list assigned below, so it is never the one that gets used.
file_paths = [
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/BC_CapeBall_WV03_20210702.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_Bouctouche_2016.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/QC_RockForest_WV03_20220930.zip",
]

# Archives whose schemas are compared in the following cells.
file_paths = [
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/BC_Masset_WV02_20160607.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/BC_PortClements_GE01_20180930.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_AnseBleue_WV03_20200807.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_Brantville_WV02_20210825.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_DNR_2013_a.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_DNR_2015_a.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_DNR_2017_s.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_DNR_2019_an.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_Fairisle_WV02_20210919_A.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_SNB_2022_g.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/ON_RideauFerry_GE01_20180730.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/ON_TalbotRiver_WV02_20180710.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/PEIGeorgetown.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/QC_Boisbriand_WV02_20210524.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/QC_ZecTawachiche_WV02_20190825_B.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/YT1.zip",
]


In [3]:
# Compare the schemas of the archives listed in `file_paths`; the result is
# handed to print_schema_comparison() in the next cell.
comparison = compare_gpkg_schemas(file_paths)

In [None]:
# Display the schema comparison result held in `comparison`.
print_schema_comparison(comparison)

In [2]:
# Three sample archives used to exercise read_and_concat_gpkgs().
test_files = [
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/BC_CapeBall_WV03_20210702.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/NB_Bouctouche_2016.zip",
    "https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/QC_RockForest_WV03_20220930.zip",
]

In [None]:
# Read the sample archives in `test_files` and combine them. `stats` carries
# the counters, CRS transforms and errors printed by the next cell.
# NOTE: this rebinds `stats`, replacing the value returned earlier by
# get_all_gpkg_data().
combined_gdf, stats = read_and_concat_gpkgs(test_files)

In [None]:
# Report the counters gathered while reading the sample archives.
print("\nProcessing Statistics:")
for label, key in (
    ("Total files processed", 'total_files'),
    ("Successful reads", 'successful_reads'),
    ("Failed reads", 'failed_reads'),
    ("Total features", 'total_features'),
):
    print(f"{label}: {stats[key]}")

# List every CRS transformation recorded in stats; skip the section when there are none.
if stats['crs_transforms']:
    print("\nCRS Transformations:")
    for transform in stats['crs_transforms']:
        print(f"File: {transform['file']}")
        for label, key in (("From", 'from'), ("To", 'to')):
            print(f"  {label}: {transform[key]}")

# List every recorded read error; skip the section when there are none.
if stats['errors']:
    print("\nErrors encountered:")
    for error in stats['errors']:
        print(f"File: {error['file']}")
        print(f"  Error: {error['error']}")

# Summarise the combined GeoDataFrame.
# NOTE(review): if info() behaves as in pandas it prints its own report and
# returns None, so the outer print also emits "None".
print("\nCombined GeoDataFrame Info:")
print(combined_gdf.info())

In [None]:
# Prepare the combined GeoDataFrame for Delta Lake; the helper also returns
# a dict of data-quality measurements that is reported below.
prepared_df, quality_stats = prepare_gdf_for_delta(combined_gdf)

# Headline quality figures.
print("\nData Quality Statistics:")
for label, key in (
    ("Total rows", 'total_rows'),
    ("Invalid geometries", 'invalid_geometries'),
):
    print(f"{label}: {quality_stats[key]}")

# Only columns with a positive null count are listed.
print("\nNull counts by column:")
for col, count in quality_stats['null_counts'].items():
    if not count > 0:
        continue
    print(f"  {col}: {count}")

# Every column is listed with its data type.
print("\nData types:")
for col, dtype in quality_stats['data_types'].items():
    print(f"  {col}: {dtype}")

# FTP tests

In [None]:
from ftplib import FTP
from datetime import datetime

# Smoke test: list the ZIP archives in the GPKG directory of the public FTP
# server, with the size and timestamp shown in the directory listing.
# Results are left in `zip_files` (list of dicts: filename, upload_date, size).
try:
    # Connect to FTP server
    print("Connecting to FTP server...")
    # The context manager closes the connection on every exit path, and a
    # failed connect never enters it, so there is no unbound `ftp` to clean up.
    # The timeout keeps the cell from hanging forever on an unreachable server.
    with FTP('ftp.maps.canada.ca', timeout=60) as ftp:
        ftp.login()  # anonymous login

        print("\nNavigating to GPKG directory...")
        ftp.cwd('/pub/nrcan_rncan/vector/geobase_geoai_geoia/GPKG/')

        # Get file list with details (one raw listing line per entry)
        files = []
        ftp.dir(files.append)

    print("\nGPKG ZIP files and their timestamps:")
    print("-" * 70)

    zip_files = []
    for file_info in files:
        # Assumes the Unix "ls -l" layout (TODO confirm for this server):
        #   perms links owner group size month day time-or-year name
        # maxsplit=8 keeps a filename that contains spaces in one piece.
        parts = file_info.split(maxsplit=8)
        if len(parts) < 9:
            # Not a regular entry line (e.g. a "total N" header); skip it.
            continue

        filename = parts[8]
        # Test the name itself so ".zip" elsewhere in the line cannot match.
        if not filename.endswith('.zip'):
            continue

        # Get file size
        size = parts[4]
        # Combine date parts
        date_str = f"{parts[5]} {parts[6]} {parts[7]}"
        print(f"{filename:<50} {date_str:>15} {size:>10} bytes")
        zip_files.append({
            'filename': filename,
            'upload_date': date_str,
            'size': int(size)
        })

    print(f"\nTotal ZIP files found: {len(zip_files)}")

except Exception as e:
    # Best-effort diagnostic cell: report the problem instead of raising.
    print(f"Error accessing FTP: {str(e)}")