In [231]:
import geopandas as gpd
import shapely as sp
import pandas as pd
import numpy as np
import os
from collections import defaultdict
import matplotlib

In [200]:
current = os.getcwd()

In [201]:
inp_folfer = os.path.dirname(current) + "/Data Inputs/CFM Areas Shapefiles/"

In [202]:
out_folder = os.path.dirname(current) + "/Data Outputs/"

In [203]:
# Get a list of immediate subdirectories under base_dir.
# (Assumes that each of these folders is for one province.)

epsgM = 4326
province_dirs = [
    os.path.join(inp_folfer, d)
    for d in os.listdir(inp_folfer)
    if os.path.isdir(os.path.join(inp_folfer, d))
]

# This dictionary will store lists of GeoDataFrames per province.
province_gdfs = defaultdict(list)

# Loop over each province folder.
for province_path in province_dirs:
    province_name = os.path.basename(province_path)
    print(f"Processing province: {province_name}")
    
    # Walk the folder recursively so we catch files in all nested subfolders.
    for root, dirs, files in os.walk(province_path):
        for file in files:
            file_lower = file.lower()
            file_path = os.path.join(root, file)
            
            # Read shapefiles (.shp) and KML files.
            if file_lower.endswith('.shp'):
                try:
                    gdf = gpd.read_file(file_path)
                    print(f"  Loaded shapefile: {file_path}")
                    gdf = gdf.to_crs(epsg = epsgM)
                except Exception as e:
                    print(f"  Error reading shapefile {file_path}: {e}")
                    continue
            elif file_lower.endswith('.kml'):
                try:
                    # Specify the KML driver
                    gdf = gpd.read_file(file_path, driver='KML')
                    print(f"  Loaded KML file: {file_path}")
                except Exception as e:
                    print(f"  Error reading KML file {file_path}: {e}")
                    continue
            else:
                continue  # Skip any other file types
            
            # Add a column for the province (taken from the top folder name).
            gdf["province"] = province_name
            
            # Determine a subarea name from the folder structure.
            # Here we use the relative path from the province folder and,
            # if the file isn’t directly in the province folder, we take the first folder in the relative path.
            rel_path = os.path.relpath(root, province_path)
            if rel_path == ".":
                # File is directly under the province folder.
                gdf["subarea"] = province_name
            else:
                # For example, if the file is in ".../Lumika_CF_shapefiles/Lumika_CF1_shapefiles/",
                # then we take "Lumika_CF_shapefiles" as the subarea.
                subarea = rel_path.split(os.sep)[0]
                gdf["subarea"] = subarea
                
            # Append the GeoDataFrame to the list for this province.
            province_gdfs[province_name].append(gdf)

# Now merge all GeoDataFrames for each province.
merged_province_gdfs = {}
for province, gdf_list in province_gdfs.items():
    if gdf_list:  # make sure there is at least one GeoDataFrame for the province
        try:
            merged_gdf = gpd.GeoDataFrame(pd.concat(gdf_list, ignore_index=True))
            merged_province_gdfs[province] = merged_gdf
            print(f"Merged {len(gdf_list)} files for province '{province}', with a total of {len(merged_gdf)} features.")
        except Exception as e:
            print(f"Error merging GeoDataFrames for province {province}: {e}")

# At this point, merged_province_gdfs is a dictionary with keys equal to your province names,
# and values equal to the merged GeoDataFrame for that province.
# For example, to access the merged GeoDataFrame for "Lumika":
#   lumika_gdf = merged_province_gdfs.get("Lumika")

# --- Optional: Save each merged province GeoDataFrame to a shapefile ---
output_dir = "/Users/vanessafigueroa/Documents/Personal projects/Deforestation/DeforestationProject/Data Outputs/Merged_Provinces"
os.makedirs(output_dir, exist_ok=True)

for province, gdf in merged_province_gdfs.items():
    output_path = os.path.join(output_dir, f"{province}_merged.shp")
    try:
        gdf.to_file(output_path)
        print(f"Saved merged shapefile for province '{province}' to: {output_path}")
    except Exception as e:
        print(f"Error saving merged shapefile for province {province}: {e}")

Processing province: Muchinga Province
  Loaded shapefile: /Users/vanessafigueroa/Documents/Personal projects/Deforestation/DeforestationProject/Data Inputs/CFM Areas Shapefiles/Muchinga Province/Sambaulye_CF_shapefiles/Sambaulye_CF_revised_final2.shp
  Loaded shapefile: /Users/vanessafigueroa/Documents/Personal projects/Deforestation/DeforestationProject/Data Inputs/CFM Areas Shapefiles/Muchinga Province/Kaloswe_CF_shapefiles/Kaloswe CF original_boundary.shp
  Loaded shapefile: /Users/vanessafigueroa/Documents/Personal projects/Deforestation/DeforestationProject/Data Inputs/CFM Areas Shapefiles/Muchinga Province/Mibobo_CF_shapefiles/Mibobo_CFA_shapefile.shp
  Loaded shapefile: /Users/vanessafigueroa/Documents/Personal projects/Deforestation/DeforestationProject/Data Inputs/CFM Areas Shapefiles/Muchinga Province/Kakoko_CF_shapefiles/Kakoko_CF.shp
  Loaded shapefile: /Users/vanessafigueroa/Documents/Personal projects/Deforestation/DeforestationProject/Data Inputs/CFM Areas Shapefiles/Mu

  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  gdf.to_file(output_path)
  ogr_write(
  gdf.to_file(output_path)
  ogr_write(


In [204]:
#read shapefiles
merged_province_gdfs.keys()

dict_keys(['Muchinga Province', 'Northern Province', 'Luapula Province'])

In [205]:
merged_province_gdfs[list(merged_province_gdfs.keys())[2]] = merged_province_gdfs[list(merged_province_gdfs.keys())[2]].explode()

In [206]:
gdf_Luapula_Province = merged_province_gdfs[list(merged_province_gdfs.keys())[2]]
gdf_Muchinga_Province = merged_province_gdfs[list(merged_province_gdfs.keys())[0]]
gdf_Northern_Province = merged_province_gdfs[list(merged_province_gdfs.keys())[1]]

In [209]:
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon

def count_vertices(geom):
    """
    Count vertices for a Polygon or MultiPolygon.
    For a Polygon, counts the number of coordinates in the exterior ring.
    For a MultiPolygon, sums the vertices of all parts.
    """
    if geom.is_empty:
        return 0
    if geom.geom_type == 'Polygon':
        return len(list(geom.exterior.coords))
    elif geom.geom_type == 'MultiPolygon':
        return sum(len(list(poly.exterior.coords)) for poly in geom.geoms)
    else:
        return 0

def assign_overlap_to_more_complex(gdf, geom_col='geometry'):
    """
    For each pair of overlapping features in the GeoDataFrame,
    subtract the overlapping region from the feature that is less complex,
    where complexity is defined as having fewer vertices.
    
    Returns a new GeoDataFrame with an updated geometry column.
    """
    # Work on a copy to avoid modifying the original data
    gdf = gdf.copy().reset_index(drop=True)
    
    # Create a working copy of the geometries
    gdf['updated_geometry'] = gdf[geom_col]
    
    n = len(gdf)
    # Iterate over all pairs (i, j) with i < j
    for i in range(n):
        for j in range(i+1, n):
            geom_i = gdf.at[i, 'updated_geometry']
            geom_j = gdf.at[j, 'updated_geometry']
            
            # If the two geometries intersect...
            if geom_i.intersects(geom_j):
                overlap = geom_i.intersection(geom_j)
                # Continue only if the overlapping area is non-empty
                if not overlap.is_empty:
                    vertices_i = count_vertices(geom_i)
                    vertices_j = count_vertices(geom_j)
                    
                    # If both geometries are valid and have a countable number of vertices,
                    # assign the overlapping part to the one with more vertices.
                    if vertices_i >= vertices_j:
                        # Geometry i is more complex (or equal); subtract overlap from j.
                        new_geom_j = geom_j.difference(overlap)
                        gdf.at[j, 'updated_geometry'] = new_geom_j
                    else:
                        # Geometry j is more complex; subtract overlap from i.
                        new_geom_i = geom_i.difference(overlap)
                        gdf.at[i, 'updated_geometry'] = new_geom_i
    
    # Optionally, you can update the original geometry column:
    gdf[geom_col] = gdf['updated_geometry']
    gdf = gdf.drop(columns=['updated_geometry'])
    
    return gdf

# --- Example usage on one of your province GeoDataFrames ---
gdf_Luapula_Province_fixed = assign_overlap_to_more_complex(gdf_Luapula_Province)
gdf_Muchinga_Province_fixed = assign_overlap_to_more_complex(gdf_Muchinga_Province)
gdf_Northern_Province_fixed = assign_overlap_to_more_complex(gdf_Northern_Province)


# Recheck geometry validity if needed:
gdf_Luapula_Province_fixed['is_valid'] = gdf_Luapula_Province_fixed['geometry'].apply(lambda x: x.is_valid)
print(gdf_Luapula_Province_fixed['is_valid'].value_counts())

gdf_Muchinga_Province_fixed['is_valid'] = gdf_Muchinga_Province_fixed['geometry'].apply(lambda x: x.is_valid)
print(gdf_Muchinga_Province_fixed['is_valid'].value_counts())

gdf_Northern_Province_fixed['is_valid'] = gdf_Northern_Province_fixed['geometry'].apply(lambda x: x.is_valid)
print(gdf_Northern_Province_fixed['is_valid'].value_counts())

is_valid
True    6
Name: count, dtype: int64
is_valid
True    17
Name: count, dtype: int64
is_valid
True    9
Name: count, dtype: int64


In [208]:
gdf_Luapula_Province['is_valid'] = gdf_Luapula_Province['geometry'].apply(lambda x: x.is_valid) 
gdf_Muchinga_Province['is_valid'] = gdf_Muchinga_Province['geometry'].apply(lambda x: x.is_valid) 
gdf_Northern_Province['is_valid'] = gdf_Northern_Province['geometry'].apply(lambda x: x.is_valid) 

#fix irregularities
gdf_Luapula_Province['geometry'] = gdf_Luapula_Province['geometry'].apply(
    lambda geom: geom.buffer(0) if not geom.is_valid else geom
)

gdf_Muchinga_Province['geometry'] = gdf_Muchinga_Province['geometry'].apply(
    lambda geom: geom.buffer(0) if not geom.is_valid else geom
)


gdf_Northern_Province['geometry'] = gdf_Northern_Province['geometry'].apply(
    lambda geom: geom.buffer(0) if not geom.is_valid else geom
)

In [210]:
#fix overlaping features

import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon

def count_vertices(geom):
    """
    Count vertices for a Polygon or MultiPolygon.
    For a Polygon, counts the number of coordinates in the exterior ring.
    For a MultiPolygon, sums the vertices of all parts.
    """
    if geom.is_empty:
        return 0
    if geom.geom_type == 'Polygon':
        return len(list(geom.exterior.coords))
    elif geom.geom_type == 'MultiPolygon':
        return sum(len(list(poly.exterior.coords)) for poly in geom.geoms)
    else:
        return 0

def assign_overlap_to_more_complex(gdf, geom_col='geometry'):
    """
    For each pair of overlapping features in the GeoDataFrame,
    subtract the overlapping region from the feature that is less complex,
    where complexity is defined as having fewer vertices.
    
    Returns a new GeoDataFrame with an updated geometry column.
    """
    # Work on a copy to avoid modifying the original data
    gdf = gdf.copy().reset_index(drop=True)
    
    # Create a working copy of the geometries
    gdf['updated_geometry'] = gdf[geom_col]
    
    n = len(gdf)
    # Iterate over all pairs (i, j) with i < j
    for i in range(n):
        for j in range(i+1, n):
            geom_i = gdf.at[i, 'updated_geometry']
            geom_j = gdf.at[j, 'updated_geometry']
            
            # If the two geometries intersect...
            if geom_i.intersects(geom_j):
                overlap = geom_i.intersection(geom_j)
                # Continue only if the overlapping area is non-empty
                if not overlap.is_empty:
                    vertices_i = count_vertices(geom_i)
                    vertices_j = count_vertices(geom_j)
                    
                    # If both geometries are valid and have a countable number of vertices,
                    # assign the overlapping part to the one with more vertices.
                    if vertices_i >= vertices_j:
                        # Geometry i is more complex (or equal); subtract overlap from j.
                        new_geom_j = geom_j.difference(overlap)
                        gdf.at[j, 'updated_geometry'] = new_geom_j
                    else:
                        # Geometry j is more complex; subtract overlap from i.
                        new_geom_i = geom_i.difference(overlap)
                        gdf.at[i, 'updated_geometry'] = new_geom_i
    
    # Optionally, you can update the original geometry column:
    gdf[geom_col] = gdf['updated_geometry']
    gdf = gdf.drop(columns=['updated_geometry'])
    
    return gdf

# --- Example usage on one of your province GeoDataFrames ---
gdf_Luapula_Province_fixed = assign_overlap_to_more_complex(gdf_Luapula_Province)

# Recheck geometry validity if needed:
gdf_Luapula_Province_fixed['is_valid'] = gdf_Luapula_Province_fixed['geometry'].apply(lambda x: x.is_valid)
print(gdf_Luapula_Province_fixed['is_valid'].value_counts())

gdf_Muchinga_Province_fixed = assign_overlap_to_more_complex(gdf_Muchinga_Province)

# Recheck geometry validity if needed:
gdf_Muchinga_Province_fixed['is_valid'] = gdf_Muchinga_Province_fixed['geometry'].apply(lambda x: x.is_valid)
print(gdf_Muchinga_Province_fixed['is_valid'].value_counts())

gdf_Northern_Province_fixed = assign_overlap_to_more_complex(gdf_Northern_Province)

# Recheck geometry validity if needed:
gdf_Northern_Province_fixed['is_valid'] = gdf_Northern_Province_fixed['geometry'].apply(lambda x: x.is_valid)
print(gdf_Northern_Province_fixed['is_valid'].value_counts())


is_valid
True    6
Name: count, dtype: int64
is_valid
True    17
Name: count, dtype: int64
is_valid
True    9
Name: count, dtype: int64


In [211]:
#standardize 

import geopandas as gpd
import pandas as pd

def standardize_fields(gdf, name_candidates=['Name'], folder_candidate='subarea'):
    """
    Given a GeoDataFrame, this function creates and returns a new GeoDataFrame
    with the following standardized columns:
      - name_original: using the first available candidate from name_candidates.
      - province: (assumed to already exist)
      - name_folder: taken from folder_candidate (assumed to exist).
      - geometry: the geometry column.
      
    Parameters:
        gdf (GeoDataFrame): The input GeoDataFrame.
        name_candidates (list): List of candidate field names to use as the original name.
        folder_candidate (str): Field name to use for the folder name.
        
    Returns:
        GeoDataFrame: A new GeoDataFrame with standardized columns.
    """
    
    # Find the first available name column candidate
    name_field = None
    for candidate in name_candidates:
        if candidate in gdf.columns:
            name_field = candidate
            break
    if name_field is None:
        raise ValueError("None of the name candidate fields were found in the GeoDataFrame.")
    
    # Make a copy to avoid modifying the original
    gdf_std = gdf.copy()
    
    # Create standardized columns
    gdf_std['name_original'] = gdf_std[name_field]
    gdf_std['name_folder'] = gdf_std[folder_candidate]  # Assuming 'subarea' is the folder name.
    # 'province' and 'geometry' are assumed to be already present.
    
    # Return only the standardized columns
    return gdf_std[['name_original', 'province', 'name_folder', 'geometry']]

# --- Standardize each province GeoDataFrame ---

# For Muchinga Province:
std_Muchinga = standardize_fields(
    gdf_Muchinga_Province_fixed,
    name_candidates=['Name', 'NAME', 'Neme']  # add other candidates if needed
)

# For Northern Province:
std_Northern = standardize_fields(
    gdf_Northern_Province_fixed,
    name_candidates=['Name', 'NAME']
)

# For Luapula Province:
std_Luapula = standardize_fields(
    gdf_Luapula_Province_fixed,
    name_candidates=['Name', 'NAME']
)

# --- Append (concatenate) the standardized GeoDataFrames ---
combined_gdf = pd.concat([std_Muchinga, std_Northern, std_Luapula], ignore_index=True)

# Check the result:
print(combined_gdf.head())
print(combined_gdf.columns)

                  name_original           province              name_folder  \
0                          None  Muchinga Province  Sambaulye_CF_shapefiles   
1  Kaloswe_CF_Original_Boundary  Muchinga Province    Kaloswe_CF_shapefiles   
2                    Mibobo CFA  Muchinga Province     Mibobo_CF_shapefiles   
3                     Kakoko CF  Muchinga Province     Kakoko_CF_shapefiles   
4                           NaN  Muchinga Province  Lokomwila_CF_shapefiles   

                                            geometry  
0  POLYGON ((31.76043 -11.68503, 31.76395 -11.686...  
1  POLYGON ((31.17501 -12.24452, 31.22262 -12.266...  
2  POLYGON ((30.52734 -12.32771, 30.56259 -12.360...  
3  POLYGON ((32.20133 -11.22931, 32.2027 -11.23, ...  
4  POLYGON ((30.78484 -11.643, 30.79278 -11.63989...  
Index(['name_original', 'province', 'name_folder', 'geometry'], dtype='object')


In [223]:
combined_gdf['id'] = range(1, len(combined_gdf)+1)

In [213]:
from shapely.geometry import shape, mapping
from shapely.geometry import Polygon, MultiPolygon

def drop_z(geom):
    """Convert a geometry to 2D by stripping off the Z coordinate."""
    if geom.is_empty:
        return geom
    # For Polygon
    if geom.geom_type == 'Polygon':
        # Get the exterior coordinates (dropping Z) and then do the same for interiors
        exterior_2d = [(x, y) for x, y, *rest in geom.exterior.coords]
        interiors_2d = [
            [(x, y) for x, y, *rest in interior.coords]
            for interior in geom.interiors
        ]
        return Polygon(exterior_2d, interiors_2d)
    # For MultiPolygon
    elif geom.geom_type == 'MultiPolygon':
        polygons_2d = [drop_z(p) for p in geom.geoms]
        return MultiPolygon(polygons_2d)
    # For other geometry types, you might extend this function
    else:
        return geom

# Apply the function to the geometry column of your GeoDataFrame:
combined_gdf['geometry'] = combined_gdf['geometry'].apply(drop_z)

In [214]:
projected_combined = combined_gdf.to_crs(epsg = 32736)

#calculate area


In [215]:
projected_combined['area_ha'] = projected_combined.area/10000

In [219]:
projected_combined['area_ha'].describe()

count       32.000000
mean      7778.766401
std      15717.301429
min        107.061293
25%        888.517311
50%       1901.494278
75%       5204.762120
max      67020.364111
Name: area_ha, dtype: float64

In [235]:
projected_combined['id'] = range(1, len(projected_combined)+1)

In [276]:
projected_combined['CF_desc'] = np.where(
    (projected_combined['name_original'].isna()) | 
    (projected_combined['name_original']==None) | 
    (projected_combined['name_original']=='') |
    (projected_combined['name_original']=='BOUNDARY') | 
    (projected_combined['name_original']=='Boundary'), projected_combined['name_folder'], projected_combined['name_original'])

In [273]:
projected_combined.iloc[26]['name_original']

''

In [277]:
projected_combined

Unnamed: 0,name_original,province,name_folder,geometry,area_ha,id,CF_desc
0,,Muchinga Province,Sambaulye_CF_shapefiles,"POLYGON ((364897.199 8707979.042, 365281.363 8...",1565.777832,1,Sambaulye_CF_shapefiles
1,Kaloswe_CF_Original_Boundary,Muchinga Province,Kaloswe_CF_shapefiles,"POLYGON ((301483.628 8645736.909, 306680.022 8...",1047.517062,2,Kaloswe_CF_Original_Boundary
2,Mibobo CFA,Muchinga Province,Mibobo_CF_shapefiles,"POLYGON ((231081.851 8635969.12, 234950.747 86...",2484.858386,3,Mibobo CFA
3,Kakoko CF,Muchinga Province,Kakoko_CF_shapefiles,"POLYGON ((412815.588 8758546.751, 412965.695 8...",21431.252508,4,Kakoko CF
4,,Muchinga Province,Lokomwila_CF_shapefiles,"POLYGON ((258491.409 8711979.634, 259354.829 8...",1887.019005,5,Lokomwila_CF_shapefiles
5,Chintasama_Community_Forest,Muchinga Province,Chintasama_CF_shapefiles,"POLYGON ((243541.953 8625134.475, 244137.72 86...",128.694483,6,Chintasama_Community_Forest
6,Chintasama_Community_Forest,Muchinga Province,Chintasama_CF_shapefiles,"POLYGON ((242608.087 8619719.67, 242092.29 861...",107.061293,7,Chintasama_Community_Forest
7,Chintasama_CF,Muchinga Province,Chintasama_CF_shapefiles,"POLYGON ((243158.103 8618285.155, 242774.253 8...",160.536332,8,Chintasama_CF
8,Chibulika CF,Muchinga Province,Chibulika_CF_shapefiles,"POLYGON ((452137.968 8775608.798, 452462.419 8...",59998.383889,9,Chibulika CF
9,,Muchinga Province,Kakoma_Samu_CF_shapefiles,"POLYGON ((545154.649 8849757.005, 545416.541 8...",251.700493,10,Kakoma_Samu_CF_shapefiles


In [286]:
excel_file = pd.DataFrame(projected_combined.drop(columns=['geometry']))

In [289]:
excel_file.to_excel(out_folder + "community_forest_attributes.xlsx")

In [288]:
output_dir

'/Users/vanessafigueroa/Documents/Personal projects/Deforestation/DeforestationProject/Data Outputs/Merged_Provinces'

In [278]:
projected_combined.to_file(out_folder + 'Community_Forests_Zambia_V1.shp')

  projected_combined.to_file(out_folder + 'Community_Forests_Zambia_V1.shp')
  ogr_write(
  ogr_write(


In [233]:
projected_combined.explore()