In [63]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os

## Define functions - Saves a GeoDataFrame as a shapefile in a directory named after the shapefile

In [64]:
def save_shapefile_within_dir(gdf, shapefile_name, base_dir="C://Projects//OFLC//Data//"):
    """
    Save a GeoDataFrame as an ESRI Shapefile inside a folder named after the shapefile.

    The file is written to <base_dir>/<shapefile_name>/<shapefile_name>.shp and
    the folder is created when it does not exist yet. The resulting path is
    printed; nothing is returned.

    Packages:
    import os
    import geopandas as gpd

    Parameters:
    gdf (GeoDataFrame): The GeoDataFrame to be saved.
    shapefile_name (str): The name of the shapefile without extension.
    base_dir (str): Folder under which the per-shapefile directory is created.
        Defaults to the project data folder, so existing two-argument calls
        keep writing to the same location as before.

    Raises:
    ValueError: If shapefile_name is empty.
    """
    # An empty name would silently write '.shp' directly into base_dir
    if not shapefile_name:
        raise ValueError("shapefile_name must be a non-empty string")

    # Per-shapefile output folder; the trailing '/' keeps the printed path
    # identical to what earlier runs of this notebook produced
    output_directory = os.path.join(base_dir, shapefile_name) + "/"

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Full path of the .shp file inside that folder
    output_shapefile = os.path.join(output_directory, shapefile_name + '.shp')

    # Write the GeoDataFrame out as a shapefile
    gdf.to_file(output_shapefile, driver='ESRI Shapefile')

    print(f'Shapefile saved at: {output_shapefile}')

## Load a zipped shapefile as a GeoDataFrame

In [65]:
# Location of the zipped county-boundary shapefile
zip_file_path = "C://Projects//OFLC//Data//us-county-boundaries.zip"

# Read the shapefile from the archive through a zip:// path
gdf = gpd.read_file("zip://" + zip_file_path)
gdf.head(n=5)

Unnamed: 0,statefp,countyfp,countyns,geoid,name,namelsad,stusab,lsad,classfp,mtfcc,...,cbsafp,metdivfp,funcstat,aland,awater,intptlat,intptlon,state_name,countyfp_no,geometry
0,39,63,1074044,39063,Hancock,Hancock County,OH,6,H1,G4020,...,22300.0,,A,1376125660,6020647,41.0004711,-83.6660335,Ohio,63,"POLYGON ((-83.88076 41.08036, -83.88076 41.081..."
1,51,179,1480173,51179,Stafford,Stafford County,VA,6,H1,G4020,...,47900.0,47894.0,A,697237202,28088011,38.4132608,-77.4513342,Virginia,179,"POLYGON ((-77.58878 38.50404, -77.58295 38.508..."
2,31,181,835912,31181,Webster,Webster County,NE,6,H1,G4020,...,,,A,1489017962,337241,40.1806461,-98.4985897,Nebraska,181,"POLYGON ((-98.72662 40.29186, -98.72663 40.292..."
3,48,127,1383849,48127,Dimmit,Dimmit County,TX,6,H1,G4020,...,,,A,3441795367,14576535,28.4235871,-99.7658713,Texas,127,"POLYGON ((-99.78085 28.64383, -99.78079 28.643..."
4,19,31,465205,19031,Cedar,Cedar County,IA,6,H1,G4020,...,,,A,1500798712,6385832,41.77236,-91.1326105,Iowa,31,"POLYGON ((-91.36608 41.86007, -91.36620 41.860..."


## Load a CSV file as a DataFrame

In [67]:
# Load the tabular Geography data (area / state / county names) from CSV
df = pd.read_csv("C://Projects//OFLC//Data//CSV//Geography.csv")
df.head(n=5)

Unnamed: 0,Area,AreaName,StateAb,State,CountyTownName
0,100001,Northwest Alabama nonmetropolitan area,AL,Alabama,Cullman County
1,100001,Northwest Alabama nonmetropolitan area,AL,Alabama,Fayette County
2,100001,Northwest Alabama nonmetropolitan area,AL,Alabama,Franklin County
3,100001,Northwest Alabama nonmetropolitan area,AL,Alabama,Lamar County
4,100001,Northwest Alabama nonmetropolitan area,AL,Alabama,Marion County


## Combine data from two different sources using left join

In [68]:
# Left join: every row of gdf is kept and the matching Geography row is attached,
# pairing "namelsad" with "CountyTownName" AND "stusab" with "StateAb"
merged_gdf = gdf.merge(
    df,
    how='left',
    left_on=['namelsad', 'stusab'],
    right_on=['CountyTownName', 'StateAb'],
)

# Write the joined result out as a new shapefile
save_shapefile_within_dir(gdf=merged_gdf, shapefile_name='us-county-boundaries-geography')

print("new shapefile is created")

  gdf.to_file(output_shapefile, driver='ESRI Shapefile')


Shapefile saved at: C://Projects//OFLC//Data//us-county-boundaries-geography/us-county-boundaries-geography.shp
new shapefile is created


## Keep only a subset of the columns

In [69]:
# Show every column name of the merged frame as a plain list
print(merged_gdf.columns.tolist())

['statefp', 'countyfp', 'countyns', 'geoid', 'name', 'namelsad', 'stusab', 'lsad', 'classfp', 'mtfcc', 'csafp', 'cbsafp', 'metdivfp', 'funcstat', 'aland', 'awater', 'intptlat', 'intptlon', 'state_name', 'countyfp_no', 'geometry', 'Area', 'AreaName', 'StateAb', 'State', 'CountyTownName']


In [70]:
# Columns retained for the output, listed in the order they should appear
columns_to_keep = [
    'geoid',
    'Area',
    'statefp',
    'StateAb',
    'stusab',
    'State',
    'state_name',
    'countyfp',
    'namelsad',
    'CountyTownName',
    'AreaName',
    'geometry',
]

# Narrow the merged frame down to those columns and preview the result
merged_gdf = merged_gdf[columns_to_keep]
merged_gdf.head(n=5)

Unnamed: 0,geoid,Area,statefp,StateAb,stusab,State,state_name,countyfp,namelsad,CountyTownName,AreaName,geometry
0,39063,3900001.0,39,OH,OH,Ohio,Ohio,63,Hancock County,Hancock County,West Northwestern Ohio nonmetropolitan area,"POLYGON ((-83.88076 41.08036, -83.88076 41.081..."
1,51179,47900.0,51,VA,VA,Virginia,Virginia,179,Stafford County,Stafford County,"Washington-Arlington-Alexandria, DC-VA-MD-WV","POLYGON ((-77.58878 38.50404, -77.58295 38.508..."
2,31181,3100006.0,31,NE,NE,Nebraska,Nebraska,181,Webster County,Webster County,South Nebraska nonmetropolitan area,"POLYGON ((-98.72662 40.29186, -98.72663 40.292..."
3,48127,4800005.0,48,TX,TX,Texas,Texas,127,Dimmit County,Dimmit County,Border Region of Texas nonmetropolitan area,"POLYGON ((-99.78085 28.64383, -99.78079 28.643..."
4,19031,1900004.0,19,IA,IA,Iowa,Iowa,31,Cedar County,Cedar County,Southeast Iowa nonmetropolitan area,"POLYGON ((-91.36608 41.86007, -91.36620 41.860..."


## Convert the `Area` values from floating-point numbers to integers

In [71]:
# Convert 'Area' to integer
def safe_int_convert(x):
    """
    Round a value to the nearest whole number and return it as a Python int.

    Parameters:
    x: Value to convert; anything float() accepts (numbers, numeric strings).

    Returns:
    int: The rounded value, or np.nan when x cannot be converted
    (non-numeric text, None, NaN or infinity). A warning is printed in that case.
    """
    try:
        # np.round rounds exact halves to the nearest even number (2.5 -> 2)
        rounded = np.round(float(x))
        return int(rounded)
    except (TypeError, ValueError, OverflowError):
        # TypeError: float() rejects the type (e.g. None)
        # ValueError: non-numeric text, or int() applied to NaN
        # OverflowError: int() applied to +/- infinity
        print(f"Warning: Non-numeric value '{x}' found in Area column. Setting to NaN.")
        return np.nan

# Run the converter over every value of the 'Area' column
# NOTE(review): if any value comes back as NaN the column presumably falls back
# to a float dtype; a nullable integer dtype may be needed in that case - confirm
merged_gdf['Area'] = merged_gdf['Area'].apply(safe_int_convert)

# Report how many values are missing once the conversion is done
nan_count = merged_gdf['Area'].isna().sum()
if nan_count:
    print(f"Warning: {nan_count} NaN values found in Area column after conversion.")



In [72]:
# Write the trimmed, converted frame to its own shapefile folder
save_shapefile_within_dir(
    gdf=merged_gdf,
    shapefile_name='us-county-boundaries-geography-modified',
)

  gdf.to_file(output_shapefile, driver='ESRI Shapefile')


Shapefile saved at: C://Projects//OFLC//Data//us-county-boundaries-geography-modified/us-county-boundaries-geography-modified.shp


## Manually clean the data (remove unused columns, fill in missing values) in the GIS program

Load `us-county-boundaries-geography-modified.shp` into the GIS program for data cleaning, then export the result to a new `us-county-boundaries-geography-modified-fixed.shp` file.
Then read that file back in as a GeoDataFrame for further processing.


## Load the cleaned .shp as a GeoDataFrame after the data has been fixed in the GIS program

In [35]:
# Read the shapefile that was exported from the GIS program after manual cleaning
gdf_modified_fixed_load = gpd.read_file(
    "C://Projects//OFLC//Data//us-county-boundaries-geography-modified-fixed.shp"
)
gdf_modified_fixed_load.head(n=5)

Unnamed: 0,NAME,geoid,Area,statefp,StateAb,stusab,State,state_name,countyfp,namelsad,AreaName,CountyTown,geometry
0,Autauga County,1001,33860.0,1,AL,AL,Alabama,Alabama,1,Autauga County,"Montgomery, AL",Autauga County,"POLYGON ((-86.90310 32.54063, -86.90311 32.540..."
1,Baldwin County,1003,19300.0,1,AL,AL,Alabama,Alabama,3,Baldwin County,"Daphne-Fairhope-Foley, AL",Baldwin County,"POLYGON ((-87.99068 30.55549, -87.99051 30.560..."
2,Barbour County,1005,100004.0,1,AL,AL,Alabama,Alabama,5,Barbour County,Southeast Alabama nonmetropolitan area,Barbour County,"POLYGON ((-85.42982 32.04598, -85.42985 32.046..."
3,Bibb County,1007,13820.0,1,AL,AL,Alabama,Alabama,7,Bibb County,"Birmingham-Hoover, AL",Bibb County,"POLYGON ((-87.31226 33.08622, -87.31218 33.087..."
4,Blount County,1009,13820.0,1,AL,AL,Alabama,Alabama,9,Blount County,"Birmingham-Hoover, AL",Blount County,"POLYGON ((-86.74919 33.99760, -86.74902 33.997..."


## Reorder the columns and add an index column

In [42]:
# Columns to keep, in the desired output order
columns_to_keep = [
    'geoid', 'Area', 'statefp', 'StateAb', 'stusab', 'State', 'state_name', 'countyfp', 'namelsad',  'CountyTown', 'AreaName', 'geometry'
]

# Select the columns, sort by 'statefp' then 'countyfp', and reset the index.
# Every step returns a new object, so gdf_modified_fixed_load is not modified
# and this cell can be re-run safely.
# reset_index(drop=False) keeps the previous (pre-sort) row labels as a new
# 'index' column, which is then written to the output along with the data.
gdf_modified_fixed = (
    gdf_modified_fixed_load[columns_to_keep]
    .sort_values(['statefp', 'countyfp'])
    .reset_index(drop=False)
)

# Preview the first rows of the result
gdf_modified_fixed.head(10)


Unnamed: 0,index,geoid,Area,statefp,StateAb,stusab,State,state_name,countyfp,namelsad,CountyTown,AreaName,geometry
0,0,1001,33860.0,1,AL,AL,Alabama,Alabama,1,Autauga County,Autauga County,"Montgomery, AL","POLYGON ((-86.90310 32.54063, -86.90311 32.540..."
1,1,1003,19300.0,1,AL,AL,Alabama,Alabama,3,Baldwin County,Baldwin County,"Daphne-Fairhope-Foley, AL","POLYGON ((-87.99068 30.55549, -87.99051 30.560..."
2,2,1005,100004.0,1,AL,AL,Alabama,Alabama,5,Barbour County,Barbour County,Southeast Alabama nonmetropolitan area,"POLYGON ((-85.42982 32.04598, -85.42985 32.046..."
3,3,1007,13820.0,1,AL,AL,Alabama,Alabama,7,Bibb County,Bibb County,"Birmingham-Hoover, AL","POLYGON ((-87.31226 33.08622, -87.31218 33.087..."
4,4,1009,13820.0,1,AL,AL,Alabama,Alabama,9,Blount County,Blount County,"Birmingham-Hoover, AL","POLYGON ((-86.74919 33.99760, -86.74902 33.997..."
5,5,1011,100004.0,1,AL,AL,Alabama,Alabama,11,Bullock County,Bullock County,Southeast Alabama nonmetropolitan area,"POLYGON ((-85.92630 32.05468, -85.93051 32.054..."
6,6,1013,100004.0,1,AL,AL,Alabama,Alabama,13,Butler County,Butler County,Southeast Alabama nonmetropolitan area,"POLYGON ((-86.90590 31.75304, -86.90615 31.767..."
7,7,1015,11500.0,1,AL,AL,Alabama,Alabama,15,Calhoun County,Calhoun County,"Anniston-Oxford-Jacksonville, AL","POLYGON ((-85.82158 33.94978, -85.81807 33.948..."
8,8,1017,100002.0,1,AL,AL,Alabama,Alabama,17,Chambers County,Chambers County,Northeast Alabama nonmetropolitan area,"POLYGON ((-85.59333 32.98430, -85.59334 32.984..."
9,9,1019,100002.0,1,AL,AL,Alabama,Alabama,19,Cherokee County,Cherokee County,Northeast Alabama nonmetropolitan area,"POLYGON ((-85.78947 34.08630, -85.78959 34.086..."


## Review the types returned when accessing rows: Series vs. GeoDataFrame

In [52]:
# Selecting one row by integer position with .iloc gives a pandas Series
type(gdf_modified_fixed.iloc[5])

pandas.core.series.Series

In [53]:
# Selecting a slice of rows with .iloc gives a GeoDataFrame
type(gdf_modified_fixed.iloc[5:10])

geopandas.geodataframe.GeoDataFrame

## Write the final shapefile

In [62]:
# Write the sorted, re-indexed frame as the final shapefile
save_shapefile_within_dir(
    gdf=gdf_modified_fixed,
    shapefile_name='us-county-boundaries-geography-modified-fixed-final',
)

Shapefile saved at: C://Projects//OFLC//Data//us-county-boundaries-geography-modified-fixed-final/us-county-boundaries-geography-modified-fixed-final.shp
