In [57]:
!pip install gitpython




In [1]:
!pip install geopandas dbfread pyshp


Collecting dbfread
  Downloading dbfread-2.0.7-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading dbfread-2.0.7-py2.py3-none-any.whl (20 kB)
Installing collected packages: dbfread
Successfully installed dbfread-2.0.7


In [4]:
import zipfile
import os

# Unzip the file
zip_path = '/content/tl_2024_us_county (2).zip'  # Adjust if your file name is different
extract_path = '/content/tl_2024_us_county'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [5]:
import geopandas as gpd
import pandas as pd
from dbfread import DBF

# Path to the shapefile and DBF
shapefile_path = '/content/tl_2024_us_county/tl_2024_us_county.shp'
dbf_path = '/content/tl_2024_us_county/tl_2024_us_county.dbf'

# Load the shapefile for geometry and DBF for attributes
try:
    gdf = gpd.read_file(shapefile_path)
    washington_counties = gdf[gdf['STATEFP'] == '53']
    print("Filtered counties (geometry included):", washington_counties.shape)
    # Save the filtered shapefile
    washington_counties.to_file('/content/washington_counties.shp')
except Exception as e:
    print("Error with shapefile, falling back to DBF processing.")
    records = list(DBF(dbf_path))
    attributes_df = pd.DataFrame(records)
    washington_attributes = attributes_df[attributes_df['STATEFP'] == '53']
    print("Filtered counties (attributes only):", washington_attributes.shape)
    washington_attributes.to_csv('/content/washington_counties.csv', index=False)


Filtered counties (geometry included): (39, 19)


In [7]:
# Install the required libraries (if not already installed)
!pip install geopandas

import geopandas as gpd

# Define the file paths
input_shapefile = "/content/washington_counties.shp"  # Ensure this points to your .shp file
output_geopackage = "/content/washington_counties.gpkg"  # Output GeoPackage
output_geojson = "/content/washington_counties.geojson"  # Output GeoJSON

# Load the shapefile
gdf = gpd.read_file(input_shapefile)

# Reproject to WGS84 (EPSG:4326)
gdf = gdf.to_crs("EPSG:4326")

# Save as GeoPackage
gdf.to_file(output_geopackage, driver="GPKG")

# Save as GeoJSON
gdf.to_file(output_geojson, driver="GeoJSON")

# Print confirmation
print("Shapefile successfully reprojected and saved!")


Shapefile successfully reprojected and saved!


In [13]:
## Install the required library
!pip install geopandas pandas

import geopandas as gpd
import pandas as pd

# Define file paths
geojson_path = "/content/washington_counties.geojson"  # Path to your GeoJSON file
population_data_path = "/County-Wise_Population_Data.csv"  # Path to your Population CSV
income_data_path = "/County-Wise_Income_Data.csv"  # Path to your Income CSV

# Load the GeoJSON file
gdf = gpd.read_file(geojson_path)

# Load the Population and Income data
population_df = pd.read_csv(population_data_path)
income_df = pd.read_csv(income_data_path)

# Simplify the geometry to avoid errors
gdf['geometry'] = gdf['geometry'].simplify(0.01)

# Merge Population Data
merged_gdf = gdf.merge(population_df, left_on="NAME", right_on="County", how="left")

# Merge Income Data
merged_gdf = merged_gdf.merge(income_df, left_on="NAME", right_on="County", how="left")

# Save the merged GeoDataFrame as a new GeoJSON file
merged_geojson_path = "/content/washington_counties_merged.geojson"
merged_gdf.to_file(merged_geojson_path, driver="GeoJSON")

# Save the merged GeoDataFrame as a new GeoPackage file (Optional)
merged_geopackage_path = "/content/washington_counties_merged.gpkg"
merged_gdf.to_file(merged_geopackage_path, driver="GPKG")

# Print confirmation
print(f"Merged GeoJSON saved at: {merged_geojson_path}")
print(f"Merged GeoPackage saved at: {merged_geopackage_path}")


Merged GeoJSON saved at: /content/washington_counties_merged.geojson
Merged GeoPackage saved at: /content/washington_counties_merged.gpkg


In [19]:
import geopandas as gpd

# Load the original GeoPackage
validated_gdf = gpd.read_file("/Desktop.gpkg")

# Check the first few rows to confirm the data is loaded
print(validated_gdf.head())


  STATEFP COUNTYFP  COUNTYNS  GEOID         GEOIDFQ       NAME  \
0      53      069  01513275  53069  0500000US53069  Wahkiakum   
1      53      041  01531927  53041  0500000US53041      Lewis   
2      53      031  01531936  53031  0500000US53031  Jefferson   
3      53      013  01513273  53013  0500000US53013   Columbia   
4      53      037  01531926  53037  0500000US53037   Kittitas   

           NAMELSAD LSAD CLASSFP  MTFCC  ...     AWATER     INTPTLAT  \
0  Wahkiakum County   06      H1  G4020  ...   61564428  +46.2946377   
1      Lewis County   06      H1  G4020  ...   86929743  +46.5800714   
2  Jefferson County   06      H1  G4020  ...  994992501  +47.8057075   
3   Columbia County   06      H1  G4020  ...   12561759  +46.2928505   
4   Kittitas County   06      H1  G4020  ...   92507199  +47.1244411   

       INTPTLON Unnamed: 0_x  County_x  Population Unnamed: 0_y County_y  \
0  -123.4244583          NaN      None         NaN          NaN     None   
1  -122.3774443   

In [20]:
# Simplify the geometry
simplified_gdf = validated_gdf.copy()
simplified_gdf['geometry'] = simplified_gdf['geometry'].simplify(0.01)

# Confirm the simplification is applied
print(simplified_gdf.head())


  STATEFP COUNTYFP  COUNTYNS  GEOID         GEOIDFQ       NAME  \
0      53      069  01513275  53069  0500000US53069  Wahkiakum   
1      53      041  01531927  53041  0500000US53041      Lewis   
2      53      031  01531936  53031  0500000US53031  Jefferson   
3      53      013  01513273  53013  0500000US53013   Columbia   
4      53      037  01531926  53037  0500000US53037   Kittitas   

           NAMELSAD LSAD CLASSFP  MTFCC  ...     AWATER     INTPTLAT  \
0  Wahkiakum County   06      H1  G4020  ...   61564428  +46.2946377   
1      Lewis County   06      H1  G4020  ...   86929743  +46.5800714   
2  Jefferson County   06      H1  G4020  ...  994992501  +47.8057075   
3   Columbia County   06      H1  G4020  ...   12561759  +46.2928505   
4   Kittitas County   06      H1  G4020  ...   92507199  +47.1244411   

       INTPTLON Unnamed: 0_x  County_x  Population Unnamed: 0_y County_y  \
0  -123.4244583          NaN      None         NaN          NaN     None   
1  -122.3774443   

In [21]:
# Save the simplified GeoPackage
simplified_path = "/content/simplified_washington_counties.gpkg"
simplified_gdf.to_file(simplified_path, driver="GPKG")

print("Simplified file saved successfully!")


Simplified file saved successfully!


In [22]:
import os

# Check if the file exists
print("File exists:", os.path.exists(simplified_path))


File exists: True


In [23]:
from google.colab import files

# Download the saved GeoPackage
files.download(simplified_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
import geopandas as gpd

# Load the simplified GeoPackage
gdf = gpd.read_file("/content/simplified_washington_counties.gpkg")

# Check for invalid geometries
gdf['is_valid'] = gdf.is_valid

# Inspect invalid geometries
invalid_geometries = gdf[~gdf['is_valid']]
print("Invalid geometries:", invalid_geometries)


Invalid geometries: Empty GeoDataFrame
Columns: [STATEFP, COUNTYFP, COUNTYNS, GEOID, GEOIDFQ, NAME, NAMELSAD, LSAD, CLASSFP, MTFCC, CSAFP, CBSAFP, METDIVFP, FUNCSTAT, ALAND, AWATER, INTPTLAT, INTPTLON, Unnamed: 0_x, County_x, Population, Unnamed: 0_y, County_y, Median Income, geometry, is_valid]
Index: []

[0 rows x 26 columns]


In [25]:
validated_gdf.to_file("/content/final_validated_washington_counties.gpkg", driver="GPKG")


In [27]:
import geopandas as gpd
import pandas as pd

# Load the validated GeoPackage file
validated_gdf = gpd.read_file("/content/final_validated_washington_counties.gpkg")

# Load population and income data
population_df = pd.read_csv("/County-Wise_Population_Data.csv")
income_df = pd.read_csv("/County-Wise_Income_Data.csv")

# Inspect the columns to confirm merge keys
print(validated_gdf.columns)
print(population_df.columns)
print(income_df.columns)

# Merge population data with GeoDataFrame
merged_gdf = validated_gdf.merge(population_df, left_on="NAME", right_on="County", how="left")

# Drop redundant columns before merging income data
merged_gdf = merged_gdf.drop(columns=["County", "Unnamed: 0"])

# Merge income data with the resulting GeoDataFrame
merged_gdf = merged_gdf.merge(income_df, left_on="NAME", right_on="County", how="left")

# Save the final merged file
merged_gdf.to_file("/content/final_merged_washington_data.gpkg", driver="GPKG")
merged_gdf.to_file("/content/final_merged_washington_data.geojson", driver="GeoJSON")

print("Merged data saved successfully!")


Index(['STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'GEOIDFQ', 'NAME',
       'NAMELSAD', 'LSAD', 'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP',
       'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'Unnamed: 0_x',
       'County_x', 'Population', 'Unnamed: 0_y', 'County_y', 'Median Income',
       'geometry'],
      dtype='object')
Index(['Unnamed: 0', 'County', 'Population'], dtype='object')
Index(['Unnamed: 0', 'County', 'Median Income'], dtype='object')
Merged data saved successfully!


In [28]:
print(merged_gdf.isnull().sum())


STATEFP             0
COUNTYFP            0
COUNTYNS            0
GEOID               0
GEOIDFQ             0
NAME                0
NAMELSAD            0
LSAD                0
CLASSFP             0
MTFCC               0
CSAFP              19
CBSAFP             10
METDIVFP           36
FUNCSTAT            0
ALAND               0
AWATER              0
INTPTLAT            0
INTPTLON            0
Unnamed: 0_x       39
County_x           39
Population_x       39
Unnamed: 0_y       39
County_y           39
Median Income_x    39
geometry            0
Population_y       39
Unnamed: 0         39
County             39
Median Income_y    39
dtype: int64


In [29]:
# Inspect unique county names in each dataset
print(merged_gdf["NAME"].unique())  # From GeoDataFrame
print(population_df["County"].unique())  # From Population CSV
print(income_df["County"].unique())  # From Income CSV


['Wahkiakum' 'Lewis' 'Jefferson' 'Columbia' 'Kittitas' 'Snohomish'
 'Yakima' 'Okanogan' 'Spokane' 'Island' 'Clallam' 'Stevens' 'Pend Oreille'
 'Whatcom' 'Chelan' 'Asotin' 'Ferry' 'Douglas' 'Benton' 'Garfield' 'King'
 'Pacific' 'Klickitat' 'San Juan' 'Grays Harbor' 'Cowlitz' 'Walla Walla'
 'Grant' 'Adams' 'Clark' 'Lincoln' 'Kitsap' 'Whitman' 'Mason' 'Skagit'
 'Pierce' 'Thurston' 'Franklin' 'Skamania']
['Adams County, Washington' 'Asotin County, Washington'
 'Benton County, Washington' 'Chelan County, Washington'
 'Clallam County, Washington' 'Clark County, Washington'
 'Columbia County, Washington' 'Cowlitz County, Washington'
 'Douglas County, Washington' 'Ferry County, Washington'
 'Franklin County, Washington' 'Garfield County, Washington'
 'Grant County, Washington' 'Grays Harbor County, Washington'
 'Island County, Washington' 'Jefferson County, Washington'
 'King County, Washington' 'Kitsap County, Washington'
 'Kittitas County, Washington' 'Klickitat County, Washington'
 'Lewis C

In [30]:
# Standardize the County column in both population and income data
population_df["County"] = (
    population_df["County"]
    .str.replace(" County, Washington", "", regex=False)
    .str.strip()
)

income_df["County"] = (
    income_df["County"]
    .str.replace(" County, Washington", "", regex=False)
    .str.strip()
)


In [31]:
validated_gdf["NAME"] = validated_gdf["NAME"].str.strip().str.title()


In [33]:
# Drop conflicting columns from the GeoDataFrame (if present)
merged_gdf = merged_gdf.drop(columns=["County_x", "Unnamed: 0_x"], errors="ignore")


In [34]:
# Merge population data with the GeoDataFrame
merged_gdf = validated_gdf.merge(
    population_df, left_on="NAME", right_on="County", how="left", suffixes=("_geo", "_pop")
)

# Merge income data with the resulting GeoDataFrame
merged_gdf = merged_gdf.merge(
    income_df, left_on="NAME", right_on="County", how="left", suffixes=("_geo", "_inc")
)


In [35]:
print(merged_gdf.columns)


Index(['STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'GEOIDFQ', 'NAME',
       'NAMELSAD', 'LSAD', 'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP',
       'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'Unnamed: 0_x',
       'County_x', 'Population_geo', 'Unnamed: 0_y', 'County_y',
       'Median Income_geo', 'geometry', 'Unnamed: 0_geo', 'County_geo',
       'Population_pop', 'Unnamed: 0_inc', 'County_inc', 'Median Income_inc'],
      dtype='object')


In [36]:
# Keep only the necessary columns
final_gdf = merged_gdf[[
    "NAME", "Population_pop", "Median Income_inc", "geometry"
]]


In [37]:
# Rename the columns
final_gdf = final_gdf.rename(columns={
    "NAME": "County",
    "Population_pop": "Population",
    "Median Income_inc": "Median_Income"
})


In [38]:
print(final_gdf.isnull().sum())


County            0
Population        0
Median_Income    19
geometry          0
dtype: int64


In [39]:
# List counties with missing Median_Income
missing_counties = final_gdf[final_gdf["Median_Income"].isnull()]
print(missing_counties[["County", "Population", "Median_Income"]])


          County  Population  Median_Income
0      Wahkiakum        4573            NaN
2      Jefferson       33313            NaN
3       Columbia        3996            NaN
4       Kittitas       44736            NaN
7       Okanogan       42811            NaN
11       Stevens       47470            NaN
12  Pend Oreille       13811            NaN
15        Asotin       22424            NaN
16         Ferry        7326            NaN
17       Douglas       43733            NaN
19      Garfield        2326            NaN
21       Pacific       23750            NaN
22     Klickitat       23082            NaN
23      San Juan       18266            NaN
26   Walla Walla       62102            NaN
28         Adams       20690            NaN
30       Lincoln       11271            NaN
32       Whitman       47042            NaN
38      Skamania       12276            NaN


In [43]:
import os
print(os.listdir())  # Lists all files in the current directory


['.config', 'washington_counties_merged.gpkg', 'simplified_washington_counties.gpkg', 'washington_counties_merged.geojson', 'final_merged_washington_data.gpkg', 'washington_counties.dbf', 'washington_counties.gpkg', 'tl_2024_us_county (2).zip', 'washington_counties.cpg', 'washington_counties.prj', 'washington_counties.shp', 'tl_2024_us_county', 'washington_counties.geojson', 'final_merged_washington_data.geojson', 'washington_counties.shx', 'final_validated_washington_counties.gpkg', 'sample_data']


In [44]:
import os

# Replace 'final_dataset' with a part of your file's name
file_name = "final_dataset"
search_path = "/content"  # Replace with the root path of your environment

for root, dirs, files in os.walk(search_path):
    for file in files:
        if file_name in file:
            print("Found:", os.path.join(root, file))


In [45]:
final_gdf.to_csv('/content/final_dataset.csv', index=False)


In [51]:
import pandas as pd

# Define the missing median income data
missing_income_data = {
    "County": [
        "Wahkiakum", "Jefferson", "Columbia", "Kittitas", "Adams", "Asotin",
        "Douglas", "Ferry", "Klickitat", "Lincoln", "Okanogan", "Pacific",
        "Pend Oreille", "San Juan", "Skamania", "Stevens", "Walla Walla", "Whitman","Garfield"
    ],
    "Median_Income": [
        57091, 71143, 71528, 69928, 65042, 69107, 80374, 54650, 70400, 71227,
        60293, 62350, 63750, 83682, 90085, 67405, 72212, 52893,62411
    ]
}

# Convert missing income data into a DataFrame
missing_income_df = pd.DataFrame(missing_income_data)

# Load your existing dataset (final_gdf)
# Replace 'path_to_your_dataset' with the actual path to your GeoDataFrame file
final_gdf = pd.read_csv('/content/final_dataset.csv')  # Replace this with your dataset loading logic

# Merge the missing income data into the existing dataset
final_gdf = final_gdf.merge(
    missing_income_df,
    on="County",
    how="left",
    suffixes=('', '_updated')
)

# Update the "Median_Income" column with the new values where applicable
final_gdf["Median_Income"] = final_gdf["Median_Income"].combine_first(final_gdf["Median_Income_updated"])

# Drop the extra column used for updating
final_gdf = final_gdf.drop(columns=["Median_Income_updated"])

# Save the updated dataset
final_gdf.to_csv('path_to_updated_dataset.csv', index=False)  # Replace with the desired save path
print("Updated dataset saved successfully.")


Updated dataset saved successfully.


In [52]:
# Preview the first 5 rows
print(final_gdf.head())

# Check for null values
print(final_gdf.isnull().sum())

# Summary statistics
print(final_gdf.describe())


      County  Population  Median_Income  \
0  Wahkiakum        4573        57091.0   
1      Lewis       83925        67169.0   
2  Jefferson       33313        71143.0   
3   Columbia        3996        71528.0   
4   Kittitas       44736        69928.0   

                                            geometry  
0  POLYGON ((-123.727552 46.264495, -123.726557 4...  
1  POLYGON ((-122.932234 46.387406, -123.358334 4...  
2  POLYGON ((-122.952955 48.269798, -122.952386 4...  
3  POLYGON ((-118.24264 46.559747, -118.228013 46...  
4  POLYGON ((-120.002143 46.99732, -119.965804 46...  
County           0
Population       0
Median_Income    0
geometry         0
dtype: int64
         Population  Median_Income
count  3.900000e+01      39.000000
mean   1.984868e+05   76298.820513
std    4.013804e+05   14865.068437
min    2.326000e+03   52893.000000
25%    2.275300e+04   67195.000000
50%    6.696800e+04   71528.000000
75%    1.703155e+05   84024.000000
max    2.262713e+06  120824.000000


In [55]:
from shapely.wkt import loads
import geopandas as gpd

# Convert the 'geometry' column to valid Shapely geometry objects
final_gdf['geometry'] = final_gdf['geometry'].apply(loads)

# Convert the DataFrame to a GeoDataFrame
final_gdf = gpd.GeoDataFrame(final_gdf, geometry='geometry')

# Save as GeoPackage
final_gdf.to_file('final_cleaned_data.gpkg', driver='GPKG')

# Save as GeoJSON (optional)
final_gdf.to_file('final_cleaned_data.geojson', driver='GeoJSON')

print("GeoPackage and GeoJSON files have been successfully saved!")




GeoPackage and GeoJSON files have been successfully saved!


  write(
  write(


In [56]:
# Assign a CRS (e.g., EPSG:4326 for WGS84)
final_gdf.set_crs(epsg=4326, inplace=True)

# Save the GeoPackage
final_gdf.to_file('final_cleaned_data_with_crs.gpkg', driver='GPKG')

# Save the GeoJSON
final_gdf.to_file('final_cleaned_data_with_crs.geojson', driver='GeoJSON')

print("Files with CRS have been successfully saved!")


Files with CRS have been successfully saved!
