In [3]:
# Install the third-party packages this notebook imports (geopandas, pandas).
# NOTE(review): versions are not pinned here — consider pinning for reproducibility.
%pip install geopandas pandas

Note: you may need to restart the kernel to use updated packages.


# Dataset Preprocessing

### 1. Load CSV

In [28]:
import pandas as pd
import os
import geopandas as gpd

# Read the tab-separated occurrence table from the local data/ folder.
# NOTE(review): the recorded output of this cell shows pandas emitted a warning
# pointing at the read_csv call (possibly a mixed-dtype warning) — confirm and
# consider passing explicit dtypes.
filename = "GBIF_CardellinaPusilla.csv"
csv_path = f"data/{filename}"
df = pd.read_csv(
    csv_path,
    sep='\t',
    lineterminator='\n',
)

  df = pd.read_csv(f"data/{filename}", sep='\t', lineterminator='\n')


### 2. Remove and rename columns from CSV

In [33]:
# Show every column present in the raw table.
column_headers = df.columns.tolist()
print("Raw Column Headers:", column_headers)

# Restrict the table to the columns listed in use_cols.
use_cols = [
    'occurrenceID',
    'basisOfRecord',
    'eventDate',
    'kingdom',
    'scientificName',
    'taxonRank',
    'decimalLatitude',
    'decimalLongitude',
    'countryCode',
    'individualCount',
]
df_trim = df.loc[:, use_cols]

column_headers = df_trim.columns.tolist()
print("\nTrimmed Column Headers:", column_headers)

# Shapefile attribute names are limited to 10 characters, so the longer
# headers are shortened before export.
shp_column_names = {
    'occurrenceID': 'occID',
    'basisOfRecord': 'basis',
    'scientificName': 'sciName',
    'decimalLatitude': 'lat',
    'decimalLongitude': 'lon',
    'countryCode': 'country',
    'individualCount': 'indivCount',
}
df_shp = df_trim.rename(columns=shp_column_names)

column_headers = df_shp.columns.tolist()
print("Renamed Column Headers:", column_headers)

Raw Column Headers: ['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'infraspecificEpithet', 'taxonRank', 'scientificName', 'verbatimScientificName', 'verbatimScientificNameAuthorship', 'countryCode', 'locality', 'stateProvince', 'occurrenceStatus', 'individualCount', 'publishingOrgKey', 'decimalLatitude', 'decimalLongitude', 'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation', 'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day', 'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord', 'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber', 'identifiedBy', 'dateIdentified', 'license', 'rightsHolder', 'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted', 'mediaType', 'issue']

Trimmed Column Headers: ['occurrenceID', 'basisOfRecord', 'eventDate', 'kingdom', 'scientificName', 'taxonRank', 'decimalLatitude', 'decimalLongitude', 'countryCode', 'individualCount']
Rename

### 3. Create geopandas dataframe

In [34]:
# Build a point GeoDataFrame from the occurrence coordinates.
# points_from_xy takes x first and y second; for geographic coordinates in
# EPSG:4326 that means x = longitude and y = latitude. Passing latitude as x
# (as this cell previously did) transposes every point.
pa = gpd.GeoDataFrame(
    df_shp,
    geometry=gpd.points_from_xy(x=df_shp['lon'], y=df_shp['lat']),
    crs='EPSG:4326',
)

# Inspect the first few rows
print(pa.head())


                                               occID              basis  \
0  https://www.inaturalist.org/observations/13947...  HUMAN_OBSERVATION   
1  https://www.inaturalist.org/observations/10616...  HUMAN_OBSERVATION   
2  https://www.inaturalist.org/observations/11843...  HUMAN_OBSERVATION   
3  https://www.inaturalist.org/observations/34513725  HUMAN_OBSERVATION   
4  https://www.inaturalist.org/observations/12732366  HUMAN_OBSERVATION   

          eventDate   kingdom                              sciName taxonRank  \
0  2022-09-28T12:24  Animalia  Cardellina pusilla (A.Wilson, 1811)   SPECIES   
1  2021-09-30T12:45  Animalia  Cardellina pusilla (A.Wilson, 1811)   SPECIES   
2  2022-05-23T20:54  Animalia  Cardellina pusilla (A.Wilson, 1811)   SPECIES   
3  2019-09-21T10:58  Animalia  Cardellina pusilla (A.Wilson, 1811)   SPECIES   
4  2018-05-20T08:13  Animalia  Cardellina pusilla (A.Wilson, 1811)   SPECIES   

         lat         lon country indivCount                     geom

### 4. Removing duplicates and NaN

We now check that there are no duplicate or `NaN` coordinates, as well as inspect the shapefile's attributes.

In [35]:
# Summarise the point layer: repeated geometries, missing geometries, CRS and size.
duplicate_mask = pa.duplicated(subset='geometry', keep='first')
missing_mask = pa['geometry'].isna()
n_rows, n_cols = pa.shape

print("number of duplicates: ", duplicate_mask.sum())
print("number of NA's: ", missing_mask.sum())
print(f"Coordinate reference system is: {pa.crs}")
print(f"{n_rows} observations with {n_cols} columns")

number of duplicates:  1352940
number of NA's:  0
Coordinate reference system is: EPSG:4326
1620779 observations with 11 columns


In [36]:
# Keep one record per distinct point geometry (the first occurrence is kept),
# then renumber the rows so the index is contiguous again.
species_distribution_unique = (
    pa
    .drop_duplicates(subset=['geometry'])
    .reset_index(drop=True)
)


In [37]:
# Re-run the same summary on the de-duplicated layer to confirm the clean-up.
remaining_duplicates = species_distribution_unique.duplicated(subset='geometry', keep='first')
remaining_missing = species_distribution_unique['geometry'].isna()
unique_rows, unique_cols = species_distribution_unique.shape

print("number of duplicates: ", remaining_duplicates.sum())
print("number of NA's: ", remaining_missing.sum())
print(f"Coordinate reference system is: {species_distribution_unique.crs}")
print(f"{unique_rows} observations with {unique_cols} columns")


number of duplicates:  0
number of NA's:  0
Coordinate reference system is: EPSG:4326
267838 observations with 11 columns


### 5. Export the trimmed shapefile

We export the de-duplicated occurrence points as a shapefile, which will then be imported into `inputs/` for the SDM model.

In [41]:
# Write the de-duplicated occurrences to an ESRI Shapefile.
# (Disabled alternative: a tab-separated copy via
#  to_csv("outputs/trimmed_CardellinaPusilla.csv", sep='\t').)
shapefile_path = 'data/CardellinaPusilla.shp'
species_distribution_unique.to_file(shapefile_path, driver='ESRI Shapefile')