# Project : Chicago Luxury Effect


# EDA

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import warnings
warnings.filterwarnings("ignore")
from eda import *

In [17]:
df = pd.read_csv('data/ebd_US-IL_200801_201212_relJan-2024.txt',sep='\t')

## Filtering eBird Dataset

In [18]:
# Filter columns
req_cols = ['CATEGORY', 'COMMON NAME', 'SCIENTIFIC NAME', 'OBSERVATION COUNT', 'EXOTIC CODE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'PROTOCOL TYPE', 'ALL SPECIES REPORTED']
df = filter_columns(df,req_cols)

We will keep only species level observations (removing subspecies and genus level observations). We will also filter out incomplete checklists and incidental observations to manage bias towards specific species.

In [19]:
df = filter_rows(df)
df.head()

Unnamed: 0,CATEGORY,COMMON NAME,SCIENTIFIC NAME,OBSERVATION COUNT,EXOTIC CODE,LATITUDE,LONGITUDE,OBSERVATION DATE,PROTOCOL TYPE,ALL SPECIES REPORTED
1,species,American Crow,Corvus brachyrhynchos,X,,38.850907,-89.256706,2008-01-01,Traveling,1
2,species,American Goldfinch,Spinus tristis,X,,38.850907,-89.256706,2008-01-01,Traveling,1
3,species,American Kestrel,Falco sparverius,X,,38.850907,-89.256706,2008-01-01,Traveling,1
4,species,Bald Eagle,Haliaeetus leucocephalus,X,,38.850907,-89.256706,2008-01-01,Traveling,1
5,species,Blue Jay,Cyanocitta cristata,X,,38.850907,-89.256706,2008-01-01,Traveling,1


## ebird Dataset Transformation

In [20]:
# Native column: 1 = is native to chicago, 0 = not native to chicago
df['NATIVE'] = df['EXOTIC CODE'].apply(lambda row:0 if np.isnan(row) else 1)

In [21]:
# Assume all 'X' observations have a count of 1 bird
df['COUNT'] = df['OBSERVATION COUNT'].apply(lambda row: 1 if row == 'X' else row)

In [22]:
# remove unnecessary columns
req_cols = ['COMMON NAME', 'SCIENTIFIC NAME', 'NATIVE', 'COUNT', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE']
df = filter_columns(df,req_cols)
df.head(5)

Unnamed: 0,COMMON NAME,SCIENTIFIC NAME,NATIVE,COUNT,LATITUDE,LONGITUDE,OBSERVATION DATE
1,American Crow,Corvus brachyrhynchos,1,1,38.850907,-89.256706,2008-01-01
2,American Goldfinch,Spinus tristis,1,1,38.850907,-89.256706,2008-01-01
3,American Kestrel,Falco sparverius,1,1,38.850907,-89.256706,2008-01-01
4,Bald Eagle,Haliaeetus leucocephalus,1,1,38.850907,-89.256706,2008-01-01
5,Blue Jay,Cyanocitta cristata,1,1,38.850907,-89.256706,2008-01-01


## Aggregate eBird data based on neighborhood

In [24]:
com_areas = gpd.read_file('data/neighborhoods/geo_export_f5325bf0-9c6d-49a5-a5d9-0e5bf24fa856.shp')

In [25]:
com_areas = filter_columns(com_areas,['community','geometry'])
com_areas.head()

Unnamed: 0,community,geometry
0,DOUGLAS,"POLYGON ((-87.60914 41.84469, -87.60915 41.844..."
1,OAKLAND,"POLYGON ((-87.59215 41.81693, -87.59231 41.816..."
2,FULLER PARK,"POLYGON ((-87.62880 41.80189, -87.62879 41.801..."
3,GRAND BOULEVARD,"POLYGON ((-87.60671 41.81681, -87.60670 41.816..."
4,KENWOOD,"POLYGON ((-87.59215 41.81693, -87.59215 41.816..."


In [26]:
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(df['LONGITUDE'], df['LATITUDE'])]
geo_df = gpd.GeoDataFrame(df, geometry=geometry)
geo_df.head()

Unnamed: 0,COMMON NAME,SCIENTIFIC NAME,NATIVE,COUNT,LATITUDE,LONGITUDE,OBSERVATION DATE,geometry
1,American Crow,Corvus brachyrhynchos,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091)
2,American Goldfinch,Spinus tristis,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091)
3,American Kestrel,Falco sparverius,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091)
4,Bald Eagle,Haliaeetus leucocephalus,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091)
5,Blue Jay,Cyanocitta cristata,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091)


In [29]:
ebird_gdf = gpd.sjoin(geo_df, com_areas, how='left', op='within')
ebird_gdf.head()

Unnamed: 0,COMMON NAME,SCIENTIFIC NAME,NATIVE,COUNT,LATITUDE,LONGITUDE,OBSERVATION DATE,geometry,index_right,community
1,American Crow,Corvus brachyrhynchos,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091),,
2,American Goldfinch,Spinus tristis,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091),,
3,American Kestrel,Falco sparverius,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091),,
4,Bald Eagle,Haliaeetus leucocephalus,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091),,
5,Blue Jay,Cyanocitta cristata,1,1,38.850907,-89.256706,2008-01-01,POINT (-89.25671 38.85091),,


In [30]:
# remove observations outside of Chicago and also remove more unnecessary columns
ebird_gdf = ebird_gdf[ebird_gdf['community'].notna()]
ebird_gdf = ebird_gdf.drop(columns=["index_right","LATITUDE","LONGITUDE","OBSERVATION DATE", "COMMON NAME"])

In [31]:
ebird_gdf['COUNT'] = ebird_gdf['COUNT'].astype(int)

In [32]:
# aggregate 
def agg_comm(series):
    return list(series)

grouped = ebird_gdf.groupby('community').agg(agg_comm)
grouped['NATIVE'] = grouped['NATIVE'].apply(lambda x: sum(x))

In [33]:
grouped = grouped.drop(columns=['geometry'])
grouped.head()

Unnamed: 0_level_0,SCIENTIFIC NAME,NATIVE,COUNT
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALBANY PARK,"[Spinus tristis, Megaceryle alcyon, Cyanocitta...",1281,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, ..."
ARMOUR SQUARE,"[Turdus migratorius, Branta canadensis, Aegoli...",35,"[1, 1, 1, 1, 6, 3, 1, 1, 5, 2, 5, 1, 1, 1, 1, ..."
AUSTIN,"[Branta canadensis, Branta canadensis, Dryobat...",16232,"[375, 380, 1, 19, 2, 1, 6, 7, 1, 1, 48, 6, 1, ..."
AVONDALE,"[Corvus brachyrhynchos, Branta canadensis, Buc...",97,"[1, 45, 1, 12, 1, 1, 7, 3, 1, 1, 2, 34, 1, 77,..."
BELMONT CRAGIN,"[Setophaga ruticilla, Setophaga ruticilla, Tur...",123,"[2, 2, 10, 10, 1, 1, 2, 2, 1, 1, 2, 2, 10, 10,..."


In [34]:
com_areas = com_areas.to_crs(epsg=3857)
com_areas["Area"] = com_areas['geometry'].area / 10 ** 6

In [35]:
com_area_size = com_areas[["community","Area"]]
grouped = grouped.merge(com_area_size,on="community",how="left")
grouped.head(5)

Unnamed: 0,community,SCIENTIFIC NAME,NATIVE,COUNT,Area
0,ALBANY PARK,"[Spinus tristis, Megaceryle alcyon, Cyanocitta...",1281,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, ...",9.004359
1,ARMOUR SQUARE,"[Turdus migratorius, Branta canadensis, Aegoli...",35,"[1, 1, 1, 1, 6, 3, 1, 1, 5, 2, 5, 1, 1, 1, 1, ...",4.651178
2,AUSTIN,"[Branta canadensis, Branta canadensis, Dryobat...",16232,"[375, 380, 1, 19, 2, 1, 6, 7, 1, 1, 48, 6, 1, ...",33.432414
3,AVONDALE,"[Corvus brachyrhynchos, Branta canadensis, Buc...",97,"[1, 45, 1, 12, 1, 1, 7, 3, 1, 1, 2, 34, 1, 77,...",9.289855
4,BELMONT CRAGIN,"[Setophaga ruticilla, Setophaga ruticilla, Tur...",123,"[2, 2, 10, 10, 1, 1, 2, 2, 1, 1, 2, 2, 10, 10,...",18.324439


## Join eBird and census datasets

In [36]:
census_df = pd.read_csv("data/Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012_20240228.csv")

In [37]:
census_df['COMMUNITY AREA NAME'] = census_df['COMMUNITY AREA NAME'].str.upper()

In [None]:
final_df = census_df.merge(grouped, left_on='COMMUNITY AREA NAME', right_on='community')
final_df = final_df.rename(columns={'PER CAPITA INCOME ': 'PER CAPITA INCOME'})

In [40]:
final_df["PER CAPITA INCOME IN K"] = final_df.apply(lambda x: x["PER CAPITA INCOME"] / 1000, axis=1)
final_df["CountPerSqkm"] = final_df.apply(lambda x: x["NATIVE"] / x["Area"],axis=1)

In [41]:
final_df["PovertyFlag"] = final_df.apply(lambda x: "Poor" if x["PER CAPITA INCOME"] < 40000 else "Rich", axis=1)

In [44]:
def shannon_index(species_abundance):
    total_count = sum(species_abundance)
    proportions = [count / total_count for count in species_abundance]
    shannon_index = -sum(p * np.log(p) for p in proportions if p != 0)
    return shannon_index

In [45]:
final_df["shannon_index"] = final_df["COUNT"].apply(shannon_index)
final_df.head()

Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX,community,SCIENTIFIC NAME,NATIVE,COUNT,Area,PER CAPITA INCOME IN K,CountPerSqkm,PovertyFlag,shannon_index
0,1.0,ROGERS PARK,7.7,23.6,8.7,18.2,27.5,23939,39.0,ROGERS PARK,"[Spinus tristis, Turdus migratorius, Turdus mi...",676,"[1, 46, 4, 9, 1, 3, 1, 9, 12, 1, 2, 1, 2, 1, 7...",8.631616,23.939,78.316735,Poor,5.498251
1,2.0,WEST RIDGE,7.8,17.2,8.8,20.8,38.5,23040,46.0,WEST RIDGE,"[Sturnus vulgaris, Aquila chrysaetos, Passer d...",255,"[7, 1, 5, 4, 100, 2, 2, 2, 1, 1, 1, 25, 25, 25...",16.57033,23.04,15.388951,Poor,4.148305
2,3.0,UPTOWN,3.8,24.0,8.9,11.8,22.2,35787,20.0,UPTOWN,"[Corvus brachyrhynchos, Corvus brachyrhynchos,...",75132,"[5, 7, 3, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...",10.946406,35.787,6863.622752,Poor,9.626278
3,4.0,LINCOLN SQUARE,3.4,10.9,8.2,13.4,25.5,37524,17.0,LINCOLN SQUARE,"[Spinus tristis, Acanthis flammea, Junco hyema...",3751,"[1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",12.002108,37.524,312.528443,Poor,7.116321
4,5.0,NORTH CENTER,0.3,7.5,5.2,4.5,26.2,57123,6.0,NORTH CENTER,"[Corvus brachyrhynchos, Spinus tristis, Setoph...",1432,"[1, 7, 1, 28, 22, 4, 2, 5, 2, 1, 3, 2, 6, 2, 1...",9.588837,57.123,149.340322,Rich,6.540097


In [47]:
# json_data = final_df.to_json(orient='records', indent=4)
# # Write JSON data to a file and use that file for further development
# with open('data/final_data.json', 'w') as f:
#     f.write(json_data)