## Data Cleaning
### 1. Clean housing data, school graduation rate data, and demographic data

In [None]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from pathlib import Path

In [None]:
# Clean housing data
# Read only the columns this cell uses from the tab-separated raw file
df = pd.read_csv(
    "raw_data/neighborhood_market_tracker.tsv000",
    sep='\t',
    usecols=['period_end', 'region', 'city', 'state', 'state_code',
             'median_sale_price', 'property_type'],
)

# Filter data: Chicago, IL rows with property type 'All Residential'.
# .copy() gives df_il its own data, so the column assignments below do not
# write into a slice of df (which raises SettingWithCopyWarning).
df_il = df[
    (df["state_code"] == "IL")
    & (df["city"] == "Chicago")
    & (df["property_type"] == 'All Residential')
].copy()

# Get only community area names by removing the 'Chicago, IL - ' text.
# regex=False: every pattern in this cell is a literal string, not a regex.
df_il['region'] = df_il['region'].str.replace('Chicago, IL - ', '', regex=False)
df_il = df_il.rename(columns={'region': 'name'})

# Normalize names (presumably so they match the names used by the other
# datasets -- confirm against the community area file). The step order is
# significant: 'The Loop' is replaced before title-casing.
df_il['name'] = df_il['name'].str.replace('The Loop', 'Loop', regex=False)
df_il['name'] = df_il['name'].str.title()
df_il['name'] = df_il['name'].str.replace('East Riverdale', 'Riverdale', regex=False)
df_il['name'] = df_il['name'].str.replace('West Rogers Park', 'Rogers Park', regex=False)
df_il['name'] = df_il['name'].str.replace('Belmont Gardens', 'Belmont Cragin', regex=False)

# Extract year and month from the period end date
df_il['period_end'] = pd.to_datetime(df_il['period_end'], format='%Y-%m-%d')
df_il['year'] = df_il['period_end'].dt.year
df_il['month'] = df_il['period_end'].dt.month

# Keep variables needed
df_il = df_il[['year', 'month', 'name', 'median_sale_price']]

# Convert to yearly data: median of the per-period medians for each year and name.
# sort_values returns a new frame, so the result must be assigned to take effect.
df_il = df_il.sort_values(by=['name', 'year', 'month'], ascending=True)
df_yearly = df_il.groupby(['year', 'name'], as_index=False)['median_sale_price'].median()

# Save the data for 2022
df_2022 = df_yearly[(df_yearly["year"] == 2022)]
df_2022.to_csv('cleaned_data/housing_price_2022.csv', index=False)

In [None]:
# Clean population data: title-case the 'name' column and save the result
df_pop = pd.read_csv("raw_data/population.csv").assign(
    name=lambda frame: frame['name'].str.title()
)
df_pop.to_csv('cleaned_data/population_lower.csv', index=False)

In [None]:
# Clean School data
# Load the school / community area table and the raw graduation counts
df_school = pd.read_csv("raw_data/school_community_area.csv")
df_grad = pd.read_csv("raw_data/graduation_raw.csv")

# Prepare for merging: 'Community Area' becomes a title-cased 'name' column
df_school = df_school.rename(columns={'Community Area': 'name'})
df_school['name'] = df_school['name'].str.title()

# Strip surrounding whitespace from both count columns and convert them to
# numbers; values that cannot be parsed become NaN (errors='coerce')
for count_col in ('graduates', 'number of students'):
    df_grad[count_col] = pd.to_numeric(df_grad[count_col].str.strip(), errors='coerce')

# Drop every row that has a missing value in any column
df_grad = df_grad.dropna()

# Attach the school table to each graduation row; the left join keeps all
# graduation rows, including those with no match on 'School ID'
df_merge = df_grad.merge(df_school, how='left', on='School ID')

# Number of missing values per column (shown as the cell output)
df_merge.isna().sum()

In [None]:
# Show the merged rows that have no community area name
missing_name_rows = df_merge[df_merge['name'].isnull()]
print(missing_name_rows) # Closed in 2023 (original author's note)

# Drop rows missing a value the calculation below needs. Restricting dropna to
# these columns keeps schools whose only gaps are in columns that are not used.
df_merge = df_merge.dropna(subset=['name', 'graduates', 'number of students'])

# Calculate graduation rate per community area: total graduates divided by
# total students across that area's rows, expressed as a percentage
df_group = df_merge.groupby(['name'], as_index=False)[['graduates', 'number of students']].sum()
df_group['graduation_rate'] = df_group['graduates'] / df_group['number of students'] * 100

# Keep only the name and the rate, then save
df_group = df_group[['name', 'graduation_rate']]
df_group.to_csv('cleaned_data/cleaned_graduation.csv', index=False)

### 2. Merge all the data and convert it to a GeoJSON file

In [None]:
# Read the cleaned datasets; housing and population load only the listed columns
house_cols = ['name', 'median_sale_price']
pop_cols = ['name', 'population']

df_house = pd.read_csv("cleaned_data/housing_price_2022.csv", usecols=house_cols)
df_graduation = pd.read_csv("cleaned_data/cleaned_graduation.csv")
df_pop = pd.read_csv("cleaned_data/population_lower.csv", usecols=pop_cols)

In [None]:
# Summary statistics of the graduation rate column (shown as the cell output)
df_graduation.loc[:, 'graduation_rate'].describe()

In [None]:
# Summary statistics of the median sale price column (shown as the cell output)
df_house.loc[:, 'median_sale_price'].describe()

In [None]:
# Load the CSV
df = pd.read_csv("raw_data/CommAreas_20241126.csv")

# Convert the WKT geometries to shapely objects
df['geometry'] = df['the_geom'].apply(wkt.loads)

# Convert to a GeoDataFrame
# NOTE(review): no CRS is set here; confirm the coordinates are WGS84
# longitude/latitude, which is what GeoJSON consumers expect.
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Select only necessary columns for GeoJSON output.
# .copy() makes the subset independent, so the 'name' assignment below does
# not write into a slice of the full frame (avoids SettingWithCopyWarning).
gdf = gdf[['COMMUNITY', 'geometry']].copy()

# Rename the column so it matches the 'name' key used by the other datasets
gdf = gdf.rename(columns={'COMMUNITY': 'name'})

# Change the 'name' column to title case
gdf['name'] = gdf['name'].str.title()

# Merge information; left joins keep every community area row even when a
# dataset has no matching name (those values are left missing)
gdf = gdf.merge(df_house, on="name", how="left")
gdf = gdf.merge(df_graduation, on="name", how="left")
gdf = gdf.merge(df_pop, on="name", how="left")

# Save as GeoJSON
gdf.to_file("cleaned_data/community_areas.geojson", driver="GeoJSON")