In [131]:
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import pickle
import typing

In [132]:
# Read the raw median-income table (path is relative to the notebook's
# working directory) and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/Median_Incomes.csv')
data.head(n=5)


Unnamed: 0,Location,Household Type,TimeFrame,DataFormat,Data,Fips
0,Riverdale,All Households,2005,Dollars,66429.87643,208
1,Williamsbridge,All Households,2005,Dollars,54603.07992,212
2,Throgs Neck,All Households,2005,Dollars,60436.77167,210
3,Pelham Parkway,All Households,2005,Dollars,55273.71398,211
4,Morrisania,All Households,2005,Dollars,25750.6987,203


In [133]:
# List the distinct household categories present in the raw table.
data.loc[:, 'Household Type'].unique()

array(['All Households', 'Families', 'Families with Children',
       'Families without Children'], dtype=object)

In [134]:
# Report the raw table's (rows, columns) before any cleaning.
print(tuple(data.shape))

(4160, 6)


In [135]:
# Discard every row that has at least one missing value, then show the
# resulting (rows, columns) so the effect of the drop is visible.
data = data.dropna(axis=0, how='any').copy()
data.shape

(4160, 6)

In [136]:
# List the distinct values of the DataFormat column.
data.loc[:, 'DataFormat'].unique()

array(['Dollars'], dtype=object)

In [137]:
# Keep only the columns used by the income summary below.
df = data[['Household Type', 'Location', 'Data', 'TimeFrame']].copy()

# Cast the two label columns to pandas' string dtype. astype() returns a new
# frame, so the result has to be assigned back for the cast to take effect.
df = df.astype({'Household Type': 'string', 'Location': 'string'})

# Parse the income values and round them up to whole numbers.
# - astype(str) makes the whitespace strip work whether the CSV column was
#   loaded as text or as numbers (.str on a numeric column raises).
# - errors='coerce' turns unparseable entries into missing values.
# - The nullable 'Int64' dtype keeps those entries missing instead of storing
#   them as 0, so they are skipped by later means rather than pulling them down.
df['Data'] = np.ceil(
    pd.to_numeric(df['Data'].astype(str).str.strip(), errors='coerce')
).astype('Int64')
df.head()

Unnamed: 0,Household Type,Location,Data,TimeFrame
0,All Households,Riverdale,66430,2005
1,All Households,Williamsbridge,54604,2005
2,All Households,Throgs Neck,60437,2005
3,All Households,Pelham Parkway,55274,2005
4,All Households,Morrisania,25751,2005


In [138]:
# Give the value and year columns the names used by the rest of the notebook:
# Data -> Yearly_income, TimeFrame -> Date.
df = df.rename(columns={"Data": "Yearly_income", "TimeFrame": "Date"})
df.head(n=5)

Unnamed: 0,Household Type,Location,Yearly_income,Date
0,All Households,Riverdale,66430,2005
1,All Households,Williamsbridge,54604,2005
2,All Households,Throgs Neck,60437,2005
3,All Households,Pelham Parkway,55274,2005
4,All Households,Morrisania,25751,2005


In [139]:
# List the distinct values of the Date column.
df.loc[:, 'Date'].unique()

array([2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2021], dtype=int64)

In [140]:
# Restrict to rows whose Date is strictly greater than 2012.
filtered_df = df[df['Date'] > 2012]

# Calculate the average income for each combination of Household Type and Location
average_income_by_group = (
    filtered_df.groupby(['Household Type', 'Location'])['Yearly_income']
    .mean()
    .reset_index()
    .rename(columns={'Yearly_income': 'Avg_Yearly_Income_Above_2012'})
)

# Merge the calculated averages back onto every row of df (left join, so
# groups with no rows after 2012 get a missing average).
# An average column left over from a previous run of this cell is dropped
# first; otherwise a re-run would produce duplicated `_x` / `_y` columns.
df = (
    df.drop(columns=['Avg_Yearly_Income_Above_2012'], errors='ignore')
    .merge(average_income_by_group, on=['Household Type', 'Location'], how='left')
)

# Display the updated DataFrame
df.head()

Unnamed: 0,Household Type,Location,Yearly_income,Date,Avg_Yearly_Income_Above_2012
0,All Households,Riverdale,66430,2005,63444.125
1,All Households,Williamsbridge,54604,2005,57027.125
2,All Households,Throgs Neck,60437,2005,61644.875
3,All Households,Pelham Parkway,55274,2005,54912.375
4,All Households,Morrisania,25751,2005,25512.625


In [141]:
# Show (rows, columns) of df at this point in the notebook.
(len(df.index), len(df.columns))

(4160, 5)

In [142]:
# Reduce df to one row per Location holding only the averaged income.
# Selecting the two wanted columns (instead of dropping the others in place)
# lets this cell be re-run without a KeyError on already-removed columns.
# NOTE(review): keep='first' retains whichever household type appears first
# for each Location in the current row order -- presumably 'All Households';
# confirm against the source file's ordering.
df = df[['Location', 'Avg_Yearly_Income_Above_2012']].drop_duplicates(
    subset=['Location'], keep='first'
)

df.head()


Unnamed: 0,Location,Avg_Yearly_Income_Above_2012
0,Riverdale,63444.125
1,Williamsbridge,57027.125
2,Throgs Neck,61644.875
3,Pelham Parkway,54912.375
4,Morrisania,25512.625


In [143]:
# Preview the first five rows of df before exporting.
df.head(n=5)

Unnamed: 0,Location,Avg_Yearly_Income_Above_2012
0,Riverdale,63444.125
1,Williamsbridge,57027.125
2,Throgs Neck,61644.875
3,Pelham Parkway,54912.375
4,Morrisania,25512.625


In [145]:

# Make sure the export directory exists so the write does not fail on a
# fresh checkout, then save df without the index column.
os.makedirs('../exports', exist_ok=True)
df.to_csv('../exports/Income.csv', index=False)
