In [33]:
# Import all libraries and set path variables
import numpy as np
import pandas as pd
from restaurant_index import restaurant_index
from crime_index import crime_index
from transit_index import transit_index
from housing_index import housing_index
from health_index import health_index
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler

# --- User-set variables ---
# Prefix prepended to every raw input CSV name handed to the index builders.
path = "team104final/Data/"
# Prefix under which the per-index CSVs are written and later read back.
test_files = "Test_files/"
# Set to True to rebuild every index from the raw data before the
# pre-processed CSVs are loaded.
RAW_RUN = False


RAW RUN 

In [34]:
## RAW RUN: rebuild each individual index from the raw data files and cache
## every result as a CSV under `test_files`.

if RAW_RUN:
    # Restaurant inspections
    restaurant_index_df = restaurant_index(
        path + "DOHMH_New_York_City_Restaurant_Inspection_Results_20231025.csv"
    )
    restaurant_index_df.to_csv(test_files + 'restaurant_index_df.csv', index=False)

    # Crime
    crime_index_df = crime_index(path + "Crime_Map_.csv")
    crime_index_df.to_csv(test_files + 'crime_index_df.csv', index=False)

    # Transit (subway entrances, bus shelters, bicycle parking)
    transit_index_df = transit_index(
        subway_data=path + 'MTA_NYCT_Subway_Entrances_and_Exits__2015_20231113.csv',
        bus_data=path + 'Bus_Stop_Shelter.csv',
        bike_data=path + 'BicycleParking.csv',
    )
    # NOTE(review): unlike the other exports, this one also writes the
    # DataFrame index to the CSV -- confirm that is intentional.
    transit_index_df.to_csv(test_files + 'transit_index_df.csv')

    # Health
    health_index_df = health_index(
        water_inspection_data=path + "Water_inspection_data.csv",
        Rodent_Inspection_data=path + "Rodent_Inspection_data.csv",
        Influenza_Pneumonia_data=path + "Influenza_Pneumonia_data.csv",
        EMS_Incident_data=path + "EMS_Incident_data.csv",
    )
    health_index_df.to_csv(test_files + "health_index_df.csv", index=False)

    # Housing
    housing_index_df = housing_index(
        projects_data=path + 'projects_data.csv',
        buildings_data=path + "buildings_data.csv",
        rent_data=path + 'rent_data.csv',
        violations_data=path + "violations_data.csv",
    )
    housing_index_df.to_csv(test_files + "housing_index_df.csv", index=False)

    ## TRANSIT WITH CENSUS_GEOID (alternative inputs, currently disabled)
    # transit_index_df=transit_index(subway_data="Test_files/subway_entrances.csv", bus_data="Test_files/bus_stops.csv", bike_data="Test_files/bike_parking.csv")
    ############




Pre-Processed Run 

In [35]:
## Load the pre-processed per-index CSVs saved inside "test_files" and reduce
## each one to the tract id plus a uniformly named index column.

def _load_index(csv_name, score_col, index_col):
    """Read one cached index CSV and keep only the tract id and its score.

    csv_name  -- file name appended to ``test_files``
    score_col -- column of the CSV holding the index value
    index_col -- name that value gets in the returned frame
    """
    frame = pd.read_csv(test_files + csv_name)
    frame[index_col] = frame[score_col]
    return frame[['census_tract_geoid', index_col]]


restaurant_index_df = _load_index('restaurant_index_df.csv', 'linear_equation', 'restaurant_index')
crime_index_df = _load_index("crime_index_df.csv", 'StandardizedScore', 'crime_index')
transit_index_df = _load_index("transit_index_df.csv", 'index_score', 'transit_index')
health_index_df = _load_index("health_index_df.csv", 'Health_Index', 'health_index')
housing_index_df = _load_index("housing_index_df.csv", 'index_score', 'housing_index')


Joins

In [36]:
# Outer-join the five index tables on the census tract id, so a tract that is
# absent from one source is still kept (with NaN for that index).
tract_join = dict(on='census_tract_geoid', how='outer')
join1 = restaurant_index_df.merge(crime_index_df, **tract_join)
join2 = join1.merge(transit_index_df, **tract_join)
join3 = join2.merge(health_index_df, **tract_join)
join4 = join3.merge(housing_index_df, **tract_join)

# Missing-value count per column after the joins.
join4.isna().sum()

census_tract_geoid       0
restaurant_index       360
crime_index              2
transit_index          302
health_index          2031
housing_index           73
dtype: int64

In [37]:
# Keep only the rows with fewer than 3 missing values, i.e. drop every row
# that has 3 or more NaN entries.
# The NaN count per row is computed in a single vectorized pass rather than
# with a Python loop over .iloc, which selects exactly the same rows.
na_per_row = join4.isna().sum(axis=1)
rows_underThree_na = join4[na_per_row < 3]

# Missing-value count per column among the rows that were kept.
rows_underThree_na.isnull().sum(axis = 0)


census_tract_geoid       0
restaurant_index       190
crime_index              0
transit_index          142
health_index          1852
housing_index           20
dtype: int64

In [38]:
# Fill the remaining gaps in the index columns with IterativeImputer, floor the
# result at 0 and write the completed table to CSV.
missing_data = ['restaurant_index', 'health_index', 'transit_index', 'crime_index', 'housing_index']
imp = IterativeImputer(max_iter=10, random_state=0)

final_noNa = rows_underThree_na.copy()
final_noNa[missing_data] = imp.fit_transform(final_noNa[missing_data])

# Any value that is negative after imputation is set to 0.
final_index = final_noNa.copy()
final_index[missing_data] = final_index[missing_data].clip(lower=0)
final_index.to_csv(test_files + 'Final/final_index.csv', index=False)

# Summary of the final table, followed by the column means of each source
# table as loaded (before joining and imputing).
print(final_index.describe())
for source_df in (restaurant_index_df, health_index_df, crime_index_df, transit_index_df, housing_index_df):
    print(source_df.mean())



       census_tract_geoid  restaurant_index  crime_index  transit_index  \
count        2.147000e+03       2147.000000  2147.000000    2147.000000   
mean         3.605447e+10          0.513076     0.082289       0.079448   
std          2.602874e+07          0.070383     0.074766       0.088721   
min          3.600500e+10          0.000000     0.001273       0.000000   
25%          3.604702e+10          0.484677     0.033744       0.024336   
50%          3.604712e+10          0.513406     0.059635       0.053097   
75%          3.608103e+10          0.540808     0.106537       0.101770   
max          3.608503e+10          1.000000     1.000000       1.000000   

       health_index  housing_index  
count   2147.000000    2147.000000  
mean       0.057052       0.153622  
std        0.041350       0.149094  
min        0.000000       0.000000  
25%        0.054743       0.045284  
50%        0.055847       0.105861  
75%        0.057743       0.214713  
max        1.000000       1.

NULL evaluation 

In [39]:
null_evaluation_run=True

# Number of mask-and-impute repetitions whose error is averaged.
EVAL_RUNS = 10
# Seed for the random masking so the reported MSE is reproducible.
EVAL_SEED = 0

if null_evaluation_run:
    # Imputation-error estimate: hide values of the completed table at the
    # same rate at which each index was missing before imputation, impute them
    # again and compare against the values that were hidden.
    #
    # NOTE(review): the imputer below is fitted on four columns only, while
    # the production imputation also used crime_index as a predictor --
    # confirm this difference is intended.
    # NOTE(review): the squared errors are summed over all four columns and
    # divided by the number of ROWS with at least one hidden value, not by the
    # number of hidden cells -- confirm this is the metric you want to report.

    # Kept from the original cell; it only changes how frames are displayed.
    pd.set_option('display.max_rows', None)

    # Indexes that get masked, in the order their random masks are drawn.
    masked_indexes = ['restaurant', 'transit', 'health', 'housing']

    # Share of missing values per index in the table that was imputed, derived
    # from the data instead of hard-coding the counts from a previous output.
    null_fracs = {
        name: rows_underThree_na[name + '_index'].isna().mean()
        for name in masked_indexes
    }

    # One generator shared by every draw: each sample() call advances it, so
    # the masks differ between columns and runs yet repeat on a re-run.
    rng = np.random.RandomState(EVAL_SEED)

    mse_loop_list = []
    for _ in range(EVAL_RUNS):
        # Shuffle the rows; reset_index() gives a fresh 0..n-1 index and keeps
        # the previous labels in an 'index' column.
        final_index_missing = final_index.sample(frac=1, random_state=rng).reset_index()

        # Hide a random subset of each index in a parallel "<name>_index_missing"
        # column (np.nan, since the np.NaN alias no longer exists in NumPy 2).
        for name in masked_indexes:
            hidden_rows = final_index_missing.sample(frac=null_fracs[name], random_state=rng).index
            is_hidden = final_index_missing.index.isin(hidden_rows)
            final_index_missing[name + '_index_missing'] = final_index_missing[name + '_index'].mask(is_hidden)

        # Rows in which at least one value was hidden.
        null_rows = final_index_missing[final_index_missing.isnull().any(axis=1)]
        max_rows = len(null_rows)

        # Re-impute the values that were just hidden.
        imp = IterativeImputer(max_iter=10, random_state=0)
        missing_data_eval = ['restaurant_index_missing', 'health_index_missing', 'transit_index_missing', 'housing_index_missing']
        final_index_missing[missing_data_eval] = imp.fit_transform(final_index_missing[missing_data_eval])

        # Squared error per cell, imputed value against the value that was hidden.
        for name in masked_indexes:
            final_index_missing[name + '_mse'] = (
                final_index_missing[name + '_index_missing'] - final_index_missing[name + '_index']
            ) ** 2

        total_sq_error = sum(final_index_missing[name + '_mse'].sum() for name in masked_indexes)
        mse_loop_list.append(total_sq_error / max_rows)

    mse_average = np.average(mse_loop_list)
    print("MSE: "+str(mse_average))

