In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import geopandas as gpd
import statsmodels.formula.api as smf
from plotnine import *
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option("mode.chained_assignment", None)

# Read the processed GeoJSON; the path is relative to the notebook's directory.
ghg = gpd.read_file("../data/ghg_processed.geojson")

In [68]:
# Five-colour hex palette, kept in its original order.
colors = [
    "#00a5bd",
    "#e6e6e6",
    "#f5d855",
    "#ff8e52",
    "#000000",
]

In [85]:
# Boolean flags for the two extreme efficiency quintiles. The quintile labels
# are compared as strings ('5' and '1'), matching how the column is stored.
ghg["bottom_quint"] = ghg["efficiency_quintile"].eq('5')
ghg["top_quint"] = ghg["efficiency_quintile"].eq('1')

In [70]:
# List the available columns before choosing model predictors.
ghg.columns

Index(['bbl', 'name', 'property id', 'total_units', 'eui', 'bedrooms',
       'housing_fa', 'total_fa', 'co2_intensity', 'total_co2', 'latitude',
       'longitude', 'year_built', 'occupancy', 'direct_emissions',
       'indirect_emissions', 'total_e', 'avg_apt_size', 'e_per_apt',
       'co2_tons_sq', 'co2_per_apt', 'co2_per_bedroom', 'exceeds_2024_limit',
       'exceeds_2030_limit', 'decade_built', 'bedrooms_per_apt',
       'sqft_per_bedroom', 'direct_emission_pct', 'efficiency_quintile',
       'unit_co2_quintile', 'bedroom_co2_quintile', 'limit_category',
       'apt_size_cat', 'sqft_bedroom_cat', 'state', 'county', 'tract',
       'total_population', 'pop_in_poverty', 'pop_black', 'pop_hispanic',
       'med_hh_inc', 'geoid', 'income_bin', 'income_q', 'total_energy',
       'total_emissions', 'district_steam_emissions_pct',
       'natural_gas_emissions_pct', 'electricity_emissions_pct',
       'fuel_oil_emissions_pct', 'fuel_oil_energy_pct',
       'district_steam_energy_pct', 

In [75]:
# Predictor columns fed to the classifier. The order matters: the raw
# importance array printed after training is positional and follows this list.
pred_cols = [
    "total_units",
    "bedrooms",
    "housing_fa",
    "total_fa",
    "year_built",
    "occupancy",
    "avg_apt_size",
    "pop_in_poverty",
    "pop_black",
    "pop_hispanic",
    "med_hh_inc",
    "district_steam_emissions_pct",
    "natural_gas_emissions_pct",
    "electricity_emissions_pct",
    "value_per_apt",
    "value_per_sqft",
    "ever_altered",
    "altered_twice_or_more",
    "electricity_energy_pct",
    "fuel_oil_energy_pct",
    "natural_gas_energy_pct",
]

In [91]:
# Modelling frame: predictors plus the target flag, keeping only rows where
# every one of those columns is present.
model_columns = [*pred_cols, "bottom_quint"]
mod_dat = ghg[model_columns].dropna(axis=0, how="any")

In [92]:
# Split the modelling frame into the feature matrix and the boolean target.
X = mod_dat.loc[:, pred_cols]
y = mod_dat.loc[:, "bottom_quint"]

In [93]:
# Sanity check: a result of True means at least one positive example survived dropna().
y.max()

True

In [94]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

# XGBoost classifier with otherwise-default hyperparameters, fitted on the training rows.
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Precision and recall of the positive ("bottom_quint" is True) class on the test set.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Precision: {precision * 100.0}%")
print(f"Recall: {recall * 100.0}%")

# Raw importance scores, in the same positional order as the feature columns.
importance = model.feature_importances_
print("Feature Importances:", importance)

Precision: 72.13822894168467%
Recall: 52.106084243369736%
Feature Importances: [0.02961544 0.03014297 0.0339556  0.03498751 0.03554274 0.02732034
 0.05836203 0.03022451 0.02914496 0.03667387 0.02985216 0.03600077
 0.02805234 0.08681616 0.0269373  0.03128026 0.02319843 0.0234968
 0.05173661 0.26097012 0.05568908]


In [95]:
# Label each importance score with its predictor name and rank the predictors
# from most to least important. This assumes the scores are ordered like
# X_train.columns, which is the frame the model was fitted on.
# (pandas is already imported at the top of the notebook, so no re-import here.)
feature_importances = (
    pd.DataFrame(
        model.feature_importances_,
        index=X_train.columns,
        columns=['importance'],
    )
    .sort_values('importance', ascending=False)
)

print(feature_importances)

                              importance
fuel_oil_energy_pct             0.260970
electricity_emissions_pct       0.086816
avg_apt_size                    0.058362
natural_gas_energy_pct          0.055689
electricity_energy_pct          0.051737
pop_hispanic                    0.036674
district_steam_emissions_pct    0.036001
year_built                      0.035543
total_fa                        0.034988
housing_fa                      0.033956
value_per_sqft                  0.031280
pop_in_poverty                  0.030225
bedrooms                        0.030143
med_hh_inc                      0.029852
total_units                     0.029615
pop_black                       0.029145
natural_gas_emissions_pct       0.028052
occupancy                       0.027320
value_per_apt                   0.026937
altered_twice_or_more           0.023497
ever_altered                    0.023198


In [63]:
# Rows where "electricity_pct" exceeds 0.9.
# NOTE(review): this cell's execution count (63) predates the data-loading cell (67);
# confirm "electricity_pct" still exists after a fresh Restart & Run All.
ghg.loc[ghg["electricity_pct"] > .9]

Unnamed: 0,bbl,name,property id,total_units,eui,bedrooms,housing_fa,total_fa,co2_intensity,total_co2,...,fuel_oil_pct,yearalter1,yearalter2,assesstot,ever_altered,altered_twice_or_more,value_per_apt,value_per_sqft,geometry,bottom_quint
82,3024260001,Forest City - 281 Union Ave,6770887,33.0,38.2,45.0,30893.0,30893.0,3.2,99.7,...,0.0,2011.0,2012.0,3204900.0,True,True,1.618636e+06,1729.032467,POINT (-73.95076 40.70871),False
185,2028160066,1305 Morris Avenue,26598719,73.0,38.1,73.0,39181.0,39181.0,3.0,119.3,...,0.0,0.0,0.0,1735650.0,False,False,3.962671e+05,738.304280,POINT (-73.91252 40.83598),False
264,1020750008,527 West 143rd Street,6354533,24.0,15.5,58.0,29520.0,29520.0,1.3,37.0,...,0.0,1988.0,0.0,1015200.0,True,False,7.050000e+05,573.170732,POINT (-73.95022 40.82465),False
265,1020760001,565 West 144th Street,6696499,36.0,15.5,129.0,45675.0,45675.0,1.3,57.6,...,0.0,2019.0,0.0,1550250.0,True,False,7.177083e+05,565.681445,POINT (-73.94976 40.82527),False
270,1020890029,601 West 142nd Street,13458489,92.0,32.3,92.0,25091.0,25091.0,2.6,65.9,...,0.0,0.0,0.0,2281050.0,False,False,4.132337e+05,1515.184728,POINT (-73.95220 40.82467),False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16490,3015097501,126-134 Saratoga Avenue AKA 381-417 Chauncey S...,25178339,52.0,13.9,82.0,58292.0,58292.0,1.2,68.4,...,0.0,0.0,0.0,1675713.0,False,False,5.370875e+05,479.114630,POINT (-73.91711 40.68245),False
16512,3056250008,914 47th LLC,6826892,40.0,34.9,52.0,35600.0,35600.0,2.7,94.5,...,0.0,0.0,0.0,873000.0,False,False,3.637500e+05,408.707865,POINT (-73.99921 40.64113),False
16513,3056460006,5001 10th LLC,6826893,46.0,8.7,59.0,32400.0,32400.0,0.7,23.9,...,0.0,0.0,0.0,886500.0,False,False,3.211957e+05,456.018519,POINT (-73.99938 40.63837),False
16515,3056250005,902 47th LLC,6826896,42.0,62.8,50.0,30240.0,30240.0,4.7,143.6,...,0.0,0.0,0.0,722250.0,False,False,2.866071e+05,398.065476,POINT (-73.99940 40.64124),False


In [64]:
# Flag rows whose "electricity_pct" is above the 0.9 cut-off.
ghg["exclusive_elec"] = ghg["electricity_pct"].gt(.9)

In [65]:
# Mean construction year for each value of the "exclusive_elec" flag.
ghg["year_built"].groupby(ghg["exclusive_elec"]).mean()

exclusive_elec
False    1947.913343
True     1962.089214
Name: year_built, dtype: float64