In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Load the DeepSolar CSV from the project's data directory.
df = pd.read_csv(
    filepath_or_buffer="data/deepsolar_tract1.csv",
    encoding="utf-8",
)

In [29]:
# Number of solar systems per thousand households, restricted to rows with state == 'ca'.
# .copy() detaches the filtered rows from the full table so the column assignment
# below writes to an independent frame instead of a slice (SettingWithCopyWarning).
df = df[df.state == 'ca'].copy()
# The rate is undefined when household_count is 0: mask those denominators so the
# result is NaN rather than +/-inf (inf is not treated as missing by dropna()).
df["solar_systems_per_thousand_households"] = (
    df["solar_system_count"] / df["household_count"].where(df["household_count"] > 0)
) * 1000

In [30]:
# Columns removed before the analysis. Each label appears exactly once
# ('state' used to be listed twice) and is passed via `columns=` so the
# intent does not depend on a positional axis argument.
columns_to_drop = [
    'county', 'state', 'heating_fuel_coal_coke', 'heating_fuel_electricity', 'heating_fuel_fuel_oil_kerosene',
    'heating_fuel_gas', 'heating_fuel_housing_unit_count', 'heating_fuel_none', 'heating_fuel_other', 'heating_fuel_solar',
    'land_area', 'per_capita_income', 'population', 'total_area', 'water_area',
    'heating_fuel_gas_rate', 'heating_fuel_electricity_rate', 'heating_fuel_fuel_oil_kerosene_rate', 'heating_fuel_coal_coke_rate',
    'heating_fuel_solar_rate', 'heating_fuel_other_rate', 'heating_fuel_none_rate', 'solar_panel_area_per_capita', 'median_household_income',
    'electricity_price_industrial', 'electricity_price_transportation', 'electricity_price_overall', 'electricity_consume_commercial',
    'electricity_consume_industrial', 'electricity_consume_total', 'household_count', 'average_household_size',
    'housing_unit_count', 'housing_unit_occupied_count', 'housing_unit_median_value', 'lat', 'lon', 'elevation',
    'heating_design_temperature', 'cooling_design_temperature', 'earth_temperature_amplitude', 'frost_days', 'air_temperature',
    'relative_humidity', 'daily_solar_radiation', 'atmospheric_pressure', 'wind_speed', 'earth_temperature', 'heating_degree_days',
    'cooling_degree_days',
    'household_type_family_rate',
    'dropout_16_19_inschool_rate', 'occupation_construction_rate', 'occupation_public_rate', 'occupation_information_rate',
    'occupation_finance_rate', 'occupation_education_rate', 'occupation_administrative_rate', 'occupation_manufacturing_rate',
    'occupation_wholesale_rate', 'occupation_retail_rate', 'occupation_transportation_rate', 'occupation_arts_rate', 'occupation_agriculture_rate',
    'occupancy_vacant_rate', 'occupancy_owner_rate', 'mortgage_with_rate', 'transportation_home_rate', 'transportation_car_alone_rate',
    'transportation_walk_rate', 'transportation_carpool_rate', 'transportation_motorcycle_rate', 'transportation_bicycle_rate',
    'transportation_public_rate', 'travel_time_less_than_10_rate', 'travel_time_10_19_rate', 'travel_time_20_29_rate',
    'travel_time_30_39_rate', 'travel_time_40_59_rate', 'travel_time_60_89_rate', 'health_insurance_public_rate', 'health_insurance_none_rate',
    'travel_time_average', 'electricity_price_residential', 'electricity_price_commercial', 'electricity_consume_residential',
    'incentive_count_residential', 'incentive_count_nonresidential', 'incentive_residential_state_level', 'net_metering', 'feedin_tariff',
    'cooperate_tax', 'property_tax', 'sales_tax', 'rebate', 'avg_electricity_retail_rate', 'incentive_nonresidential_state_level',
]
df = df.drop(columns=columns_to_drop)

# Keep only rows that have a value in every remaining column.
df = df.dropna()

In [31]:
def median_dataframe(binned_column, original_column, comparison_column='solar_systems_per_thousand_households',
                     data=None, q=64):
    """Return per-bin medians of two columns after quantile-binning one of them.

    The rows are split into quantile bins of ``original_column`` (``pd.qcut``
    with ``duplicates='drop'``), and the median of ``comparison_column`` and of
    ``original_column`` is taken inside each bin.

    Parameters
    ----------
    binned_column : str
        Name of the column that receives the bin labels. It is written to the
        frame being summarised (side effect).
    original_column : str
        Column whose quantiles define the bins.
    comparison_column : str
        Column whose per-bin median is reported alongside ``original_column``.
    data : pandas.DataFrame, optional
        Frame to summarise. When omitted, the notebook-level ``df`` is used.
    q : int, default 64
        Number of quantiles requested from ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty bin, with columns ``comparison_column`` and
        ``original_column`` (in that order) holding the bin medians.
    """
    frame = df if data is None else data
    frame[binned_column] = pd.qcut(frame[original_column], q=q, duplicates='drop')

    # Aggregate only the two columns that are needed. Taking the median of the
    # whole frame fails on non-numeric columns (including the categorical bin
    # columns added by earlier calls) in pandas versions where
    # numeric_only defaults to False.
    wanted = list(dict.fromkeys([comparison_column, original_column]))
    # observed=True: report only bins that actually contain rows.
    medians = frame.groupby(binned_column, observed=True)[wanted].median()

    return pd.DataFrame({
        comparison_column: medians[comparison_column].to_numpy(),
        original_column: medians[original_column].to_numpy(),
    })


# Per-bin medians for quantile bins of average household income; the trailing
# expression displays the column-wise maxima of those medians.
hi = median_dataframe(
    binned_column="average_household_income_binned",
    original_column='average_household_income',
)
hi.max()

solar_systems_per_thousand_households        92.811855
average_household_income                 250385.567519
dtype: float64

In [32]:
import plotly.express as px
# Plot scatter data
def plot_scatter(df, x, y, color, image_name):
    """Scatter ``y`` against ``x`` with an OLS trendline, print the fit, save a PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns named by ``x`` and ``y``.
    x, y : str
        Column names for the horizontal and vertical axes.
    color : str
        Colour applied to both the markers and the trendline.
    image_name : str
        File stem; the figure is written to ``figures/<image_name>.png``.

    Side effects
    ------------
    Shows the figure, prints the fit object and the values labelled alpha,
    beta, p_beta and r_squared, and writes the PNG (creating the output
    directory if it does not exist yet).
    """
    import os  # local import so this cell does not depend on another cell's imports

    fig = px.scatter(df, x=x, y=y, trendline="ols", trendline_color_override=color)
    fig.update_traces(marker=dict(color=color))
    fig.show()

    # The trendline results table is read at its first row, which holds the
    # fitted results object for this figure's trendline.
    model = px.get_trendline_results(fig)
    results = model.iloc[0]["px_fit_results"]

    # params[0] / params[1] are reported as alpha / beta.
    # NOTE(review): presumably intercept and slope, in that order - confirm
    # against the plotly OLS trendline documentation.
    alpha = results.params[0]
    beta = results.params[1]
    p_beta = results.pvalues[1]
    r_squared = results.rsquared
    print("results: ", results)
    print("alpha: ", alpha)
    print("beta: ", beta)
    print("p_beta: ", p_beta)
    print("r_squared: ", r_squared)

    # write_image does not create missing directories, so make sure the
    # target folder exists before exporting.
    image_path = "figures/" + image_name + ".png"
    os.makedirs(os.path.dirname(image_path), exist_ok=True)
    fig.write_image(image_path, engine='kaleido')

# Chart the income-binned medians against solar systems per thousand households
# (saved as figures/avg_income.png).
plot_scatter(
    df=hi,
    x='average_household_income',
    y='solar_systems_per_thousand_households',
    color='CadetBlue',
    image_name='avg_income',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcee119a0>
alpha:  0.4296052792821792
beta:  0.0004515892465246695
p_beta:  4.155311529009996e-25
r_squared:  0.8244798851654027


In [33]:
# Bin rows by race_black_africa_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/race_black.png).
hi2 = median_dataframe(binned_column="black_binned", original_column='race_black_africa_rate')
plot_scatter(
    df=hi2,
    x='race_black_africa_rate',
    y='solar_systems_per_thousand_households',
    color='CadetBlue',
    image_name='race_black',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcee2d2e0>
alpha:  37.034901578008515
beta:  -49.17669963260404
p_beta:  1.6776892924270114e-08
r_squared:  0.42502257979014446


In [34]:
# Bin rows by race_asian_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/race_asian.png).
hi3 = median_dataframe(binned_column="asian_binned", original_column='race_asian_rate')
plot_scatter(
    df=hi3,
    x='race_asian_rate',
    y='solar_systems_per_thousand_households',
    color='CornflowerBlue',
    image_name='race_asian',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd0057310>
alpha:  33.78904879106044
beta:  5.046537855187798
p_beta:  0.4992439647338428
r_squared:  0.0075174729501067405


In [35]:
# Bin rows by race_other_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/race_other.png).
hi4 = median_dataframe(binned_column="race_other_binned", original_column='race_other_rate')
plot_scatter(
    df=hi4,
    x='race_other_rate',
    y='solar_systems_per_thousand_households',
    color='DarkSalmon',
    image_name='race_other',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd00d9fa0>
alpha:  45.42047766585169
beta:  -73.40828055877509
p_beta:  5.1486203199744915e-20
r_squared:  0.7498718227144586


In [36]:
# Bin rows by race_indian_alaska_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/race_indian.png).
hi5 = median_dataframe(binned_column="race_indian_alaska_binned", original_column='race_indian_alaska_rate')
plot_scatter(
    df=hi5,
    x='race_indian_alaska_rate',
    y='solar_systems_per_thousand_households',
    color='DarkSeaGreen',
    image_name='race_indian',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd004c9d0>
alpha:  37.685589558419395
beta:  -429.8142004664769
p_beta:  1.1528884574578681e-09
r_squared:  0.5901044481312866


In [37]:
# Bin rows by race_islander_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/race_islander.png).
hi6 = median_dataframe(binned_column="race_islander_binned", original_column='race_islander_rate')
plot_scatter(
    df=hi6,
    x='race_islander_rate',
    y='solar_systems_per_thousand_households',
    color='Plum',
    image_name='race_islander',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd08bda90>
alpha:  34.487922763406075
beta:  -81.55279783436515
p_beta:  0.33362300990421057
r_squared:  0.04455820981266567


In [38]:
# Bin rows by race_two_more_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/race_two.png).
hi7 = median_dataframe(binned_column="race_two_more_binned", original_column='race_two_more_rate')
plot_scatter(
    df=hi7,
    x='race_two_more_rate',
    y='solar_systems_per_thousand_households',
    color='MediumPurple',
    image_name='race_two',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd004c580>
alpha:  26.700467052573938
beta:  166.74832516806694
p_beta:  1.9281977252891268e-06
r_squared:  0.3081854098579836


In [39]:
# Bin rows by race_white_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/race_white.png).
hi19 = median_dataframe(binned_column="white_binned", original_column='race_white_rate')
plot_scatter(
    df=hi19,
    x='race_white_rate',
    y='solar_systems_per_thousand_households',
    color='LightPink',
    image_name='race_white',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd014f6d0>
alpha:  10.90878352872907
beta:  39.50615693474255
p_beta:  5.9546394530474326e-12
r_squared:  0.5367110309917853


In [40]:
# Bin rows by education_bachelor_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/education_bachelor.png).
hi8 = median_dataframe(binned_column="education_bachelor_binned", original_column='education_bachelor_rate')
plot_scatter(
    df=hi8,
    x='education_bachelor_rate',
    y='solar_systems_per_thousand_households',
    color='CadetBlue',
    image_name='education_bachelor',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf0bf610>
alpha:  22.147904683601382
beta:  66.31076656165465
p_beta:  8.036383086698469e-10
r_squared:  0.4585841601276093


In [41]:
# Bin rows by education_college_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/education_college.png).
hi9 = median_dataframe(binned_column="education_college_binned", original_column='education_college_rate')
plot_scatter(
    df=hi9,
    x='education_college_rate',
    y='solar_systems_per_thousand_households',
    color='CornflowerBlue',
    image_name='education_college',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd014f5b0>
alpha:  1.8567629513238595
beta:  111.21414846857802
p_beta:  1.8357603441543493e-22
r_squared:  0.7865290298210671


In [42]:
# Bin rows by education_doctoral_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/education_doctoral.png).
hi10 = median_dataframe(binned_column="education_doctoral_binned", original_column='education_doctoral_rate')
plot_scatter(
    df=hi10,
    x='education_doctoral_rate',
    y='solar_systems_per_thousand_households',
    color='DarkSalmon',
    image_name='education_doctoral',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf0b9160>
alpha:  32.36962455142793
beta:  277.5127869681047
p_beta:  1.5810321537029641e-09
r_squared:  0.5429161191943278


In [43]:
# Bin rows by education_high_school_graduate_rate quantiles, then chart the per-bin
# medians against solar systems per thousand households
# (saved as figures/education_high_school.png).
hi11 = median_dataframe(
    binned_column="education_high_school_graduate_binned",
    original_column='education_high_school_graduate_rate',
)
plot_scatter(
    df=hi11,
    x='education_high_school_graduate_rate',
    y='solar_systems_per_thousand_households',
    color='DarkSeaGreen',
    image_name='education_high_school',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf6ce700>
alpha:  45.980823383958636
beta:  -57.246894066844206
p_beta:  1.5595975719997002e-07
r_squared:  0.360604744954901


In [44]:
# Bin rows by education_less_than_high_school_rate quantiles, then chart the per-bin
# medians against solar systems per thousand households
# (saved as figures/education_less_than_high_school.png).
hi12 = median_dataframe(
    binned_column="education_less_than_high_school_binned",
    original_column='education_less_than_high_school_rate',
)
plot_scatter(
    df=hi12,
    x='education_less_than_high_school_rate',
    y='solar_systems_per_thousand_households',
    color='Plum',
    image_name='education_less_than_high_school',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd08a1220>
alpha:  52.435638707311085
beta:  -81.82979716049627
p_beta:  9.955647009513003e-25
r_squared:  0.8194794238058531


In [45]:
# Bin rows by education_master_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/education_master.png).
hi13 = median_dataframe(binned_column="education_master_binned", original_column='education_master_rate')
plot_scatter(
    df=hi13,
    x='education_master_rate',
    y='solar_systems_per_thousand_households',
    color='LightPink',
    image_name='education_master',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf6d2370>
alpha:  25.95279577577827
beta:  131.4228848198146
p_beta:  8.824479789162321e-12
r_squared:  0.5366991269062469


In [46]:
# Bin rows by education_professional_school_rate quantiles, then chart the per-bin
# medians against solar systems per thousand households
# (saved as figures/education_professional.png).
hi14 = median_dataframe(
    binned_column="education_professional_school_binned",
    original_column='education_professional_school_rate',
)
plot_scatter(
    df=hi14,
    x='education_professional_school_rate',
    y='solar_systems_per_thousand_households',
    color='MediumPurple',
    image_name='education_professional',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf6ceeb0>
alpha:  30.499934379381678
beta:  252.15717538134962
p_beta:  2.2984990046738364e-12
r_squared:  0.6011382085114942


In [47]:
# Bin rows by age_18_24_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/age_18_24.png).
hi15 = median_dataframe(binned_column="age_18_24_rate_binned", original_column='age_18_24_rate')
plot_scatter(
    df=hi15,
    x='age_18_24_rate',
    y='solar_systems_per_thousand_households',
    color='CornflowerBlue',
    image_name='age_18_24',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf6fa070>
alpha:  42.351313379487195
beta:  -77.47958260357933
p_beta:  1.618619641962017e-05
r_squared:  0.26084241433696764


In [48]:
# Bin rows by age_25_34_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/age_25_34.png).
hi16 = median_dataframe(binned_column='age_25_34_rate_binned', original_column='age_25_34_rate')
plot_scatter(
    df=hi16,
    x='age_25_34_rate',
    y='solar_systems_per_thousand_households',
    color='DarkSalmon',
    image_name='age_25_34',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf71b4f0>
alpha:  78.91612287784044
beta:  -280.3054654120645
p_beta:  7.746417321020885e-23
r_squared:  0.7923647853133677


In [49]:
# Bin rows by age_35_44_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/age_35_44.png).
hi17 = median_dataframe(binned_column='age_35_44_rate_binned', original_column='age_35_44_rate')
plot_scatter(
    df=hi17,
    x='age_35_44_rate',
    y='solar_systems_per_thousand_households',
    color='DarkSeaGreen',
    image_name='age_35_44',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf0bf790>
alpha:  51.049966736735804
beta:  -127.74817329534352
p_beta:  8.458321042598606e-07
r_squared:  0.32578057344701106


In [50]:
# Bin rows by age_45_54_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/age_45_54.png).
hi18 = median_dataframe(binned_column='age_45_54_rate_binned', original_column='age_45_54_rate')
plot_scatter(
    df=hi18,
    x='age_45_54_rate',
    y='solar_systems_per_thousand_households',
    color='Plum',
    image_name='age_45_54',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf72d280>
alpha:  -15.310698305891798
beta:  369.7626797052611
p_beta:  1.1339111602018395e-28
r_squared:  0.8652019113540197


In [51]:
# Bin rows by age_55_64_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households (saved as figures/age_55_64.png).
hi20 = median_dataframe(binned_column='age_55_64_rate_binned', original_column='age_55_64_rate')
plot_scatter(
    df=hi20,
    x='age_55_64_rate',
    y='solar_systems_per_thousand_households',
    color='LightPink',
    image_name='age_55_64',
)

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabcf736970>
alpha:  11.046939725383293
beta:  198.60602465599027
p_beta:  1.3173763858838336e-09
r_squared:  0.45003923267410983


In [52]:
# Bin rows by age_65_74_rate quantiles, then chart the per-bin medians
# against solar systems per thousand households.
hi21 = median_dataframe('age_65_74_rate_binned', 'age_65_74_rate')
# The image name follows the plotted column (age_65_74), matching the naming
# used for the other age brackets, so the file is saved as figures/age_65_74.png.
plot_scatter(hi21, 'age_65_74_rate', 'solar_systems_per_thousand_households', 'MediumPurple', 'age_65_74')

results:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fabd0160520>
alpha:  20.101129301072277
beta:  195.87287201152895
p_beta:  4.3831753830751244e-11
r_squared:  0.5063349621072406
