In [31]:
import pandas as pd
import re 
import geopandas as gpd
import pipeline_jmidkiff as pipeline
import numpy as np

import importlib
importlib.reload(pipeline)

pd.set_option('display.max_rows', 100)

# Block Group Data Preparation

In [32]:
initial_assessments = pd.read_csv(
    'data/Cook_County_Assessor_s_Residential_Property_Characteristics.zip', 
    sep='\t', compression='zip')
# https://datacatalog.cookcountyil.gov/Property-Taxation/Cook-County-Assessor-s-Residential-Property-Charac/bcnq-qi2z
pipeline.show(initial_assessments)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,



Shape:
(1995108, 82)

Data Types:
PIN                                                  int64
Property Class                                       int64
Tax Year                                             int64
Neighborhood Code                                    int64
Land Square Feet                                     int64
Town Code                                            int64
Type of Residence                                  float64
Apartments                                         float64
Wall Material                                      float64
Roof Material                                      float64
Rooms                                              float64
Bedrooms                                           float64
Basement                                           float64
Basement Finish                                    float64
Central Heating                                    float64
Other Heating                                      float64
Central Air          

Unnamed: 0,PIN,Property Class,Tax Year,Neighborhood Code,Land Square Feet,Town Code,Type of Residence,Apartments,Wall Material,Roof Material,...,Location Factor,Garage indicator,Residential share of building,Pure Market Sale,Pure Market Filter,Neigborhood Code (mapping),Square root of lot size,Square root of age,Square root of improvement size,Town and Neighborhood
0,1011000040000,212,2018,12,11055,10,2.0,5.0,1.0,2.0,...,2.084499,1.0,,0,0,12,105.14276,11.61895,43.634848,1012
1,1011000050000,205,2018,12,6534,10,2.0,0.0,1.0,1.0,...,2.079861,1.0,,0,0,12,80.833162,10.677078,39.522146,1012
2,1011000060000,212,2018,12,6534,10,2.0,3.0,2.0,1.0,...,2.076376,1.0,,0,0,12,80.833162,10.440307,40.95119,1012
3,1011000090000,212,2018,12,13571,10,2.0,2.0,2.0,2.0,...,2.061552,1.0,,0,0,12,116.494635,8.717798,67.416615,1012
4,1011000170000,205,2018,12,9027,10,2.0,0.0,1.0,1.0,...,2.096602,0.0,,0,0,12,95.010526,11.090537,39.799497,1012


In [33]:
col_list = ['Property Address', 'PIN', 'Longitude', 'Latitude', 
            'Property Class', 'Wall Material', 
            'Roof Material', 'Repair Condition', 'Renovation', 
            'Prior Tax Year Market Value Estimate (Land)', 
            'Prior Tax Year Market Value Estimate (Building)', 'Land Square Feet', 
            'Building Square Feet', 'Age']

for col in col_list: 
    print(pipeline.describe(initial_assessments[col]))
    print("-" * 30)

# Field Descriptions: 
# https://datacatalog.cookcountyil.gov/Property-Taxation/Cook-County-Assessor-s-Residential-Property-Charac/bcnq-qi2z

# Important features: 
# Percentages of those in block group: distinct Property Class, 
# distinct wall material, distinct roof material, Repair Condition, Renovation

# Counts: 
# Total units in block group. 

# Medians & Means: 
# Prior Year Market Value Estimate (Land), & Building, Land Square Feet, 
# Building Square Feet
# Total Building Square Feet applies to condominiums (Property Class 299) only

pipeline.group_count(initial_assessments, 'Age').sort_values(ascending=False)
# Note that the Assessment data imputes '10' for any houses missing their age. 

count                  1991916
unique                 1452076
top       655 W IRVING PARK RD
freq                      1679
Name: Property Address, dtype: object
------------------------------
count    1.995108e+06
mean     1.433242e+13
std      7.911198e+12
min      1.011000e+12
25%      8.144010e+12
50%      1.405211e+13
75%      1.915217e+13
max      3.332302e+13
Name: PIN, dtype: float64
------------------------------
count    1.991085e+06
mean    -8.779745e+01
std      1.581003e-01
min     -8.826351e+01
25%     -8.788149e+01
50%     -8.776527e+01
75%     -8.767502e+01
max     -8.752481e+01
Name: Longitude, dtype: float64
------------------------------
count    1.991085e+06
mean     4.190541e+01
std      1.649300e-01
min      4.146983e+01
25%      4.179062e+01
50%      4.193405e+01
75%      4.203778e+01
max      4.215398e+01
Name: Latitude, dtype: float64
------------------------------
count    1.995108e+06
mean     2.428938e+02
std      4.303183e+01
min      2.000000e+02
25%      

Age
10     60807
62     43847
61     39770
63     39611
64     37510
       ...  
174        1
200        1
203        1
205        1
208        1
Name: Count, Length: 188, dtype: int64

In [34]:
assessments = initial_assessments[col_list]
assessments = gpd.GeoDataFrame(
    assessments, 
    geometry=gpd.points_from_xy(
        assessments['Longitude'], 
        assessments['Latitude']), 
    crs='EPSG:4326')
pipeline.show(assessments)


Shape:
(1995108, 15)

Data Types:
Property Address                                     object
PIN                                                   int64
Longitude                                           float64
Latitude                                            float64
Property Class                                        int64
Wall Material                                       float64
Roof Material                                       float64
Repair Condition                                    float64
Renovation                                          float64
Prior Tax Year Market Value Estimate (Land)         float64
Prior Tax Year Market Value Estimate (Building)     float64
Land Square Feet                                      int64
Building Square Feet                                float64
Age                                                   int64
geometry                                           geometry
dtype: object


Unnamed: 0,Property Address,PIN,Longitude,Latitude,Property Class,Wall Material,Roof Material,Repair Condition,Renovation,Prior Tax Year Market Value Estimate (Land),Prior Tax Year Market Value Estimate (Building),Land Square Feet,Building Square Feet,Age,geometry
0,213 W MAIN ST BARRINGTON,1011000040000,-88.139621,42.153953,212,1.0,2.0,2.0,,52510.0,374930.0,11055,1904.0,135,POINT (-88.13962 42.15395)
1,209 W MAIN ST BARRINGTON,1011000050000,-88.139379,42.153953,205,1.0,1.0,2.0,,31030.0,173970.0,6534,1562.0,114,POINT (-88.13938 42.15395)
2,205 W MAIN ST BARRINGTON,1011000060000,-88.139196,42.153953,212,2.0,1.0,2.0,,31030.0,353260.0,6534,1677.0,109,POINT (-88.13920 42.15395)
3,149 W MAIN ST BARRINGTON,1011000090000,-88.138427,42.153945,212,2.0,2.0,2.0,,64460.0,668790.0,13571,4545.0,76,POINT (-88.13843 42.15395)
4,115 DUNDEE AVE BARRINGTON,1011000170000,-88.140521,42.153671,205,1.0,1.0,3.0,,42870.0,291310.0,9027,1584.0,123,POINT (-88.14052 42.15367)


In [35]:
acs_g = gpd.read_file('data/geo_census.shp')
pipeline.show(acs_g)


Shape:
(2194, 19)

Data Types:
FIPS           float64
Geographic      object
Census Tra       int64
Block Grou       int64
tot_pop          int64
tot_white        int64
tot_black        int64
hh_size        float64
med_income     float64
occ_units        int64
med_built_     float64
med_rent       float64
oo_hsng_un       int64
GEOID           object
perc_white     float64
perc_non_w     float64
perc_black     float64
perc_owner     float64
geometry      geometry
dtype: object


Unnamed: 0,FIPS,Geographic,Census Tra,Block Grou,tot_pop,tot_white,tot_black,hh_size,med_income,occ_units,med_built_,med_rent,oo_hsng_un,GEOID,perc_white,perc_non_w,perc_black,perc_owner,geometry
0,170000000000.0,15000US170310101001,10100,1,461,265,108,1.95,,236,,873.0,117,170310101001,0.574837,0.425163,0.234273,0.495763,"POLYGON ((-87.67009 42.02115, -87.67047 42.021..."
1,170000000000.0,15000US170310101002,10100,2,2424,746,1543,2.26,21827.0,1054,,799.0,53,170310101002,0.307756,0.692244,0.636551,0.050285,"POLYGON ((-87.66950 42.01936, -87.66963 42.019..."
2,170000000000.0,15000US170310101003,10100,3,1714,1137,427,1.5,54297.0,1073,,1071.0,327,170310101003,0.663361,0.336639,0.249125,0.304753,"POLYGON ((-87.66681 42.01924, -87.66780 42.019..."
3,170000000000.0,15000US170310102011,10201,1,1706,479,745,2.3,42778.0,712,1962.0,1097.0,224,170310102011,0.280774,0.719226,0.436694,0.314607,"POLYGON ((-87.68234 42.01250, -87.68268 42.012..."
4,170000000000.0,15000US170310102012,10201,2,3925,2131,1180,2.69,39535.0,1424,1945.0,1152.0,353,170310102012,0.54293,0.45707,0.300637,0.247893,"POLYGON ((-87.67972 42.01392, -87.68003 42.013..."


In [36]:
assessments_in_chi = gpd.sjoin(
    left_df=acs_g.loc[:,['GEOID', 'geometry']], 
    right_df=assessments, how='inner', op='intersects')
pipeline.show(assessments_in_chi)


Shape:
(728543, 17)

Data Types:
GEOID                                                object
geometry                                           geometry
index_right                                           int64
Property Address                                     object
PIN                                                   int64
Longitude                                           float64
Latitude                                            float64
Property Class                                        int64
Wall Material                                       float64
Roof Material                                       float64
Repair Condition                                    float64
Renovation                                          float64
Prior Tax Year Market Value Estimate (Land)         float64
Prior Tax Year Market Value Estimate (Building)     float64
Land Square Feet                                      int64
Building Square Feet                                float64
Age   

Unnamed: 0,GEOID,geometry,index_right,Property Address,PIN,Longitude,Latitude,Property Class,Wall Material,Roof Material,Repair Condition,Renovation,Prior Tax Year Market Value Estimate (Land),Prior Tax Year Market Value Estimate (Building),Land Square Feet,Building Square Feet,Age
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799426,7550 N SHERIDAN RD,11291010331123,-87.666674,42.022737,299,,,,,1700.0,22190.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799427,7550 N SHERIDAN RD,11291010331101,-87.666674,42.022737,299,,,,,21780.0,283620.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799356,7550 N SHERIDAN RD,11291010331130,-87.666674,42.022737,299,,,,,940.0,12320.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799428,7550 N SHERIDAN RD,11291010331118,-87.666674,42.022737,299,,,,,1180.0,15410.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799357,7550 N SHERIDAN RD,11291010331103,-87.666674,42.022737,299,,,,,24140.0,314380.0,61111,,15


In [37]:
assessments_in_chi[assessments_in_chi['Building Square Feet'].isna()].groupby('Property Class')['Property Class'].count()

Property Class
200        37
201      4115
203         1
206         1
208         1
209         2
211         3
212        19
241      8141
278         1
299    277880
Name: Property Class, dtype: int64

In [38]:
cols_to_dummify = [
    'Property Class', 'Wall Material', 'Roof Material', 'Repair Condition', 
    'Renovation']
assessments_wide = (pd.get_dummies(
        data=assessments_in_chi, columns=cols_to_dummify, dummy_na=True)
    .drop(columns=['Property Address']))
pipeline.show(assessments_wide)


Shape:
(728543, 49)

Data Types:
GEOID                                                object
geometry                                           geometry
index_right                                           int64
PIN                                                   int64
Longitude                                           float64
Latitude                                            float64
Prior Tax Year Market Value Estimate (Land)         float64
Prior Tax Year Market Value Estimate (Building)     float64
Land Square Feet                                      int64
Building Square Feet                                float64
Age                                                   int64
Property Class_200.0                                  uint8
Property Class_201.0                                  uint8
Property Class_202.0                                  uint8
Property Class_203.0                                  uint8
Property Class_204.0                                  uint8
Proper

Unnamed: 0,GEOID,geometry,index_right,PIN,Longitude,Latitude,Prior Tax Year Market Value Estimate (Land),Prior Tax Year Market Value Estimate (Building),Land Square Feet,Building Square Feet,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799426,11291010331123,-87.666674,42.022737,1700.0,22190.0,61111,,...,0,0,1,0,0,0,1,0,0,1
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799427,11291010331101,-87.666674,42.022737,21780.0,283620.0,61111,,...,0,0,1,0,0,0,1,0,0,1
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799356,11291010331130,-87.666674,42.022737,940.0,12320.0,61111,,...,0,0,1,0,0,0,1,0,0,1
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799428,11291010331118,-87.666674,42.022737,1180.0,15410.0,61111,,...,0,0,1,0,0,0,1,0,0,1
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799357,11291010331103,-87.666674,42.022737,24140.0,314380.0,61111,,...,0,0,1,0,0,0,1,0,0,1


In [39]:
check = assessments_wide == 0
# Drop any columns that are all 0. 
to_drop = list(check.all()[check.all() == True].index)
assessments_wide.drop(columns=to_drop, inplace=True)
print(f'Dropped the following columns that were all 0:\n{to_drop}')

Dropped the following columns that were all 0:
['Property Class_nan']


In [40]:
# So helpful: https://stackoverflow.com/a/47103408/8527838

cols = list(assessments_wide.columns.values)
cols.pop(cols.index('GEOID'))
assessments_wide = assessments_wide[cols + ['GEOID']]
# First series of aggregations (all that don't use sum())
assessments_1a = assessments_wide.groupby(['GEOID']).agg({
    'PIN': 'count', 
    'Prior Tax Year Market Value Estimate (Land)': ['mean', 'median'], 
    'Prior Tax Year Market Value Estimate (Building)': ['mean', 'median'], 
    'Land Square Feet': ['mean', 'median'],
    'Building Square Feet': ['mean', 'median'], 
    'Age': ['mean', 'median']    
})
assessments_1a

Unnamed: 0_level_0,PIN,Prior Tax Year Market Value Estimate (Land),Prior Tax Year Market Value Estimate (Land),Prior Tax Year Market Value Estimate (Building),Prior Tax Year Market Value Estimate (Building),Land Square Feet,Land Square Feet,Building Square Feet,Building Square Feet,Age,Age
Unnamed: 0_level_1,count,mean,median,mean,median,mean,median,mean,median,mean,median
GEOID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
170310101001,247,37161.255061,14120.0,165053.643725,171860.0,32474.959514,9860.0,3244.918367,2110.0,51.226721,64.0
170310101002,148,28536.081081,17080.0,151666.621622,101180.0,9868.878378,6246.0,4527.851852,4797.0,93.689189,97.0
170310101003,477,16814.549266,10540.0,130144.486373,121130.0,18124.790356,14913.0,3361.833333,1494.0,78.140461,98.0
170310102011,283,28919.399293,17030.0,164135.689046,129920.0,7054.893993,6000.0,2308.084906,1641.0,70.293286,64.0
170310102012,739,25685.236806,17070.0,122439.945873,99730.0,13236.216509,9334.0,2936.382166,2542.0,72.483085,65.0
...,...,...,...,...,...,...,...,...,...,...,...
170318439001,95,21181.684211,11470.0,152839.789474,150800.0,6474.157895,5069.0,4460.357143,4315.5,95.757895,108.0
170318439002,71,31305.211268,27940.0,152449.718310,162210.0,4264.859155,4611.0,4538.304348,4085.5,107.436620,113.0
170318439003,413,5240.314770,2320.0,40202.227603,37270.0,40131.932203,48225.0,6684.000000,6684.0,54.159806,48.0
170318439004,152,26058.552632,15000.0,76780.723684,81670.0,19798.881579,18810.0,6038.227273,6243.0,87.065789,89.0


In [41]:
# Second series of aggregations (all that use sum())
assessments_1b = (assessments_wide
    .loc[:,'Property Class_200.0':'GEOID']
    .groupby('GEOID').sum())
assessments_1b

Unnamed: 0_level_0,Property Class_200.0,Property Class_201.0,Property Class_202.0,Property Class_203.0,Property Class_204.0,Property Class_205.0,Property Class_206.0,Property Class_207.0,Property Class_208.0,Property Class_209.0,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170310101001,0.0,0.0,0.0,5.0,3.0,13.0,8.0,0.0,0.0,0.0,...,2.0,0.0,198.0,1.0,47.0,1.0,198.0,0.0,0.0,247.0
170310101002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,121.0,0.0,26.0,1.0,121.0,0.0,0.0,148.0
170310101003,0.0,0.0,0.0,1.0,1.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,441.0,3.0,33.0,0.0,441.0,1.0,0.0,476.0
170310102011,0.0,0.0,1.0,15.0,4.0,11.0,3.0,0.0,0.0,0.0,...,0.0,0.0,177.0,0.0,105.0,1.0,177.0,0.0,0.0,283.0
170310102012,0.0,1.0,3.0,16.0,4.0,2.0,2.0,0.0,0.0,0.0,...,3.0,2.0,582.0,0.0,156.0,1.0,582.0,0.0,0.0,739.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170318439001,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,...,0.0,0.0,67.0,0.0,26.0,2.0,67.0,0.0,0.0,95.0
170318439002,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,1.0,...,0.0,0.0,25.0,0.0,46.0,0.0,25.0,1.0,0.0,70.0
170318439003,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,411.0,0.0,2.0,0.0,411.0,0.0,0.0,413.0
170318439004,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,130.0,0.0,19.0,3.0,130.0,0.0,0.0,152.0


In [42]:
# Combine the two different data pieces
assessments_block_group = assessments_1a.join(other=assessments_1b)
pipeline.show(assessments_block_group)


Shape:
(2156, 48)

Data Types:
(PIN, count)                                                   int64
(Prior Tax Year Market Value Estimate (Land), mean)          float64
(Prior Tax Year Market Value Estimate (Land), median)        float64
(Prior Tax Year Market Value Estimate (Building), mean)      float64
(Prior Tax Year Market Value Estimate (Building), median)    float64
(Land Square Feet, mean)                                     float64
(Land Square Feet, median)                                   float64
(Building Square Feet, mean)                                 float64
(Building Square Feet, median)                               float64
(Age, mean)                                                  float64
(Age, median)                                                float64
Property Class_200.0                                         float64
Property Class_201.0                                         float64
Property Class_202.0                                         float64
Pr



Unnamed: 0_level_0,"(PIN, count)","(Prior Tax Year Market Value Estimate (Land), mean)","(Prior Tax Year Market Value Estimate (Land), median)","(Prior Tax Year Market Value Estimate (Building), mean)","(Prior Tax Year Market Value Estimate (Building), median)","(Land Square Feet, mean)","(Land Square Feet, median)","(Building Square Feet, mean)","(Building Square Feet, median)","(Age, mean)",...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170310101001,247,37161.255061,14120.0,165053.643725,171860.0,32474.959514,9860.0,3244.918367,2110.0,51.226721,...,2.0,0.0,198.0,1.0,47.0,1.0,198.0,0.0,0.0,247.0
170310101002,148,28536.081081,17080.0,151666.621622,101180.0,9868.878378,6246.0,4527.851852,4797.0,93.689189,...,1.0,0.0,121.0,0.0,26.0,1.0,121.0,0.0,0.0,148.0
170310101003,477,16814.549266,10540.0,130144.486373,121130.0,18124.790356,14913.0,3361.833333,1494.0,78.140461,...,0.0,0.0,441.0,3.0,33.0,0.0,441.0,1.0,0.0,476.0
170310102011,283,28919.399293,17030.0,164135.689046,129920.0,7054.893993,6000.0,2308.084906,1641.0,70.293286,...,0.0,0.0,177.0,0.0,105.0,1.0,177.0,0.0,0.0,283.0
170310102012,739,25685.236806,17070.0,122439.945873,99730.0,13236.216509,9334.0,2936.382166,2542.0,72.483085,...,3.0,2.0,582.0,0.0,156.0,1.0,582.0,0.0,0.0,739.0


In [43]:
# Get the proportions for the columns that should have it, dividing by total residences
assessments_block_group.loc[:,'Property Class_200.0':'Renovation_nan'] = (
    assessments_block_group.loc[:,'Property Class_200.0':'Renovation_nan']
        .divide(other=assessments_block_group[('PIN', 'count')], 
                axis=0))
assessments_block_group.head()

Unnamed: 0_level_0,"(PIN, count)","(Prior Tax Year Market Value Estimate (Land), mean)","(Prior Tax Year Market Value Estimate (Land), median)","(Prior Tax Year Market Value Estimate (Building), mean)","(Prior Tax Year Market Value Estimate (Building), median)","(Land Square Feet, mean)","(Land Square Feet, median)","(Building Square Feet, mean)","(Building Square Feet, median)","(Age, mean)",...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170310101001,247,37161.255061,14120.0,165053.643725,171860.0,32474.959514,9860.0,3244.918367,2110.0,51.226721,...,0.008097,0.0,0.801619,0.004049,0.190283,0.004049,0.801619,0.0,0.0,1.0
170310101002,148,28536.081081,17080.0,151666.621622,101180.0,9868.878378,6246.0,4527.851852,4797.0,93.689189,...,0.006757,0.0,0.817568,0.0,0.175676,0.006757,0.817568,0.0,0.0,1.0
170310101003,477,16814.549266,10540.0,130144.486373,121130.0,18124.790356,14913.0,3361.833333,1494.0,78.140461,...,0.0,0.0,0.924528,0.006289,0.069182,0.0,0.924528,0.002096,0.0,0.997904
170310102011,283,28919.399293,17030.0,164135.689046,129920.0,7054.893993,6000.0,2308.084906,1641.0,70.293286,...,0.0,0.0,0.625442,0.0,0.371025,0.003534,0.625442,0.0,0.0,1.0
170310102012,739,25685.236806,17070.0,122439.945873,99730.0,13236.216509,9334.0,2936.382166,2542.0,72.483085,...,0.00406,0.002706,0.787551,0.0,0.211096,0.001353,0.787551,0.0,0.0,1.0


In [44]:
# Notice I don't export from ACS household median age
# or the 'total' variables. 
final_assessments = (acs_g.loc[:,[
    'GEOID', 'hh_size', 'med_income', 'occ_units', 'med_rent', 
    'oo_hsng_un', 'perc_white', 'perc_non_w', 'perc_black', 'perc_owner']]
    .merge(
    right=assessments_block_group, 
    left_on='GEOID', right_index=True))
pipeline.show(final_assessments)


Shape:
(2156, 58)

Data Types:
GEOID                                                         object
hh_size                                                      float64
med_income                                                   float64
occ_units                                                      int64
med_rent                                                     float64
oo_hsng_un                                                     int64
perc_white                                                   float64
perc_non_w                                                   float64
perc_black                                                   float64
perc_owner                                                   float64
(PIN, count)                                                   int64
(Prior Tax Year Market Value Estimate (Land), mean)          float64
(Prior Tax Year Market Value Estimate (Land), median)        float64
(Prior Tax Year Market Value Estimate (Building), mean)      float64
(P

Unnamed: 0,GEOID,hh_size,med_income,occ_units,med_rent,oo_hsng_un,perc_white,perc_non_w,perc_black,perc_owner,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
0,170310101001,1.95,,236,873.0,117,0.574837,0.425163,0.234273,0.495763,...,0.008097,0.0,0.801619,0.004049,0.190283,0.004049,0.801619,0.0,0.0,1.0
1,170310101002,2.26,21827.0,1054,799.0,53,0.307756,0.692244,0.636551,0.050285,...,0.006757,0.0,0.817568,0.0,0.175676,0.006757,0.817568,0.0,0.0,1.0
2,170310101003,1.5,54297.0,1073,1071.0,327,0.663361,0.336639,0.249125,0.304753,...,0.0,0.0,0.924528,0.006289,0.069182,0.0,0.924528,0.002096,0.0,0.997904
3,170310102011,2.3,42778.0,712,1097.0,224,0.280774,0.719226,0.436694,0.314607,...,0.0,0.0,0.625442,0.0,0.371025,0.003534,0.625442,0.0,0.0,1.0
4,170310102012,2.69,39535.0,1424,1152.0,353,0.54293,0.45707,0.300637,0.247893,...,0.00406,0.002706,0.787551,0.0,0.211096,0.001353,0.787551,0.0,0.0,1.0


In [17]:
# Export assessment data block group
final_assessments.to_csv('data/final_assessments_block_group.csv', sep='\t')

# Block-Level Data Preparation

In [18]:
# Export assessment data at literal block level using address obfuscation 
# like the water set does. 

pipeline.show(assessments_in_chi)


Shape:
(728543, 17)

Data Types:
GEOID                                                object
geometry                                           geometry
index_right                                           int64
Property Address                                     object
PIN                                                   int64
Longitude                                           float64
Latitude                                            float64
Property Class                                        int64
Wall Material                                       float64
Roof Material                                       float64
Repair Condition                                    float64
Renovation                                          float64
Prior Tax Year Market Value Estimate (Land)         float64
Prior Tax Year Market Value Estimate (Building)     float64
Land Square Feet                                      int64
Building Square Feet                                float64
Age   

Unnamed: 0,GEOID,geometry,index_right,Property Address,PIN,Longitude,Latitude,Property Class,Wall Material,Roof Material,Repair Condition,Renovation,Prior Tax Year Market Value Estimate (Land),Prior Tax Year Market Value Estimate (Building),Land Square Feet,Building Square Feet,Age
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799426,7550 N SHERIDAN RD,11291010331123,-87.666674,42.022737,299,,,,,1700.0,22190.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799427,7550 N SHERIDAN RD,11291010331101,-87.666674,42.022737,299,,,,,21780.0,283620.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799356,7550 N SHERIDAN RD,11291010331130,-87.666674,42.022737,299,,,,,940.0,12320.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799428,7550 N SHERIDAN RD,11291010331118,-87.666674,42.022737,299,,,,,1180.0,15410.0,61111,,15
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799357,7550 N SHERIDAN RD,11291010331103,-87.666674,42.022737,299,,,,,24140.0,314380.0,61111,,15


In [19]:
assessments_in_chi['Address Obfuscated'] = (
    assessments_in_chi[['Property Address']]
    .replace(to_replace={
        '\d{2}(?=\s)': 'XX', 
        '(?<!\d)\d{1}(?=\s)': 'X'}, regex=True))
addresses = pipeline.group_count(
    df=assessments_in_chi, groupby='Address Obfuscated').sort_index()
pipeline.show(assessments_in_chi)


Shape:
(728543, 18)

Data Types:
GEOID                                                object
geometry                                           geometry
index_right                                           int64
Property Address                                     object
PIN                                                   int64
Longitude                                           float64
Latitude                                            float64
Property Class                                        int64
Wall Material                                       float64
Roof Material                                       float64
Repair Condition                                    float64
Renovation                                          float64
Prior Tax Year Market Value Estimate (Land)         float64
Prior Tax Year Market Value Estimate (Building)     float64
Land Square Feet                                      int64
Building Square Feet                                float64
Age   

Unnamed: 0,GEOID,geometry,index_right,Property Address,PIN,Longitude,Latitude,Property Class,Wall Material,Roof Material,Repair Condition,Renovation,Prior Tax Year Market Value Estimate (Land),Prior Tax Year Market Value Estimate (Building),Land Square Feet,Building Square Feet,Age,Address Obfuscated
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799426,7550 N SHERIDAN RD,11291010331123,-87.666674,42.022737,299,,,,,1700.0,22190.0,61111,,15,75XX N SHERIDAN RD
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799427,7550 N SHERIDAN RD,11291010331101,-87.666674,42.022737,299,,,,,21780.0,283620.0,61111,,15,75XX N SHERIDAN RD
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799356,7550 N SHERIDAN RD,11291010331130,-87.666674,42.022737,299,,,,,940.0,12320.0,61111,,15,75XX N SHERIDAN RD
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799428,7550 N SHERIDAN RD,11291010331118,-87.666674,42.022737,299,,,,,1180.0,15410.0,61111,,15,75XX N SHERIDAN RD
0,170310101001,"POLYGON ((-87.67009 42.02115, -87.67047 42.021...",799357,7550 N SHERIDAN RD,11291010331103,-87.666674,42.022737,299,,,,,24140.0,314380.0,61111,,15,75XX N SHERIDAN RD


In [20]:
# I thought about writing a function since I'm copying this code again, 
# but that would have taken away some of the pipeline.show() functionality
# which is important to see the first time through this notebook.  

# Again, dummify
assessments_block_lit_wide = (pd.get_dummies(
        data=assessments_in_chi, columns=cols_to_dummify, dummy_na=True)
    .drop(columns=['Property Address']))

cols = list(assessments_block_lit_wide.columns.values)
cols.pop(cols.index('Address Obfuscated'))
assessments_block_lit_wide = assessments_block_lit_wide[cols + ['Address Obfuscated']]

# First series of aggregations (all that don't use sum())
assessments_block_lit_1a = assessments_block_lit_wide.groupby(['Address Obfuscated']).agg({
    'PIN': 'count', 
    'Prior Tax Year Market Value Estimate (Land)': ['mean', 'median'], 
    'Prior Tax Year Market Value Estimate (Building)': ['mean', 'median'], 
    'Land Square Feet': ['mean', 'median'],
    'Building Square Feet': ['mean', 'median'], 
    'Age': ['mean', 'median']    
})
assessments_block_lit_1a

# Second series of aggregations (all that use sum())
assessments_block_lit_1b = (assessments_block_lit_wide
    .loc[:,'Property Class_200.0':'Address Obfuscated']
    .groupby('Address Obfuscated').sum())
assessments_block_lit_1b

Unnamed: 0_level_0,Property Class_200.0,Property Class_201.0,Property Class_202.0,Property Class_203.0,Property Class_204.0,Property Class_205.0,Property Class_206.0,Property Class_207.0,Property Class_208.0,Property Class_209.0,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
Address Obfuscated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100XX S ABERDEEN ST,0.0,0.0,7.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,0.0,0.0,37.0,0.0,0.0,0.0,0.0,37.0
100XX S ARTESIAN AVE,0.0,0.0,0.0,7.0,1.0,18.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,30.0
100XX S AVENUE L,0.0,2.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,1.0,31.0,0.0,3.0,0.0,0.0,35.0
100XX S AVENUE M,0.0,3.0,11.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,34.0,0.0,5.0,0.0,0.0,39.0
100XX S AVENUE N,0.0,1.0,7.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,15.0,1.0,2.0,0.0,0.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XX W MAPLE ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
XX W MARQUETTE RD,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0
XX W OAK ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,45.0,0.0,1.0,0.0,45.0,0.0,0.0,46.0
XX W ONTARIO ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,703.0,0.0,0.0,0.0,703.0,0.0,0.0,703.0


In [21]:
# Combine the two different data pieces
assessments_block_lit = assessments_block_lit_1a.join(
    other=assessments_block_lit_1b)

# Get the proportions for the columns that should have it, dividing by total residences
assessments_block_lit.loc[:,'Property Class_200.0':'Renovation_nan'] = (
    assessments_block_lit.loc[:,'Property Class_200.0':'Renovation_nan']
        .divide(other=assessments_block_lit[('PIN', 'count')], 
                axis=0))
final_assessments_block_lit = assessments_block_lit

# # Notice I don't export from ACS household median age
# # or the 'total' variables. 
# final_assessments_block_lit = (acs_g.loc[:,[
#     'GEOID', 'hh_size', 'med_income', 'occ_units', 'med_rent', 
#     'oo_hsng_un', 'perc_white', 'perc_non_w', 'perc_black', 'perc_owner']]
#     .merge(
#     right=assessments_block_lit,
#     left_on='GEOID', right_index=True))
pipeline.show(final_assessments_block_lit)


Shape:
(25034, 49)

Data Types:
(PIN, count)                                                   int64
(Prior Tax Year Market Value Estimate (Land), mean)          float64
(Prior Tax Year Market Value Estimate (Land), median)        float64
(Prior Tax Year Market Value Estimate (Building), mean)      float64
(Prior Tax Year Market Value Estimate (Building), median)    float64
(Land Square Feet, mean)                                     float64
(Land Square Feet, median)                                   float64
(Building Square Feet, mean)                                 float64
(Building Square Feet, median)                               float64
(Age, mean)                                                  float64
(Age, median)                                                float64
Property Class_200.0                                         float64
Property Class_201.0                                         float64
Property Class_202.0                                         float64
P



Unnamed: 0_level_0,"(PIN, count)","(Prior Tax Year Market Value Estimate (Land), mean)","(Prior Tax Year Market Value Estimate (Land), median)","(Prior Tax Year Market Value Estimate (Building), mean)","(Prior Tax Year Market Value Estimate (Building), median)","(Land Square Feet, mean)","(Land Square Feet, median)","(Building Square Feet, mean)","(Building Square Feet, median)","(Age, mean)",...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
Address Obfuscated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100XX S ABERDEEN ST,37,27376.216216,24500.0,80191.081081,80770.0,3650.432432,3267.0,1044.513514,1056.0,76.783784,...,0.054054,0.054054,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
100XX S ARTESIAN AVE,30,48554.666667,47500.0,213167.0,204625.0,5111.066667,5000.0,1509.2,1345.0,68.766667,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
100XX S AVENUE L,35,24846.285714,25000.0,65371.714286,73960.0,3105.857143,3125.0,2019.8125,2001.5,112.2,...,0.0,0.0,0.085714,0.028571,0.885714,0.0,0.085714,0.0,0.0,1.0
100XX S AVENUE M,39,26409.74359,25000.0,59992.051282,58410.0,3301.25641,3125.0,1488.411765,1195.0,110.307692,...,0.0,0.0,0.128205,0.0,0.871795,0.0,0.128205,0.0,0.0,1.0
100XX S AVENUE N,18,26388.888889,25000.0,57832.222222,54750.0,3298.611111,3125.0,1729.6875,1027.0,118.222222,...,0.0,0.0,0.111111,0.0,0.833333,0.055556,0.111111,0.0,0.0,1.0


In [46]:
final_assessments_block_lit[[('PIN', 'count')]].describe()

NameError: name 'final_assessments_block_lit' is not defined

In [23]:
# Export assessment data literal block
final_assessments_block_lit.to_csv('data/final_assessments_block_lit.csv', sep='\t')

# Block Group Preliminary Modeling

In [48]:
# Valeria's New DF: 
water_clean = pd.read_csv('data/final_df.csv', index_col=0)
print(water_clean['threshold_high'].sum())
print(water_clean['threshold_medium'].sum())
pipeline.show(water_clean)

42
744

Shape:
(2194, 64)

Data Types:
GEOID                                                              int64
avg_reading_all                                                  float64
max_reading_all                                                  float64
sample_cnt                                                       float64
hh_size                                                          float64
med_income                                                       float64
occ_units                                                        float64
med_rent                                                         float64
oo_hsng_un                                                       float64
perc_white                                                       float64
perc_non_w                                                       float64
perc_black                                                       float64
perc_owner                                                       float64
tot_pop     

Unnamed: 0,GEOID,avg_reading_all,max_reading_all,sample_cnt,hh_size,med_income,occ_units,med_rent,oo_hsng_un,perc_white,...,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan,threshold_high,threshold_medium
0,170310101001,2.133333,2.6,7.0,1.95,,236.0,873.0,117.0,0.574837,...,0.801619,0.004049,0.190283,0.004049,0.801619,0.0,0.0,1.0,0,0
1,170310101002,,,0.0,2.26,21827.0,1054.0,799.0,53.0,0.307756,...,0.817568,0.0,0.175676,0.006757,0.817568,0.0,0.0,1.0,0,0
2,170310101003,1.193333,1.58,10.0,1.5,54297.0,1073.0,1071.0,327.0,0.663361,...,0.924528,0.006289,0.069182,0.0,0.924528,0.002096,0.0,0.997904,0,0
3,170310102011,2.794444,4.016667,12.0,2.3,42778.0,712.0,1097.0,224.0,0.280774,...,0.625442,0.0,0.371025,0.003534,0.625442,0.0,0.0,1.0,0,0
4,170310102012,1.716667,2.533333,18.0,2.69,39535.0,1424.0,1152.0,353.0,0.54293,...,0.787551,0.0,0.211096,0.001353,0.787551,0.0,0.0,1.0,0,0


In [52]:
water_clean['sample_cnt'].describe()

KeyError: 'sample_cnt'

In [73]:
# James Work: 
water_clean = gpd.read_file('data/water_quality_clean.shp')
water_clean = water_clean.set_crs('EPSG:4326')
pipeline.show(water_clean)


Shape:
(22668, 13)

Data Types:
Date Sampl      object
Address         object
1st Draw       float64
2-3 Minute     float64
5 Minute       float64
location        object
latitude       float64
longitude      float64
avg_readin     float64
max_readin     float64
t_high           int64
t_med            int64
geometry      geometry
dtype: object


Unnamed: 0,Date Sampl,Address,1st Draw,2-3 Minute,5 Minute,location,latitude,longitude,avg_readin,max_readin,t_high,t_med,geometry
0,2020-01-26 07:42:00,"1 N Bishop St, Chicago, IL, USA",1.0,1.0,1.0,"1, North Bishop Street, Near West Side, Chicag...",41.881667,-87.663587,1.0,1.0,0,0,POINT (-87.66359 41.88167)
1,2016-11-08 05:30:00,"10 N Lasalle St, Chicago, IL, USA",1.0,1.0,1.0,"10, North LaSalle Street, Loop, Chicago, Cook ...",41.882197,-87.632477,1.0,1.0,0,0,POINT (-87.63248 41.88220)
2,2019-11-07 06:18:00,"1 W Brayton St, Chicago, IL, USA",1.0,6.9,1.0,"1, West Brayton Street, Cookes Subdivision, We...",41.665199,-87.622381,2.966667,6.9,0,1,POINT (-87.62238 41.66520)
3,2020-06-07 07:58:00,"1 W Superior St, Chicago, IL, USA",1.0,1.0,1.0,"One Superior Place, 1, West Superior Street, C...",41.895201,-87.628955,1.0,1.0,0,0,POINT (-87.62896 41.89520)
4,2020-10-26 07:45:00,"1 W Superior St, Chicago, IL, USA",1.0,1.0,1.0,"One Superior Place, 1, West Superior Street, C...",41.895201,-87.628955,1.0,1.0,0,0,POINT (-87.62896 41.89520)


In [99]:
water_block_group_temp = gpd.sjoin(left_df=acs_g, right_df=water_clean, how='inner', op='intersects')
print(water_block_group_temp['t_high'].sum())
pipeline.show(water_block_group_temp)

774

Shape:
(22023, 32)

Data Types:
FIPS            float64
Geographic       object
Census Tra        int64
Block Grou        int64
tot_pop           int64
tot_white         int64
tot_black         int64
hh_size         float64
med_income      float64
occ_units         int64
med_built_      float64
med_rent        float64
oo_hsng_un        int64
GEOID            object
perc_white      float64
perc_non_w      float64
perc_black      float64
perc_owner      float64
geometry       geometry
index_right       int64
Date Sampl       object
Address          object
1st Draw        float64
2-3 Minute      float64
5 Minute        float64
location         object
latitude        float64
longitude       float64
avg_readin      float64
max_readin      float64
t_high            int64
t_med             int64
dtype: object


Unnamed: 0,FIPS,Geographic,Census Tra,Block Grou,tot_pop,tot_white,tot_black,hh_size,med_income,occ_units,...,1st Draw,2-3 Minute,5 Minute,location,latitude,longitude,avg_readin,max_readin,t_high,t_med
0,170000000000.0,15000US170310101001,10100,1,461,265,108,1.95,,236,...,1.0,1.1,1.1,"1500, West Jonquil Terrace, Rogers Park, Chica...",42.02122,-87.668639,1.066667,1.1,0,0
0,170000000000.0,15000US170310101001,10100,1,461,265,108,1.95,,236,...,1.0,1.0,1.0,"1500, West Jonquil Terrace, Rogers Park, Chica...",42.02122,-87.668639,1.0,1.0,0,0
0,170000000000.0,15000US170310101001,10100,1,461,265,108,1.95,,236,...,1.0,1.0,1.0,"1500, West Jonquil Terrace, Rogers Park, Chica...",42.02122,-87.668639,1.0,1.0,0,0
0,170000000000.0,15000US170310101001,10100,1,461,265,108,1.95,,236,...,1.0,1.9,1.0,"1400, West Juneway Terrace, Rogers Park, Chica...",42.022314,-87.667019,1.3,1.9,0,0
0,170000000000.0,15000US170310101001,10100,1,461,265,108,1.95,,236,...,4.8,5.2,4.0,"1500, West Juneway Terrace, Rogers Park, Chica...",42.022475,-87.66838,4.666667,5.2,0,1


In [None]:
water_block_group = water_block_group_temp.groupby('GEOID').agg({
    'Address': 'count', 
    't_high': 'sum', 
    't_med': 'sum', 
    '1st Draw': 'mean', 
    '2-3 Minute': 'mean', 
    '5 Minute': 'mean'
})
pipeline.show(water_block_group)

In [95]:
water_clean[['t_high', 't_med']].describe()

Unnamed: 0,t_high,t_med
count,22668.0,22668.0
mean,0.044424,0.335539
std,0.206039,0.472189
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,1.0
max,1.0,1.0


In [56]:
water_block_group['t_high_binary'] = np.where(
    water_block_group['t_high'] >= 1, 1, 0)
water_block_group['t_med_binary'] = np.where(
    water_block_group['t_med'] >= 1, 1, 0)

print(water_block_group['t_high_binary'].sum())
print(water_block_group['t_med_binary'].sum())
water_block_group

543
1524


Unnamed: 0_level_0,Address,t_high,t_med,1st Draw,2-3 Minute,5 Minute,t_high_binary,t_med_binary
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
170310101001,7,0,2,2.071429,2.600000,1.728571,0,1
170310101003,10,0,0,1.320000,1.260000,1.000000,0,0
170310102011,12,0,4,3.000000,3.675000,1.708333,0,1
170310102012,18,0,3,1.577778,2.283333,1.288889,0,1
170310102013,13,0,3,2.184615,2.261538,1.592308,0,1
...,...,...,...,...,...,...,...,...
170318439002,3,0,0,1.233333,1.466667,1.600000,0,0
170318439003,5,0,0,1.000000,1.000000,1.000000,0,0
170318439004,1,0,0,1.400000,1.100000,1.000000,0,0
170318439005,2,0,0,2.400000,1.550000,1.850000,0,0


In [57]:
water_block_group = pd.merge(
    left=water_block_group, right=final_assessments, 
    left_index=True, right_on='GEOID', how='inner').set_index('GEOID')
water_classification = water_block_group.drop(columns=[
    'Address', 't_high', 't_med', '1st Draw', '2-3 Minute', '5 Minute'])
water_classification

Unnamed: 0_level_0,t_high_binary,t_med_binary,hh_size,med_income,occ_units,med_rent,oo_hsng_un,perc_white,perc_non_w,perc_black,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170310101001,0,1,1.95,,236,873.0,117,0.574837,0.425163,0.234273,...,0.008097,0.000000,0.801619,0.004049,0.190283,0.004049,0.801619,0.000000,0.0,1.000000
170310101003,0,0,1.50,54297.0,1073,1071.0,327,0.663361,0.336639,0.249125,...,0.000000,0.000000,0.924528,0.006289,0.069182,0.000000,0.924528,0.002096,0.0,0.997904
170310102011,0,1,2.30,42778.0,712,1097.0,224,0.280774,0.719226,0.436694,...,0.000000,0.000000,0.625442,0.000000,0.371025,0.003534,0.625442,0.000000,0.0,1.000000
170310102012,0,1,2.69,39535.0,1424,1152.0,353,0.542930,0.457070,0.300637,...,0.004060,0.002706,0.787551,0.000000,0.211096,0.001353,0.787551,0.000000,0.0,1.000000
170310102013,0,1,2.99,52948.0,611,1023.0,114,0.465461,0.534539,0.326206,...,0.000000,0.009174,0.694190,0.000000,0.305810,0.000000,0.694190,0.000000,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170318438002,0,1,2.26,31635.0,326,933.0,233,0.000000,1.000000,0.989160,...,0.015936,0.015936,0.027888,0.000000,0.944223,0.027888,0.027888,0.000000,0.0,1.000000
170318439002,0,0,2.02,,213,841.0,27,0.026201,0.973799,0.973799,...,0.000000,0.000000,0.352113,0.000000,0.647887,0.000000,0.352113,0.014085,0.0,0.985915
170318439003,0,0,1.22,34079.0,640,1103.0,311,0.125160,0.874840,0.810983,...,0.002421,0.000000,0.995157,0.000000,0.004843,0.000000,0.995157,0.000000,0.0,1.000000
170318439004,0,0,1.65,,319,743.0,158,0.028517,0.971483,0.918251,...,0.000000,0.000000,0.855263,0.000000,0.125000,0.019737,0.855263,0.000000,0.0,1.000000


In [58]:
water_classification = pipeline.impute_missing(water_classification)
# Of note, all property class 299 (condominiums) have NaN for building square feet, 
# but many other property classes do as well. 
# Total Building Square Feet was unfortunately all NaN for Property Class 299, despite 
# what the documentation said. 

Contains NA Values:
t_high_binary                                                False
t_med_binary                                                 False
hh_size                                                       True
med_income                                                    True
occ_units                                                    False
med_rent                                                      True
oo_hsng_un                                                   False
perc_white                                                   False
perc_non_w                                                   False
perc_black                                                   False
perc_owner                                                    True
(PIN, count)                                                 False
(Prior Tax Year Market Value Estimate (Land), mean)          False
(Prior Tax Year Market Value Estimate (Land), median)        False
(Prior Tax Year Market Value Estimate (Bui

In [59]:
scaler = pipeline.normalized_values(
    water_classification, ignore=['t_high_binary', 't_med_binary'], quiet=True)

In [60]:
pipeline.normalize(
    water_classification, scaler, ignore=['t_high_binary', 't_med_binary'], 
    inplace=True)

Unnamed: 0_level_0,t_high_binary,t_med_binary,hh_size,med_income,occ_units,med_rent,oo_hsng_un,perc_white,perc_non_w,perc_black,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170310101001,0,1,-1.005069,-0.234237,-0.852549,-0.761010,-0.660678,0.296896,-0.296896,-0.271678,...,0.276038,-0.406439,1.660715,0.024643,-1.650931,-0.207227,1.660715,-0.417538,-0.022076,0.417348
170310101003,0,0,-1.658708,-0.280366,1.887661,-0.215095,0.592835,0.559884,-0.559884,-0.234747,...,-0.417102,-0.406439,2.044203,0.235293,-2.028067,-0.556948,2.044203,0.017115,-0.022076,-0.016810
170310102011,0,1,-0.496683,-0.597590,0.705802,-0.143409,-0.021984,-0.576720,0.576720,0.231678,...,-0.417102,-0.406439,1.111021,-0.355965,-1.088062,-0.251715,1.111021,-0.417538,-0.022076,0.417348
170310102012,0,1,0.069804,-0.686900,3.036781,0.008234,0.748032,0.202104,-0.202104,-0.106652,...,-0.069594,-0.230854,1.616819,-0.355965,-1.586116,-0.440059,1.616819,-0.417538,-0.022076,0.417348
170310102013,0,1,0.505563,-0.317516,0.375143,-0.347438,-0.678586,-0.028045,0.028045,-0.043070,...,-0.417102,0.188779,1.325522,-0.355965,-1.291154,-0.556948,1.325522,-0.417538,-0.022076,0.417348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170318438002,0,1,-0.554785,-0.904460,-0.557903,-0.595581,0.031738,-1.410855,1.410855,1.605485,...,0.947087,0.627486,-0.753408,-0.355965,0.697008,1.852083,-0.753408,-0.417538,-0.022076,0.417348
170318439002,0,0,-0.903392,-0.234237,-0.927848,-0.849239,-1.197898,-1.333016,1.333016,1.567288,...,-0.417102,-0.406439,0.258206,-0.355965,-0.225849,-0.556948,0.258206,2.502594,-0.022076,-2.499459
170318439003,0,0,-2.065416,-0.837154,0.470085,-0.126866,0.497329,-1.039025,1.039025,1.162417,...,-0.209831,-0.406439,2.264573,-0.355965,-2.228436,-0.556948,2.264573,-0.417538,-0.022076,0.417348
170318439004,0,0,-1.440828,-0.234237,-0.580820,-1.119439,-0.415945,-1.326135,1.326135,1.429157,...,-0.417102,-0.406439,1.828089,-0.355965,-1.854239,1.147940,1.828089,-0.417538,-0.022076,0.417348


In [61]:
water_classification

Unnamed: 0_level_0,t_high_binary,t_med_binary,hh_size,med_income,occ_units,med_rent,oo_hsng_un,perc_white,perc_non_w,perc_black,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170310101001,0,1,1.95,55972.0,236,873.0,117,0.574837,0.425163,0.234273,...,0.008097,0.000000,0.801619,0.004049,0.190283,0.004049,0.801619,0.000000,0.0,1.000000
170310101003,0,0,1.50,54297.0,1073,1071.0,327,0.663361,0.336639,0.249125,...,0.000000,0.000000,0.924528,0.006289,0.069182,0.000000,0.924528,0.002096,0.0,0.997904
170310102011,0,1,2.30,42778.0,712,1097.0,224,0.280774,0.719226,0.436694,...,0.000000,0.000000,0.625442,0.000000,0.371025,0.003534,0.625442,0.000000,0.0,1.000000
170310102012,0,1,2.69,39535.0,1424,1152.0,353,0.542930,0.457070,0.300637,...,0.004060,0.002706,0.787551,0.000000,0.211096,0.001353,0.787551,0.000000,0.0,1.000000
170310102013,0,1,2.99,52948.0,611,1023.0,114,0.465461,0.534539,0.326206,...,0.000000,0.009174,0.694190,0.000000,0.305810,0.000000,0.694190,0.000000,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170318438002,0,1,2.26,31635.0,326,933.0,233,0.000000,1.000000,0.989160,...,0.015936,0.015936,0.027888,0.000000,0.944223,0.027888,0.027888,0.000000,0.0,1.000000
170318439002,0,0,2.02,55972.0,213,841.0,27,0.026201,0.973799,0.973799,...,0.000000,0.000000,0.352113,0.000000,0.647887,0.000000,0.352113,0.014085,0.0,0.985915
170318439003,0,0,1.22,34079.0,640,1103.0,311,0.125160,0.874840,0.810983,...,0.002421,0.000000,0.995157,0.000000,0.004843,0.000000,0.995157,0.000000,0.0,1.000000
170318439004,0,0,1.65,55972.0,319,743.0,158,0.028517,0.971483,0.918251,...,0.000000,0.000000,0.855263,0.000000,0.125000,0.019737,0.855263,0.000000,0.0,1.000000


In [62]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.pipeline import Pipeline as pipe_sklearn
training, test = train_test_split(water_classification, test_size=0.2, random_state=0)
training_high_x, training_high_y = (training.drop(columns=['t_high_binary', 't_med_binary']),
                          training['t_high_binary'])
test_high_x, test_high_y = (test.drop(columns=['t_high_binary', 't_med_binary']), 
                        test['t_high_binary'])
training_med_x, training_med_y = (training.drop(columns=['t_high_binary', 't_med_binary']),
                          training['t_med_binary'])
test_med_x, test_med_y = (test.drop(columns=['t_high_binary', 't_med_binary']), 
                        test['t_med_binary'])

In [86]:
pipe = pipe_sklearn([
    ('LogisticRegression', LogisticRegression(max_iter=1000))
])
params = {
    'LogisticRegression__penalty': ['l1', 'l2', 'none'], 
    'LogisticRegression__solver': ['lbfgs', 'liblinear'],
    'LogisticRegression__C': [1000, 100, 10, 1, 0.1, 0.01, 0.001]
}
k = 10
grid_model_high = GridSearchCV(estimator=pipe, 
                          param_grid=params, 
                          cv=10, 
                          scoring=['accuracy', 'precision', 'recall'], 
                          refit='accuracy')
grid_model_high_result = grid_model.fit(training_high_x, training_high_y)

Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everything\python\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everythi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everything\python\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 454, in _check_solver
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everything\python\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 454, in _check_solver
    raise ValueError(
ValueError: penalty='none' is not supported for the liblinear solver

Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everything\python\lib\site-packages\sklearn\pipeline.py", line 346, in f

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [87]:
results_high_df = pd.DataFrame.from_dict(grid_model_high_result.cv_results_)[[
    'param_LogisticRegression__C', 'param_LogisticRegression__penalty',
    'param_LogisticRegression__solver',
    'mean_test_accuracy', 'rank_test_accuracy', 'mean_test_precision', 
    'rank_test_precision', 'mean_test_recall', 'rank_test_recall'
]]
# Gets rid of models where the solvers and params are incompatible
results_high_df.dropna(axis=0) 

Unnamed: 0,param_LogisticRegression__C,param_LogisticRegression__penalty,param_LogisticRegression__solver,mean_test_accuracy,rank_test_accuracy,mean_test_precision,rank_test_precision,mean_test_recall,rank_test_recall
1,1000.0,l1,liblinear,0.750318,5,0.558673,18,0.245349,2
2,1000.0,l2,lbfgs,0.748496,10,0.611291,6,0.132082,28
3,1000.0,l2,liblinear,0.742395,21,0.535575,21,0.168975,6
4,1000.0,none,lbfgs,0.746061,12,0.607407,7,0.134355,21
7,100.0,l1,liblinear,0.750314,7,0.559249,17,0.245349,2
8,100.0,l2,lbfgs,0.750325,3,0.627149,4,0.143552,15
9,100.0,l2,liblinear,0.738744,27,0.514358,26,0.1574,13
10,100.0,none,lbfgs,0.746061,12,0.607407,7,0.134355,21
13,10.0,l1,liblinear,0.753969,1,0.571714,15,0.247674,1
14,10.0,l2,lbfgs,0.750325,3,0.627983,3,0.143552,15


In [89]:
p = grid_model_high_result.best_params_
test_score = grid_model_high_result.score(
    X=test_high_x,
    y=test_high_y)
print(f'''
The best model was a Logistic Regression with the following parameters: 
C: {p['LogisticRegression__C']}
Penalty: {p['LogisticRegression__penalty']}
Solver: {p['LogisticRegression__solver']}

It had a mean validation set accuracy of {round(grid_model.best_score_, 4)}.
On the test data it had an accuracy of {round(test_score, 4)}.

This model is index #{grid_model_high_result.best_index_} in the below table. 
''')


The best model was a Logistic Regression with the following parameters: 
C: 10
Penalty: l1
Solver: liblinear

It had a mean validation set accuracy of 0.754.
On the test data it had an accuracy of 0.7664.

This model is index #13 in the below table. 



In [90]:
best_model_high = LogisticRegression(C=10, penalty='l1', solver='liblinear')
best_model_high.fit(X=training_high_x, y=training_high_y)
pipeline.print_coefs(
    best_model_high, df=training.drop(columns=['t_med_binary']), 
    target='t_high_binary', n=100)

Target:
t_high_binary

Intercept:
[0.]

Features and Coefficients:
Property Class_208.0                                         6.643132
Renovation_1.0                                               4.638053
Repair Condition_1.0                                         2.204748
Property Class_204.0                                         2.081024
Property Class_205.0                                         1.865777
Property Class_278.0                                         1.142129
Property Class_234.0                                         0.824083
Wall Material_4.0                                            0.630615
perc_black                                                   0.495236
Property Class_210.0                                         0.118201
perc_owner                                                   0.052230
(Age, mean)                                                  0.005175
(Age, median)                                                0.004491
oo_hsng_un             

In [91]:
grid_model_med = GridSearchCV(estimator=pipe, 
                          param_grid=params, 
                          cv=10, 
                          scoring=['accuracy', 'precision', 'recall'], 
                          refit='accuracy')
grid_model_med_result = grid_model.fit(training_med_x, training_med_y)

Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everything\python\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everythi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "d:\everything\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\everything\python\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\everything\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 454, in _check_solver
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,param_LogisticRegression__C,param_LogisticRegression__penalty,param_LogisticRegression__solver,mean_test_accuracy,rank_test_accuracy,mean_test_precision,rank_test_precision,mean_test_recall,rank_test_recall
1,1000.0,l1,liblinear,0.794775,3,0.827609,2,0.914907,28
2,1000.0,l2,lbfgs,0.772853,28,0.790744,26,0.945169,22
3,1000.0,l2,liblinear,0.778936,12,0.792891,12,0.951719,4
4,1000.0,none,lbfgs,0.773463,19,0.790914,18,0.945988,14
7,100.0,l1,liblinear,0.796604,1,0.828896,1,0.915727,27
8,100.0,l2,lbfgs,0.773463,19,0.791309,16,0.945169,22
9,100.0,l2,liblinear,0.780148,8,0.79393,10,0.951719,4
10,100.0,none,lbfgs,0.773463,19,0.790914,18,0.945988,14
13,10.0,l1,liblinear,0.794767,4,0.826623,3,0.916547,26
14,10.0,l2,lbfgs,0.773463,19,0.789336,28,0.949267,12


In [93]:
p = grid_model_med_result.best_params_
test_score = grid_model_med_result.score(
    X=test_med_x,
    y=test_med_y)
print(f'''
The best model was a Logistic Regression with the following parameters: 
C: {p['LogisticRegression__C']}
Penalty: {p['LogisticRegression__penalty']}
Solver: {p['LogisticRegression__solver']}

It had a mean validation set accuracy of {round(grid_model.best_score_, 4)}.
On the test data it had an accuracy of {round(test_score, 4)}.

This model is index #{grid_model_med_result.best_index_} in the below table. 
''')


The best model was a Logistic Regression with the following parameters: 
C: 100
Penalty: l1
Solver: liblinear

It had a mean validation set accuracy of 0.7966.
On the test data it had an accuracy of 0.7956.

This model is index #7 in the below table. 



In [94]:
results_med_df = pd.DataFrame.from_dict(grid_model_med_result.cv_results_)[[
    'param_LogisticRegression__C', 'param_LogisticRegression__penalty',
    'param_LogisticRegression__solver',
    'mean_test_accuracy', 'rank_test_accuracy', 'mean_test_precision', 
    'rank_test_precision', 'mean_test_recall', 'rank_test_recall'
]]
# Gets rid of models where the solvers and params are incompatible
results_med_df.dropna(axis=0) 

Unnamed: 0,param_LogisticRegression__C,param_LogisticRegression__penalty,param_LogisticRegression__solver,mean_test_accuracy,rank_test_accuracy,mean_test_precision,rank_test_precision,mean_test_recall,rank_test_recall
1,1000.0,l1,liblinear,0.794775,3,0.827609,2,0.914907,28
2,1000.0,l2,lbfgs,0.772853,28,0.790744,26,0.945169,22
3,1000.0,l2,liblinear,0.778936,12,0.792891,12,0.951719,4
4,1000.0,none,lbfgs,0.773463,19,0.790914,18,0.945988,14
7,100.0,l1,liblinear,0.796604,1,0.828896,1,0.915727,27
8,100.0,l2,lbfgs,0.773463,19,0.791309,16,0.945169,22
9,100.0,l2,liblinear,0.780148,8,0.79393,10,0.951719,4
10,100.0,none,lbfgs,0.773463,19,0.790914,18,0.945988,14
13,10.0,l1,liblinear,0.794767,4,0.826623,3,0.916547,26
14,10.0,l2,lbfgs,0.773463,19,0.789336,28,0.949267,12


In [100]:
final_assessments

Unnamed: 0,GEOID,hh_size,med_income,occ_units,med_rent,oo_hsng_un,perc_white,perc_non_w,perc_black,perc_owner,...,Roof Material_5.0,Roof Material_6.0,Roof Material_nan,Repair Condition_1.0,Repair Condition_2.0,Repair Condition_3.0,Repair Condition_nan,Renovation_1.0,Renovation_2.0,Renovation_nan
0,170310101001,1.95,,236,873.0,117,0.574837,0.425163,0.234273,0.495763,...,0.008097,0.000000,0.801619,0.004049,0.190283,0.004049,0.801619,0.000000,0.0,1.000000
1,170310101002,2.26,21827.0,1054,799.0,53,0.307756,0.692244,0.636551,0.050285,...,0.006757,0.000000,0.817568,0.000000,0.175676,0.006757,0.817568,0.000000,0.0,1.000000
2,170310101003,1.50,54297.0,1073,1071.0,327,0.663361,0.336639,0.249125,0.304753,...,0.000000,0.000000,0.924528,0.006289,0.069182,0.000000,0.924528,0.002096,0.0,0.997904
3,170310102011,2.30,42778.0,712,1097.0,224,0.280774,0.719226,0.436694,0.314607,...,0.000000,0.000000,0.625442,0.000000,0.371025,0.003534,0.625442,0.000000,0.0,1.000000
4,170310102012,2.69,39535.0,1424,1152.0,353,0.542930,0.457070,0.300637,0.247893,...,0.004060,0.002706,0.787551,0.000000,0.211096,0.001353,0.787551,0.000000,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2187,170318439001,2.13,,364,1009.0,11,0.047254,0.952746,0.923372,0.030220,...,0.000000,0.000000,0.705263,0.000000,0.273684,0.021053,0.705263,0.000000,0.0,1.000000
2188,170318439002,2.02,,213,841.0,27,0.026201,0.973799,0.973799,0.126761,...,0.000000,0.000000,0.352113,0.000000,0.647887,0.000000,0.352113,0.014085,0.0,0.985915
2189,170318439003,1.22,34079.0,640,1103.0,311,0.125160,0.874840,0.810983,0.485938,...,0.002421,0.000000,0.995157,0.000000,0.004843,0.000000,0.995157,0.000000,0.0,1.000000
2190,170318439004,1.65,,319,743.0,158,0.028517,0.971483,0.918251,0.495298,...,0.000000,0.000000,0.855263,0.000000,0.125000,0.019737,0.855263,0.000000,0.0,1.000000
