In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(0,'..')

import matplotlib.pyplot as plt
import os.path
import prepare, prepare_sso
import seaborn as sns

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, precision_score, \
                            classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

plt.rc('figure', figsize=(16,10))
plt.rc('font', size=14)
pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_columns', 50)

In [2]:
mf = prepare.filter_sso_features()
mf.head(1)

Unnamed: 0,SSO_ID,REPORTDATE,SPILL_ADDRESS,SPILL_ST_NAME,TOTAL_GAL,GALSRET,SPILL_START,SPILL_STOP,HRS,CAUSE,COMMENTS,ACTIONS,WATERSHED,UNITID,UNITID2,DISCHARGE_TO,DISCHARGE_ROUTE,COUNCIL_DISTRICT,Month,Year,Week,EARZ_ZONE,PIPEDIAM,PIPELEN,PIPETYPE,INSTYEAR,Inches_No,RainFall_Less3,SPILL ADDRESS,NUM_SPILLS_COMPKEY,NUM_SPILLS_24MOS,PREVSPILL_24MOS,UNITTYPE,ASSETTYPE,LASTCLND,ResponseTime,ResponseDTTM,Public Notice,Root_Cause,HRS_2,GAL_2,HRS_3,GAL_3
0,6582,2019-03-10 00:00:00,3200,THOUSAND OAKS DR,2100,2100.0,3/10/2019 1:16:00 PM,3/10/2019 2:40:00 PM,1.4,Grease,"Spill ContainedReturned to SystemArea Cleaned and DisinfectedFlushed Area with H2O, Unstopped Main,",CLEANED MAIN,SALADO CREEK,66918,66917,STREET,,,3,2019,11,0.0,8.0,16.55,PVC,1997.0,,,3200 THOUSAND OAKS DR,1,1.0,,GRAVITY,Sewer Main,,0.45,2019-03-10 13:43:00,False,,0.0,0.0,0.0,0.0


In [3]:
mf['SSO_ID'].astype(str)

0       6582
1       6583
2       6581
3       6584
4       6580
        ... 
3178     371
3179     372
3180     373
3181     375
3182     374
Name: SSO_ID, Length: 3183, dtype: object

In [4]:
string_features = ['SSO_ID','SPILL_ADDRESS','COUNCIL_DISTRICT',]
for col in string_features:
    mf[col] = mf[col].astype(str)

In [6]:
mf = prepare.prepare_sso_df2()
mf.head(1)

Unnamed: 0,sso_id,report_date,spill_address_num,spill_st_name,total_gal,gals_ret,spill_start,spill_stop,hrs,cause,comments,actions,watershed,unit_id,unit_id2,discharge_to,discharge_route,council_district,month,year,week,earz_zone,pipe_diam,pipe_len,pipe_type,inst_year,inches_no,rainfall_last3,spill_address_full,num_spills_recorded,num_spills_24mos,prevspill_24mos,unit_type,asset_type,last_cleaned,response_time,response_dttm,public_notice,root_cause,hrs_2,gal_2,hrs_3,gal_3,days_since_cleaned
0,6582,2019-03-10,3200,THOUSAND OAKS DR,2100,2100.0,2019-03-10 13:16:00,2019-03-10 14:40:00,1.4,Grease,"Spill ContainedReturned to SystemArea Cleaned and DisinfectedFlushed Area with H2O, Unstopped Main,",CLEANED MAIN,SALADO CREEK,66918,66917,STREET,,,3,2019,11,0.0,8.0,16.55,PVC,1997.0,,,3200 THOUSAND OAKS DR,1,1.0,0,GRAVITY,Sewer Main,NaT,27.0,2019-03-10 13:43:00,False,,0.0,0.0,0.0,0.0,


In [13]:
df = mf[:20]

In [15]:
df.head(1)

Unnamed: 0,sso_id,report_date,spill_address_num,spill_st_name,total_gal,gals_ret,spill_start,spill_stop,hrs,cause,comments,actions,watershed,unit_id,unit_id2,discharge_to,discharge_route,council_district,month,year,week,earz_zone,pipe_diam,pipe_len,pipe_type,inst_year,inches_no,rainfall_last3,spill_address_full,num_spills_recorded,num_spills_24mos,prevspill_24mos,unit_type,asset_type,last_cleaned,response_time,response_dttm,public_notice,root_cause,hrs_2,gal_2,hrs_3,gal_3,days_since_cleaned
0,6582,2019-03-10,3200,THOUSAND OAKS DR,2100,2100.0,2019-03-10 13:16:00,2019-03-10 14:40:00,1.4,Grease,"Spill ContainedReturned to SystemArea Cleaned and DisinfectedFlushed Area with H2O, Unstopped Main,",CLEANED MAIN,SALADO CREEK,66918,66917,STREET,,,3,2019,11,0.0,8.0,16.55,PVC,1997.0,,,3200 THOUSAND OAKS DR,1,1.0,0,GRAVITY,Sewer Main,NaT,27.0,2019-03-10 13:43:00,False,,0.0,0.0,0.0,0.0,


In [18]:
df.spill_address_full = df.spill_address_full+', San Antonio, TX, USA'

In [19]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=.1, 
                    max_retries=10, error_wait_seconds=1)
df['location'] = df['spill_address_full'].apply(geocode)
df['zip_code'] = 'None'
for t,l in enumerate(df.location):
    if l is not None:
        df['zip_code'][t] = l.raw['display_name'].split(',')[-2]

In [20]:
df.head()

Unnamed: 0,sso_id,report_date,spill_address_num,spill_st_name,total_gal,gals_ret,spill_start,spill_stop,hrs,cause,comments,actions,watershed,unit_id,unit_id2,discharge_to,discharge_route,council_district,month,year,week,earz_zone,pipe_diam,pipe_len,pipe_type,inst_year,inches_no,rainfall_last3,spill_address_full,num_spills_recorded,num_spills_24mos,prevspill_24mos,unit_type,asset_type,last_cleaned,response_time,response_dttm,public_notice,root_cause,hrs_2,gal_2,hrs_3,gal_3,days_since_cleaned,location,zip_code
0,6582,2019-03-10,3200,THOUSAND OAKS DR,2100,2100.0,2019-03-10 13:16:00,2019-03-10 14:40:00,1.4,Grease,"Spill ContainedReturned to SystemArea Cleaned and DisinfectedFlushed Area with H2O, Unstopped Main,",CLEANED MAIN,SALADO CREEK,66918,66917,STREET,,,3,2019,11,0.0,8.0,16.55,PVC,1997.0,,,"3200 THOUSAND OAKS DR, San Antonio, TX, USA",1,1.0,0,GRAVITY,Sewer Main,NaT,27.0,2019-03-10 13:43:00,False,,0.0,0.0,0.0,0.0,,"(3200, Thousand Oaks Drive, Horseshoe Bend, Lost Creek, Austin, Travis County, Texas, 78746, United States of America, (30.2659455, -97.80399402173913))",78746
1,6583,2019-03-10,6804,S FLORES ST,80,0.0,2019-03-10 14:25:00,2019-03-10 15:45:00,1.333333,Grease,Spill ContainedArea Cleaned and Disinfected,CLEANED MAIN,DOS RIOS,24250,24193,STORMDRAIN,,3.0,3,2019,11,0.0,8.0,157.0,PVC,1988.0,,,"6804 S FLORES, San Antonio, TX, USA",1,1.0,0,GRAVITY,Sewer Main,NaT,65.0,2019-03-10 15:30:00,False,,0.0,0.0,0.0,0.0,,"(Flores, West Odessa, Ector County, Texas, United States of America, (31.8547839, -102.5017913))",Texas
2,6581,2019-03-09,215,AUDREY ALENE DR,79,0.0,2019-03-09 18:00:00,2019-03-09 19:30:00,1.5,Structural,"Spill ContainedArea Cleaned and DisinfectedFlushed Area with H2O, Unstopped Main,",CLEANED MAIN,DOS RIOS,2822,3351,ALLEY,,1.0,3,2019,10,0.0,8.0,350.0,CP,1955.0,,,"215 Audrey Alene Dr, San Antonio, TX, USA",1,1.0,0,GRAVITY,Sewer Main,NaT,60.0,2019-03-09 19:00:00,False,,1.15,69.0,0.0,0.0,,"(215, Audrey Alene Drive, San Antonio, Bexar County, Texas, 78216, United States of America, (29.503348142857142, -98.50358214285714))",78216
3,6584,2019-03-09,3602,SE MILITARY DR,83,0.0,2019-03-09 15:37:00,2019-03-09 17:00:00,1.383333,Grease,"Spill ContainedArea Cleaned and DisinfectedFlushed Area with H2O, Unstopped Main,",,SALADO CREEK,92804,92805,EASEMENT,,3.0,3,2019,10,0.0,8.0,213.91,PVC,1983.0,,,"3602 SE MILITARY DR, San Antonio, TX, USA",1,1.0,0,GRAVITY,Sewer Main,NaT,33.0,2019-03-09 16:10:00,False,,0.0,0.0,0.0,0.0,,"(Quality Suites, Southeast Military Drive, Hilltop, San Antonio, Bexar County, Texas, 78223, United States of America, (29.352909500000003, -98.4253423))",78223
4,6580,2019-03-06,100,PANSY LN,75,0.0,2019-03-06 09:40:00,2019-03-06 09:55:00,0.25,Structural,"Spill ContainedArea Cleaned and DisinfectedFlushed Area with H2O, Unstopped Main,",CLEANED MAIN,SALADO CREEK,61141,49543,STREET,,2.0,3,2019,10,0.0,12.0,291.9,CP,1952.0,,,"100 PANSY LN, San Antonio, TX, USA",2,2.0,2018-12-15 00:00:00,GRAVITY,Sewer Main,NaT,0.0,2019-03-06 09:40:00,False,,0.0,0.0,0.0,0.0,,"(Pansy Lane, San Antonio, Bexar County, Texas, 78209, United States of America, (29.488062, -98.435963))",78209


In [11]:
df.zip_code

0      40205
1      S2147
2      78216
3      78223
4      25039
5      78210
6      78245
7      21216
8       3306
9      78201
10     78258
11     78201
12     78261
13     78223
14     95118
15     78242
16     78261
17     78503
18     78242
19     78254
Name: zip_code, dtype: object

In [None]:
if os.path.isfile('sso_dict.csv'):
    df_dict = pd.read_csv('sso_dict.csv')
else:
    url_sso_dict = '''https://storage.googleapis.com/sa_saws_data/SAWS_SSO_DataFieldDescription_MM.xlsx'''
    df_dict = pd.read_excel(url_sso_dict)
    df_dict.to_csv('sso_dict.csv', index=False)

In [None]:
df_dict

In [None]:
# df = acquire_sso.acquire_sso()

In [None]:
# df.head(1)

In [None]:
# df = prepare_sso.prepare_sso_df()
# df.head(1)

In [None]:
# df = prepare.prepare_sso_df()
# print(df.shape[0])
# df.head(1)

In [None]:
# df.columns = ['sso_id','report_date','spill_address_num','spill_st_name',
#  'total_gal','gals_ret','spill_start','spill_stop','hrs','cause',
#  'comments','actions','watershed','unit_id','unit_id2','discharge_to',
#  'discharge_route','council_district','month','year','week',
#  'earz_zone','pipe_diam','pipe_len','pipe_type','inst_year','inches_no',
#  'rainfall_last3','spill_address_full','num_spills_recorded',
#  'num_spills_24mos','prevspill_24mos','unit_type','asset_type',
#  'last_cleaned','response_time','response_dttm','public_notice',
#  'root_cause','hrs_2','gal_2','hrs_3','gal_3','days_since_cleaned']

In [None]:
# df.head(1)

In [None]:
# df.ResponseTime * 60

In [None]:
# df.shape

In [None]:
# df_dict.shape

In [None]:
# df_dict['Data Description'].value_counts()

In [None]:
# unused = ['Disregard','Ignore','Service Req # (internal use only)',
#           'Not Used','Old mapping system reference (internal only)']
# df_dict[df_dict['Data Description'].isin(unused)]

In [None]:
# ready1_dict = df_dict[~df_dict['Data Description'].isin(unused)]
# print(len(ready1_dict))
# ready1_dict

In [None]:
# null_fields = ['SPILL_START_2','SPILL_START_3',
#                'SPILL_STOP_2','SPILL_STOP_3']
# final_dict = ready1_dict[~ready1_dict.Field.isin(null_fields)]\
#                     .reset_index(drop=True)
# len(final_dict)

In [None]:
# final_dict

In [None]:
# features_to_use = list(final_dict.Field)
# len(features_to_use)

In [None]:
# bad_features = list(df.columns[~df.columns.isin(features_to_use)])

In [None]:
# df = df.drop(columns=bad_features)
# df = df.drop(columns = ['TIMEINT','STEPS_TO_PREVENT'])
# df.shape

In [None]:
# string_features = ['SSO_ID','SPILL_ADDRESS','COUNCIL_DISTRICT',]
# for col in string_features:
#     df[col] = df[col].astype(str)
    
# time_features = ['REPORTDATE','SPILL_START','SPILL_STOP',
#                  'ResponseDTTM', 'LASTCLND']
# for col in time_features:
#     df[col] = pd.to_datetime(df[col])
    
# fill_features = ['NUM_SPILLS_24MOS','PREVSPILL_24MOS','HRS_2',
#                 'HRS_3','GAL_2','GAL_3']
# for col in fill_features:
#     df[col] = df[col].fillna(0)
    
# df.Root_Cause = df.Root_Cause.str.strip()

In [None]:
# df['days_since_cleaned'] = (df.SPILL_START - df.LASTCLND).dt.days

In [None]:
# df['country_address'] = df.spill_address_full + \
#                                   ',SAN ANTONIO,Texas,USA'
# df.country_address

In [None]:
# df[['country_address']].head(25)

In [None]:
# for i in range(df.index.max()//5):
#     print([1*i+5, 2*i+5, 3*i + 5])

In [None]:
# lmo = pd.DataFrame()
# lmo['loc'] = ['string','fnish']
# lmo['was'] = [2,3]
# lmo.was[1] = 4
# lmo

In [None]:
# locator = Nominatim(user_agent="myGeocoder")
# geocode = RateLimiter(locator.geocode, min_delay_seconds=.1)
# test2 = pd.Series()

# counter = 10
# for i in range(0, df.index.max(), 10):
#     test = df[['country_address']][i:counter]
#     test['location'] = test['country_address'].apply(geocode)
#     test['zip_code'] = 'None'
#     for t,l in enumerate(test.location):
#         if l is not None:
#             test['zip_code'][counter+t-10] = l.raw['display_name']\
#                                     .split(',')[-2]
#     time.sleep(2)
#     test2 = test2.append(test.zip_code)
#     counter+=10
# test2

### Working

In [None]:
# locator = Nominatim(user_agent="myGeocoder")
# geocode = RateLimiter(locator.geocode, min_delay_seconds=.1, 
#                      max_retries=10, error_wait_seconds=1)
# df['location'] = df['country_address'].apply(geocode)
# df['zip_code'] = 'None'

In [None]:
# for t,l in enumerate(df.location):
#         if l is not None:
#             df['zip_code'][t] = l.raw['display_name']\
#                                     .split(',')[-2]

In [None]:
# df = prepare.prepare_sso_with_zipcodes()
# print(df.shape[0])
# df.head(1)

# Exploration

In [None]:
df = prepare.get_data()
print(df.shape)
df.head(1)

In [None]:
df.root_cause.isna().sum()

In [None]:
df.root_cause.value_counts(dropna=False)

In [None]:
train, test = train_test_split(df, random_state=42, train_size=.8,
                              stratify=df.root_cause)

In [None]:
train.zip_code.value_counts().head(5)

In [None]:
train.isna().sum()

In [None]:
train.spill_address_full.head(15)

In [None]:
train.days_since_cleaned.dropna()

In [None]:
train[train.days_since_cleaned == train.days_since_cleaned.max()]

In [None]:
features = df.isna().sum() / df.shape[0]
bad_list = features[features > .4]
print(len(bad_list))
bad_list

In [None]:
train.drop(columns='hours_spilled')\
    .describe().apply(lambda s: s.apply(lambda x: format(x, 'f')))

In [None]:
len(train.report_date.unique())

In [None]:
train[train.num_spills_recorded == train.num_spills_recorded.max()]

In [None]:
train[train.total_gal.isin(list(train.total_gal.nlargest(8)))]

In [None]:
train.age_binned.dropna().value_counts().sort_index().plot.bar()
plt.xticks(rotation=45)

plt.ylabel('Count of binned age')
plt.xlabel('Binned age')
plt.title('Binned age of sewers')
plt.show()

In [None]:
ax = train.groupby('root_cause').days_since_cleaned.mean()\
    .sort_values(ascending=False).dropna().plot.barh()
ax.set_ylabel('')
ax.set_xlabel('Average days since cleaning')
plt.title('Average days since cleaning by root cause')
plt.show()

In [None]:
ax = train.groupby('root_cause').total_gal.median()\
    .sort_values(ascending=False).dropna().plot.barh()
ax.set_ylabel('')
ax.set_xlabel('Median gallons spilled')
plt.title('Median gallons spilled by root cause')
plt.show()

In [None]:
train['total_gal_binned'].value_counts().sort_index().plot.bar()
plt.xticks(rotation=45)

In [None]:
train[train.days_since_cleaned == train.days_since_cleaned.max()]

In [None]:
train[train.inst_year == train.inst_year.max()].shape

In [None]:
df.zip_code.value_counts()

In [None]:
df.inches_no.isna().sum()

In [None]:
df[df.age=='unknown'].shape

In [None]:
df.age = df.age.replace('unknown', 0)
df.age = df.age.replace(0, df.age.median())

In [None]:
df.age

In [None]:
df.pipe_len.isna().sum()

In [None]:
df.pipe_diam.isna().sum()

In [None]:
df.days_since_cleaned.median()

In [None]:
df[df.zip_code=='78245'].root_cause.value_counts()

In [None]:
root_cause_zipcode = df.groupby('zip_code').root_cause.apply(
        lambda x: x.value_counts().head(1))
# root_cause_zipcode.to_csv('root_cause_zipcode.csv')
root_cause_zipcode.sort_values().tail(40)

In [None]:
train.pipe_type.value_counts()

In [None]:
train.groupby('age_binned').total_gal.median()\
    .sort_values(ascending=False)

In [None]:
train.root_cause.value_counts()

In [None]:
train.head(1)

In [None]:
train[train.root_cause == 'other']['cause'].value_counts()

In [None]:
top_worst_spills = train[train.total_gal.isin(
                            train.total_gal.nlargest(15))]
top_worst_spills[['age_binned','total_gal']]\
    .sort_values('total_gal', ascending=False)

In [None]:
top_bins = top_worst_spills.sort_values(by='total_gal', 
                        ).age_binned.value_counts()
top_bins[top_bins>0]

In [None]:
train.groupby('age_binned').total_gal.mean()\
            .dropna().sort_values().tail(6)

In [None]:
ax = train.groupby('age_binned').total_gal.median()\
            .dropna().sort_values().tail(6).plot.barh()
xlabels = ['{:,.1f}'.format(x) +'K' 
                   for x in ax.get_xticks()/1000]
ax.set_xticklabels(xlabels)
ax.set_xlabel('Median gallons spilled')
ax.set_ylabel('Age binned')
plt.title('Median gallons spilled by binned age')
plt.show()

In [None]:
ax = train.groupby('age_binned').total_gal.mean()\
            .dropna().sort_values().tail(6).plot.barh()
xlabels = ['{:,.0f}'.format(x) +'K' 
                   for x in ax.get_xticks()/1000]
ax.set_xticklabels(xlabels)
ax.set_xlabel('Average gallons spilled')
ax.set_ylabel('Age binned')
plt.title('Average gallons spilled by binned age')
plt.show()

In [None]:
# num_spills_recorded, num_spills_24mos

In [None]:
sns.barplot(train.num_spills_recorded, train.root_cause)
plt.xlabel('Average number of spills')
plt.ylabel('')
plt.title('Average number of spill recorded by root cause of event')
plt.show()

In [None]:
# train.groupby('root_cause').num_spills_24mos.sum()
df.root_cause.value_counts()

In [None]:
sns.swarmplot(train.root_cause, train.num_spills_24mos)
plt.xlabel('Root cause')
plt.ylabel('Number of spills')
plt.title(
'Number of spills in last 24 months by root cause of event')
plt.show()

In [None]:
sns.barplot(train.num_spills_24mos, train.root_cause)
plt.xlabel('Average number of spills')
plt.ylabel('')
plt.title(
'Average number of spills in last 24 months by root cause of event')
plt.show()

In [None]:
df.to_csv('cleaned_df.csv')

## Make data useable for modeling

In [None]:
list(df.columns)

In [None]:
# df.inst_year = df.inst_year[df.inst_year!='unknown']

In [None]:
# df.last_cleaned.value_counts()

In [None]:
# unwanted = ['spill_st_name','comments','actions','discharge_to',
#            'discharge_route','spill_address_full','prevspill_24mos',
#            'last_cleaned','country_address','location']
# categorical_columns = [col for col in list(
#                 df.dtypes[df.dtypes=='O'].index) 
#                  if col not in unwanted]

In [None]:
columns_to_drop_from_model = [
    "sso_id","report_date","spill_address_num","spill_st_name",
    "spill_stop","spill_start","cause","comments","actions",
    "month","year","week","spill_address_full","last_cleaned",
    "response_dttm","prevspill_24mos","public_notice",
    "country_address","location","inches_no","rainfall_last3",
    "unit_id","unit_id2","zip_code","discharge_to","discharge_route",
    "council_district","hours_spilled","hrs","gals_ret",
    "response_time"
]

df2 = df.copy().drop(columns=columns_to_drop_from_model)
print(df2.shape)
df2.head(1)

In [None]:

categorical_columns = [
    "watershed",
    "earz_zone",
    "pipe_type",
    "inst_year",
    "unit_type",
    "asset_type",
    "age_binned",
    "total_gal_binned"
    
]

for column in categorical_columns:
    if is_numeric_dtype(df2[f"{column}"]):
        values = df2[f"{column}"].unique()
        for value in values:
            df2[f"{column}_is_{value}"]=(df2[f"{column}"] == value)\
                    .astype(int)
        df2 = df2.drop(columns=column)
    elif is_string_dtype(df2[f"{column}"]):
        values = df2[f"{column}"].astype(str).str.lower().unique()
        for value in values:
            df2[f"{column}_is_{value}"]=(df2[f"{column}"] == value)\
                .astype(int)
        df2 = df2.drop(columns=column)

In [None]:
df2.head(1)

In [None]:
df2.root_cause.value_counts(dropna=False)

In [None]:
# other_causes = ['vandalism','roots','i/i','by pass pump leak']
# for cause in other_causes:
#     df2.root_cause = df2.root_cause.replace(cause, 'other')
# df2.root_cause.value_counts()

In [None]:
X = df2.drop(columns='root_cause')
y = df2.root_cause

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, train_size=.8, random_state=13, 
                    stratify=y)
X_train, X_val, y_train, y_val = \
    train_test_split(X_train, y_train, train_size=.75,
                     random_state=13, stratify=y_train)

### Decision Tree

In [None]:
params = {'max_depth': range(1,13), 'min_samples_leaf': range(1,7),
          'criterion': ['gini', 'entropy']}
model = DecisionTreeClassifier(random_state=13)
grid = GridSearchCV(model, params, cv=None, iid=True)
grid.fit(X_train, y_train)
print(grid.best_params_)

In [None]:
results = grid.cv_results_
for score, p in zip(results['mean_test_score'], results['params']):
    p['score'] = score
pd.DataFrame(results['params']).sort_values('score').tail(5)

In [None]:
tree = DecisionTreeClassifier(max_depth=4, min_samples_leaf=4,
                              criterion='entropy',random_state=13)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_train)
print(f'Accuracy Score on Train: \
    {tree.score(X_train, y_train) * 100:.2f}%')
print(f'Accuracy Score on Val: \
    {tree.score(X_val, y_val) * 100:.2f}%')

In [None]:
pd.DataFrame(classification_report(y_train, y_pred,
                                   output_dict=True)).T

68% on 742 points

80% on 241 points

88% on 84  points

39% on 367 points

00% on 266 points

In [None]:
print(pd.Series([.68,.80,.88]).mean())
print((742+241+84)/1909)

In [None]:
print(pd.Series([.68,.80,.88,.39]).mean())
print((742+241+84+367)/1909)

In [None]:
print(pd.Series([.68,.80,.88,.39,.00]).mean())
print((742+241+84+367+266)/1909)

In [None]:
print(f'Accuracy Score on Test: \
    {tree.score(X_test, y_test) * 100:.2f}%')

### Logistic Regression

In [None]:
model = LogisticRegression(random_state = 13).fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred_prob = model.predict_proba(X_train)

print(f'Accuracy Score on Train: \
    {model.score(X_train, y_train) * 100:.2f}%')
print(f'Accuracy Score on Val: \
    {model.score(X_val, y_val) * 100:.2f}%')

In [None]:
pd.DataFrame(classification_report(y_train, y_pred,
                                   output_dict=True)).T

### Random Forest

In [None]:
params = {'max_depth': range(1,13), 'min_samples_leaf': range(1,4),}
          #'criterion': ['gini', 'entropy']}
model = RandomForestClassifier(random_state=13)
grid = GridSearchCV(model, params, cv=None, iid=True)
grid.fit(X_train, y_train)
print(grid.best_params_)

In [None]:
results = grid.cv_results_
for score, p in zip(results['mean_test_score'], results['params']):
    p['score'] = score
pd.DataFrame(results['params']).sort_values('score').tail(5)

In [None]:
# forest = RandomForestClassifier(random_state=42, 
#     min_samples_leaf=1, max_depth=10, 
#     criterion='entropy').fit(X_train, y_train)
forest = RandomForestClassifier(random_state=13, min_samples_leaf=2,
                               max_depth=8).fit(X_train, y_train)
y_pred = forest.predict(X_train)
print(f'Accuracy Score on Train: \
    {forest.score(X_train, y_train) * 100:.2f}%')
print(f'Accuracy Score on Val: \
    {forest.score(X_val, y_val) * 100:.2f}%')

In [None]:
pd.DataFrame(classification_report(y_train, y_pred,
                                   output_dict=True)).T

### KNeighbors Classifier

In [None]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = pd.DataFrame(scaler.transform(X_train), 
                      columns=X_train.columns, index=X_train.index)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), 
                        columns=X_val.columns, index=X_val.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), 
                        columns=X_test.columns, index=X_test.index)

In [None]:
params = {'n_neighbors': range(1,13), 'p': range(1,6),}
#           'weights': ['uniform', 'distance']}
model = KNeighborsClassifier()
grid = GridSearchCV(model, params, cv=4, iid=True)
grid.fit(X_train_scaled, y_train)
print(grid.best_params_)

In [None]:
results = grid.cv_results_
for score, p in zip(results['mean_test_score'], results['params']):
    p['score'] = score
pd.DataFrame(results['params']).sort_values('score').tail(5)

In [None]:
knmodel = KNeighborsClassifier(n_neighbors=12, p=1, 
                ).fit(X_train_scaled, y_train)
y_pred = knmodel.predict(X_train_scaled)

print(f'Accuracy Score on Train: \
    {knmodel.score(X_train_scaled, y_train) * 100:.2f}%')
print(f'Accuracy Score on Val: \
    {knmodel.score(X_val_scaled, y_val) * 100:.2f}%')

In [None]:
pd.DataFrame(classification_report(y_train, y_pred,
                                   output_dict=True)).T

68.2% on 742

80.0% on 241

65.7% on 84

40.7% on 367

In [None]:
print(pd.Series([.682,.80,.657,.407]).mean())
print((742+241+84+367)/1909)