In [21]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import ssl
ssl._create_default_https_context= ssl._create_unverified_context

In [2]:
chicago = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Current_Employee_Names__Salaries__and_Position_Titles.csv')

In [3]:
chicago.head()
chicago.shape

(32658, 8)

In [4]:
salary_df= chicago.loc[chicago['Salary or Hourly']=='Salary']
salary_df

Unnamed: 0,Name,Job Titles,Department,Full or Part-Time,Salary or Hourly,Typical Hours,Annual Salary,Hourly Rate
0,"ALLISON, PAUL W",LIEUTENANT,FIRE,F,Salary,,$107790.00,
1,"BRUNO, KEVIN D",SERGEANT,POLICE,F,Salary,,$104628.00,
2,"COOPER, JOHN E",LIEUTENANT-EMT,FIRE,F,Salary,,$114324.00,
3,"CRESPO, VILMA I",STAFF ASST,LAW,F,Salary,,$76932.00,
4,"DOLAN, ROBERT J",SERGEANT,POLICE,F,Salary,,$111474.00,
...,...,...,...,...,...,...,...,...
32653,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,F,Salary,,$90024.00,
32654,"ZYLINSKA, KATARZYNA",POLICE OFFICER,POLICE,F,Salary,,$48078.00,
32655,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,F,Salary,,$87006.00,
32656,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,F,Salary,,$93354.00,


In [5]:
job_titles= salary_df['Job Titles'].nunique()
department_titles= salary_df['Department'].nunique()

salary_df['Job Titles'].value_counts().head(30)

POLICE OFFICER                                      9393
FIREFIGHTER-EMT                                     1424
SERGEANT                                            1118
POLICE OFFICER (ASSIGNED AS DETECTIVE)               845
FIREFIGHTER                                          564
LIEUTENANT-EMT                                       398
LIEUTENANT                                           356
FIREFIGHTER-EMT (RECRUIT)                            319
PARAMEDIC I/C                                        291
FIREFIGHTER/PARAMEDIC                                278
PARAMEDIC                                            252
AVIATION SECURITY OFFICER                            251
POLICE COMMUNICATIONS OPERATOR I                     245
POLICE COMMUNICATIONS OPERATOR II                    227
FIRE ENGINEER-EMT                                    226
DETENTION AIDE                                       226
ASST CORPORATION COUNSEL                             136
SENIOR DATA ENTRY OPERATOR     

In [6]:
def one_hot_encode(X, encode_cols, index=None):
    other_cols = [c for c in X.columns if c not in encode_cols]

    ct = ColumnTransformer(
        #   Format
        #   [("name of step", what_to_do(), [what columns to do it to])]
        [("one hot encode", OneHotEncoder(drop="first", sparse=False), encode_cols)],
        remainder="passthrough",
    )

    ct.fit(X)

    # This is not the flexible and definitely not the most
    # readable way to get column names, a function would be better
    encoded_names = ct.transformers_[0][1].get_feature_names()
    encoded_names = list(encoded_names)

    X_encoded = ct.transform(X)
    X_encoded = pd.DataFrame(X_encoded, columns=encoded_names + other_cols, index=index)

    return X_encoded

In [7]:
salary_df= one_hot_encode(salary_df, encode_cols= ['Department'], index= salary_df.index)

In [8]:
salary_df.columns

Index(['x0_ANIMAL CONTRL', 'x0_AVIATION', 'x0_BOARD OF ELECTION',
       'x0_BOARD OF ETHICS', 'x0_BUDGET & MGMT', 'x0_BUILDINGS',
       'x0_BUSINESS AFFAIRS', 'x0_CITY CLERK', 'x0_CITY COUNCIL',
       'x0_COMMUNITY DEVELOPMENT', 'x0_COPA', 'x0_CULTURAL AFFAIRS',
       'x0_DISABILITIES', 'x0_DoIT', 'x0_FAMILY & SUPPORT', 'x0_FINANCE',
       'x0_FIRE', 'x0_GENERAL SERVICES', 'x0_HEALTH', 'x0_HUMAN RELATIONS',
       'x0_HUMAN RESOURCES', 'x0_INSPECTOR GEN', 'x0_IPRA', 'x0_LAW',
       'x0_LICENSE APPL COMM', 'x0_MAYOR'S OFFICE', 'x0_OEMC', 'x0_POLICE',
       'x0_POLICE BOARD', 'x0_PROCUREMENT', 'x0_PUBLIC LIBRARY',
       'x0_STREETS & SAN', 'x0_TRANSPORTN', 'x0_TREASURER', 'x0_WATER MGMNT',
       'Name', 'Job Titles', 'Full or Part-Time', 'Salary or Hourly',
       'Typical Hours', 'Annual Salary', 'Hourly Rate'],
      dtype='object')

In [9]:
salary_df= salary_df.drop(columns=['Name', 'Job Titles', 'Full or Part-Time', 'Salary or Hourly',
       'Typical Hours', 'Hourly Rate'])

In [10]:
salary_df.columns

Index(['x0_ANIMAL CONTRL', 'x0_AVIATION', 'x0_BOARD OF ELECTION',
       'x0_BOARD OF ETHICS', 'x0_BUDGET & MGMT', 'x0_BUILDINGS',
       'x0_BUSINESS AFFAIRS', 'x0_CITY CLERK', 'x0_CITY COUNCIL',
       'x0_COMMUNITY DEVELOPMENT', 'x0_COPA', 'x0_CULTURAL AFFAIRS',
       'x0_DISABILITIES', 'x0_DoIT', 'x0_FAMILY & SUPPORT', 'x0_FINANCE',
       'x0_FIRE', 'x0_GENERAL SERVICES', 'x0_HEALTH', 'x0_HUMAN RELATIONS',
       'x0_HUMAN RESOURCES', 'x0_INSPECTOR GEN', 'x0_IPRA', 'x0_LAW',
       'x0_LICENSE APPL COMM', 'x0_MAYOR'S OFFICE', 'x0_OEMC', 'x0_POLICE',
       'x0_POLICE BOARD', 'x0_PROCUREMENT', 'x0_PUBLIC LIBRARY',
       'x0_STREETS & SAN', 'x0_TRANSPORTN', 'x0_TREASURER', 'x0_WATER MGMNT',
       'Annual Salary'],
      dtype='object')

In [11]:
salary_df['Annual Salary']= salary_df['Annual Salary'].str.strip('$')

In [12]:
for col in salary_df.columns:
    salary_df[col]= pd.to_numeric(salary_df[col])

In [13]:
salary_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24775 entries, 0 to 32657
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   x0_ANIMAL CONTRL          24775 non-null  float64
 1   x0_AVIATION               24775 non-null  float64
 2   x0_BOARD OF ELECTION      24775 non-null  float64
 3   x0_BOARD OF ETHICS        24775 non-null  float64
 4   x0_BUDGET & MGMT          24775 non-null  float64
 5   x0_BUILDINGS              24775 non-null  float64
 6   x0_BUSINESS AFFAIRS       24775 non-null  float64
 7   x0_CITY CLERK             24775 non-null  float64
 8   x0_CITY COUNCIL           24775 non-null  float64
 9   x0_COMMUNITY DEVELOPMENT  24775 non-null  float64
 10  x0_COPA                   24775 non-null  float64
 11  x0_CULTURAL AFFAIRS       24775 non-null  float64
 12  x0_DISABILITIES           24775 non-null  float64
 13  x0_DoIT                   24775 non-null  float64
 14  x0_FAM

In [14]:
X= salary_df.drop(columns=['Annual Salary'])
y= salary_df['Annual Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [17]:
grid= {'max_depth': [20,30,40,50],'max_leaf_nodes': [30,40,50,60],'min_samples_leaf':[1,3,5]}
model= GridSearchCV(DecisionTreeRegressor(), grid, verbose=1)
model.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:    7.6s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [20, 30, 40, 50],
                         'max_leaf_nodes': [30, 40, 50, 60],
                         'min_sampl

In [18]:
model.best_estimator_.min_samples_leaf

1

In [19]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"train_score: {train_score}")
print(f"test_score: {test_score}")

train_score: 0.14226037540462222
test_score: 0.15621225953167028


In [28]:
grid = {
    'n_estimators': [150,500,1000],
    'max_depth': [50,100,150], 
    'min_samples_leaf': [.01,.1,1],
}

model = GridSearchCV(RandomForestRegressor(), grid, verbose=1, cv=2)
model.fit(X_train, y_train)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  3.7min finished


GridSearchCV(cv=2, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [29]:
model.best_params_

{'max_depth': 100, 'min_samples_leaf': 1, 'n_estimators': 1000}

In [30]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"train_score: {train_score}")
print(f"test_score: {test_score}")

train_score: 0.14233249465346576
test_score: 0.1564883871611945


In [25]:
dir(model)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score