In [1]:
# !pip install numpy matplotlib pandas sklearn

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

pd.set_option('display.max_columns', None)

## 1. Reads provided data

In [3]:
raw_data = pd.read_csv('https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Fire_Ins_Loss_only.csv')
raw_data.shape

(1217, 31)

## 2. Does exploratory data analysis

In [4]:
numeric_data = raw_data._get_numeric_data()
numeric_data.shape

(1217, 23)

In [5]:
numeric_data.head()

Unnamed: 0,loss,Exposure,Property_size,Residents,Norm_fire_risk,Norm_monthly_rent,Loan_mortgage,No_claim_Years,Previous_claims,Norm_area_m,Premium_remain,Premium_renew,crime_property_type,crime_residents,crime_area,crime_arson,crime_burglary,crime_neighbour_watch,crime_community,crime_risk,Geographical_risk,Weather_risk,ISO
0,0.07,3.0,3.0,4.0,4.314082,758.287544,2.673639,2.251423,0.0,10.0,,0.0,1.209807,1.064398,1.066522,0.939346,1.708534,0.808015,1.040819,1.171366,0.284447,0.313157,5
1,0.13,,6.0,1.0,4.171214,1060.660172,2.709158,2.08418,,,,86.666667,,,,,,,,,0.006305,0.889677,6
2,0.02,,4.0,1.0,4.171214,1060.660172,2.709158,2.08418,,8.0,,86.666667,,,,,,,,,0.019372,0.276507,1
3,0.13,,2.0,1.0,4.171214,1060.660172,2.709158,2.08418,,12.0,,86.666667,2.304699,1.912726,1.463283,1.632539,1.205554,1.999838,1.665336,1.779279,0.085413,0.124679,1
4,0.04,3.0,4.0,3.0,4.314082,716.709146,2.711775,2.408759,0.0,,100.0,0.0,0.669145,0.430252,0.345801,0.638615,0.567431,0.751454,0.585363,0.729587,5.57083,0.703054,6


All the columns make sense in data analysis (there are no such fields like ID).

In [6]:
categorical_data = raw_data.select_dtypes(include=["object"])
categorical_data.head()

Unnamed: 0,Rating_Class,Sub_Rating_Class,Renewal_class,Sub_Renewal_Class,Commercial,Renewal_Type,ISO_cat,ISO_desc
0,H1,,,,,B,ISO 5 - Modified or Semi Fire Resistive (MFR o...,which and floors. greater roof _—– modified an...
1,M1,,,,,B,ISO 6 - Fire Resistive (FR),bearing are walls deck/cover precast and/or co...
2,M1,,,,,B,ISO 1 Frame (combustible walls or roof),*BUR 1B and covers the *Single-ply wood/hardip...
3,M1,,,,,B,ISO 1 Frame (combustible walls or roof),"frame anchorage are wood. *Less Habitational, ..."
4,H1,,,,,B,ISO 6 - Fire Resistive (FR),is roof deck structural on roof - poured occup...


In [7]:
categorical_data.head()

Unnamed: 0,Rating_Class,Sub_Rating_Class,Renewal_class,Sub_Renewal_Class,Commercial,Renewal_Type,ISO_cat,ISO_desc
0,H1,,,,,B,ISO 5 - Modified or Semi Fire Resistive (MFR o...,which and floors. greater roof _—– modified an...
1,M1,,,,,B,ISO 6 - Fire Resistive (FR),bearing are walls deck/cover precast and/or co...
2,M1,,,,,B,ISO 1 Frame (combustible walls or roof),*BUR 1B and covers the *Single-ply wood/hardip...
3,M1,,,,,B,ISO 1 Frame (combustible walls or roof),"frame anchorage are wood. *Less Habitational, ..."
4,H1,,,,,B,ISO 6 - Fire Resistive (FR),is roof deck structural on roof - poured occup...


We can see that ISO_desc is not a categorical value.

In [8]:
text_data = categorical_data['ISO_desc']
categorical_data = categorical_data.drop('ISO_desc', axis=1)
text_data.shape

(1217,)

In [9]:
text_data.head()

0    which and floors. greater roof _—– modified an...
1    bearing are walls deck/cover precast and/or co...
2    *BUR 1B and covers the *Single-ply wood/hardip...
3    frame anchorage are wood. *Less Habitational, ...
4    is roof deck structural on roof - poured occup...
Name: ISO_desc, dtype: object

## 4. Does CV partitioning (keep 10-20% for holdout)

In [10]:
loss = numeric_data['loss']

features = raw_data.drop(['loss', 'ISO_desc'], axis=1)
numeric_data = numeric_data.drop('loss', axis=1)

In [11]:
X_train, X_test, loss_train, loss_test = train_test_split(features, loss, test_size=0.1, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(loss_train.shape)
print(loss_test.shape)

(1095, 29)
(122, 29)
(1095,)
(122,)


## 5. Builds a pipeline which can train model

In [12]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [13]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_data.columns),
        ('cat', categorical_transformer, categorical_data.columns)
    ])

In [15]:
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestRegressor())])

In [16]:
rf.fit(X_train, loss_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Exposure', 'Property_size', 'Residents', 'Norm_fire_risk',
       'Norm_monthly_rent', 'Loan_mortgage', 'No_claim_Years',
       'Previous_claims', 'Norm_area_m', 'Premium_remain', 'Premium_renew',
       'crime_property_type', 'c...
       'crime_risk', 'Geographical_risk', 'Weather_risk', 'ISO'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                 

## 5. Optimizes hyperparameters of models

In [17]:
tuned_parameters = {
    'cls__n_estimators': [50, 100, 300, 500], 
    'cls__max_depth': [None, 1, 3, 5], 
    'cls__min_samples_split': [1, 2, 3]}

lr2 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('cls', RandomForestRegressor())])

clf = GridSearchCV(lr2, tuned_parameters, n_jobs=-1, verbose=10)
clf.fit(X_train, loss_train)
print(clf.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 234 out of 240 | elapsed:   32.6s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 240 out 

{'cls__max_depth': 3, 'cls__min_samples_split': 3, 'cls__n_estimators': 300}


## 6. Reports scores of models.

In [19]:
print("Initial score: ", mean_squared_error(rf.predict(X_test), loss_test))
print("Tuned score: ", mean_squared_error(clf.predict(X_test), loss_test))

Initial score:  0.0018996459016393445
Tuned score:  0.0018566111220825593
