In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR

## Task

Build a programm which

1. Reads provided data.

In [35]:
data = pd.read_csv("DR_Demo_Fire_Ins_Loss_only.csv")
X = data.drop(columns='loss')
y = data.loss

2. Does exploratory data analysis.

In [36]:
data.describe()

Unnamed: 0,loss,Exposure,Property_size,Residents,Norm_fire_risk,Norm_monthly_rent,Loan_mortgage,No_claim_Years,Previous_claims,Norm_area_m,...,crime_residents,crime_area,crime_arson,crime_burglary,crime_neighbour_watch,crime_community,crime_risk,Geographical_risk,Weather_risk,ISO
count,1217.0,210.0,1216.0,1212.0,1217.0,1217.0,196.0,1217.0,353.0,1047.0,...,910.0,923.0,916.0,922.0,923.0,903.0,923.0,1211.0,1213.0,1217.0
mean,0.080477,2.809524,3.820724,2.514026,4.040762,1852.694145,3.286935,1.01064,0.320084,34.265521,...,0.958211,0.989317,0.954384,1.044832,0.982719,0.962596,0.992618,1.125001,0.918192,2.889071
std,0.050425,0.726731,1.598589,1.318972,0.232445,1678.089181,0.243295,0.769526,0.50096,25.261731,...,0.403047,0.407156,0.238484,0.523383,0.490376,0.254882,0.358018,1.918566,1.09444,1.598731
min,0.01,1.0,1.0,1.0,3.77648,500.0,2.673639,0.0,0.0,0.0,...,0.360578,0.06552,0.472193,0.154595,0.0,0.522494,0.175763,0.0,0.0,1.0
25%,0.04,2.0,3.0,1.0,3.898949,783.693499,3.144307,0.0,0.0,16.0,...,0.625186,0.687961,0.739102,0.572656,0.686813,0.739831,0.76778,0.091752,0.273559,2.0
50%,0.07,3.0,3.5,2.0,3.898949,1232.882801,3.339025,1.048147,0.0,30.0,...,0.903848,0.924562,0.94269,1.019386,0.989819,0.924495,0.974872,0.392954,0.550059,3.0
75%,0.1,3.0,5.0,4.0,4.171214,2248.569323,3.474623,1.665109,0.832555,47.0,...,1.210431,1.190282,1.130489,1.32265,1.191823,1.11043,1.19302,1.059832,1.204226,4.0
max,0.2,5.0,8.0,6.0,5.123351,13296.24007,3.714656,2.639501,1.893018,208.0,...,2.970605,4.145968,1.926021,2.391149,4.322882,2.043525,2.641966,8.548033,9.098934,6.0


In [37]:
data.isna().sum()

loss                        0
Exposure                 1007
Rating_Class               87
Sub_Rating_Class         1212
Renewal_class            1093
Sub_Renewal_Class        1114
Property_size               1
Residents                   5
Commercial                219
Norm_fire_risk              0
Norm_monthly_rent           0
Loan_mortgage            1021
No_claim_Years              0
Previous_claims           864
Norm_area_m               170
Premium_remain           1033
Premium_renew               0
Renewal_Type                0
crime_property_type       294
crime_residents           307
crime_area                294
crime_arson               301
crime_burglary            295
crime_neighbour_watch     294
crime_community           314
crime_risk                294
Geographical_risk           6
Weather_risk                4
ISO                         0
ISO_cat                     0
ISO_desc                    0
dtype: int64

In [38]:
data.dtypes

loss                     float64
Exposure                 float64
Rating_Class              object
Sub_Rating_Class          object
Renewal_class             object
Sub_Renewal_Class         object
Property_size            float64
Residents                float64
Commercial                object
Norm_fire_risk           float64
Norm_monthly_rent        float64
Loan_mortgage            float64
No_claim_Years           float64
Previous_claims          float64
Norm_area_m              float64
Premium_remain           float64
Premium_renew            float64
Renewal_Type              object
crime_property_type      float64
crime_residents          float64
crime_area               float64
crime_arson              float64
crime_burglary           float64
crime_neighbour_watch    float64
crime_community          float64
crime_risk               float64
Geographical_risk        float64
Weather_risk             float64
ISO                        int64
ISO_cat                   object
ISO_desc  

3. Does feature enginering.

In [39]:
data = data.drop(columns=data.columns[data.isna().sum() > len(data)/1.5])

In [40]:
numeric_features = list(data.select_dtypes(["float", 'int']).drop(columns='loss'))
categorical_features = list(data.select_dtypes(["object"]).drop(columns='ISO_desc'))
text_features = ['ISO_desc']

4. Does CV partitioning (keep 10-20% for holdout).

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['loss']), data.loss, test_size=0.2, random_state=42)

y_train = y_train.values
y_test = y_test.values

In [42]:
print('train data size: ',len(X_train))
print('test data size: ',len(X_test))

train data size:  973
test data size:  244


5. Builds a pipeline which can train model.

In [43]:
class StackingTransformer(BaseEstimator, TransformerMixin):
    """A transformer that wraps a RegressorMixin `est` predict function.
    
    It overloads `fit_transform` to do a stacked transform to avoid leakage.
    """
    def __init__(self, est):
        self.est = est
        
    def fit(self, X, y=None):
        if y is None:
            raise ValueError('{}.fit requires y to be not None'.format(self))
        self.est.fit(X, y)
        return self
    
    def transform(self, X):
        return self.est.predict(X)[:, np.newaxis]
    
    def fit_transform(self, X, y=None):
        if y is None:
            raise ValueError('{}.fit requires y to be not None'.format(self))
        self.models_ = []
        out = np.empty_like(y)[:, np.newaxis]
        for train, test in KFold(5, shuffle=True, random_state=0).split(X):
            self.models_.append(clone(self.est).fit(X[train], y[train]))
            out[test, 0] = self.models_[-1].predict(X[test])
        self.est = self.models_[0]
        return out


numeric_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer()),
#     ('scaler', StandardScaler())
    ])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
#     ('fx_selection', SelectFromModel(ElasticNet(alpha=0.1), max_features=40, threshold=None))
])

text_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('vect_reg', StackingTransformer(ElasticNet(alpha=0.1))),
    ('tfidf', TfidfTransformer())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
        ('text', text_pipeline, text_features[0])
    ])

models = {
    'LR' : LinearRegression(),
    'RF' : RandomForestRegressor(),
    'Ridge' : Ridge(),
    'SVM_rbf' : SVR(kernel='rbf'),
}

pipelines = {}

kf = KFold(n_splits=4, shuffle=True)

for name, model in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      (name, model)])
    pipe.fit(X_train, y_train)
    pipelines[name] = pipe
#     print("%s model cv NMSE score: %.3f" % 
#           (name, cross_val_score(pipe, X_test, y_test, scoring = 'neg_mean_absolute_error', cv=kf).mean()))
    print("%s model MSE score: %.3f" % (name, mean_absolute_error(y_test, pipe.predict(X_test))))

LR model MSE score: 0.041
RF model MSE score: 0.036
Ridge model MSE score: 0.041
SVM_rbf model MSE score: 0.052


6. Optimizes hyperparameters of models. Use GridSearch, RandomSearch or Bayesian optimization.

In [44]:
import sys
from sklearn.model_selection import RandomizedSearchCV

parameters = {
    'RF' : {
            'RF__n_estimators': np.arange(10, 100, 5).astype(int),
            'RF__max_depth': np.arange(5, 50, 5).astype(int),
            'RF__max_features': ['auto', 'sqrt'], 
    },
    'Ridge' : {'Ridge__alpha': np.arange(0.1, 1.1, 0.1)},
    'SVM_rbf': {'SVM_rbf__gamma': ['scale', 'auto'],
               'SVM_rbf__C': np.arange(0.1, 1.1, 0.1),
                'SVM_rbf__epsilon': np.arange(0.01, 0.2, 0.01),},
}

result_pipelines = {}

for name, pipe in pipelines.items():
    if name in parameters.keys():
        rs = RandomizedSearchCV(pipe, parameters[name], cv=kf, n_iter=150, 
                                verbose=1, n_jobs=-1)
        rs.fit(X_train, y_train)
        result_pipelines[name] = pipe.set_params(**rs.best_params_)
        print(f"{name}: best parameters: {rs.best_params_}")
    else:
        result_pipelines[name] = pipe

Fitting 4 folds for each of 150 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   30.4s finished


RF: best parameters: {'RF__n_estimators': 85, 'RF__max_features': 'auto', 'RF__max_depth': 5}
Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.4s finished


Ridge: best parameters: {'Ridge__alpha': 1.0}
Fitting 4 folds for each of 150 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   11.7s


SVM_rbf: best parameters: {'SVM_rbf__gamma': 'scale', 'SVM_rbf__epsilon': 0.05, 'SVM_rbf__C': 0.30000000000000004}


[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   15.9s finished


7. Reports scores of models.

In [47]:
for name, model in result_pipelines.items():
    print("%s model MSE score: %.3f" % (name, mean_absolute_error(y_test, model.predict(X_test))))

LR model MSE score: 0.041
RF model MSE score: 0.036
Ridge model MSE score: 0.041
SVM_rbf model MSE score: 0.052
