In [1]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class CustomNumberTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X+100

In [3]:
class CustomCategoricalTransformer(BaseEstimator, TransformerMixin ):

    def __init__(self):
        return None

    def fit( self, X, y=None):
        return self

    def modify(self, obj):
        return obj[0]

    def transform(self, X, y=None ):
        for col in X.columns:
            X.loc[:,col] = X.loc[:,col].apply(self.modify)

        return X.values 

In [4]:
df = pd.read_csv('bad-drivers.csv')
print(df.shape)
df.head()

(51, 8)


Unnamed: 0,State,Number of drivers involved in fatal collisions per billion miles,Percentage Of Drivers Involved In Fatal Collisions Who Were Speeding,Percentage Of Drivers Involved In Fatal Collisions Who Were Alcohol-Impaired,Percentage Of Drivers Involved In Fatal Collisions Who Were Not Distracted,Percentage Of Drivers Involved In Fatal Collisions Who Had Not Been Involved In Any Previous Accidents,Car Insurance Premiums ($),Losses incurred by insurance companies for collisions per insured driver ($)
0,Alabama,18.8,39,30,96,80,784.55,145.08
1,Alaska,18.1,41,25,90,94,1053.48,133.93
2,Arizona,18.6,35,28,84,96,899.47,110.35
3,Arkansas,22.4,18,26,94,95,827.34,142.39
4,California,12.0,35,28,91,89,878.41,165.63


In [5]:
X = df.drop(['Car Insurance Premiums ($)'], axis=1)
y = df['Car Insurance Premiums ($)']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('custom', CustomNumberTransformer())])

In [8]:
categorical_transformer = Pipeline(steps=[
    ('custom', CustomCategoricalTransformer()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [9]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_features

Index(['Number of drivers involved in fatal collisions per billion miles',
       'Percentage Of Drivers Involved In Fatal Collisions Who Were Speeding',
       'Percentage Of Drivers Involved In Fatal Collisions Who Were Alcohol-Impaired',
       'Percentage Of Drivers Involved In Fatal Collisions Who Were Not Distracted',
       'Percentage Of Drivers Involved In Fatal Collisions Who Had Not Been Involved In Any Previous Accidents',
       'Losses incurred by insurance companies for collisions per insured driver ($)'],
      dtype='object')

In [10]:
categorical_features = X.select_dtypes(include=['object']).columns
categorical_features

Index(['State'], dtype='object')

In [11]:
q = numeric_transformer.fit_transform(X_train[numeric_features])
q

array([[100.80060642, 100.00258856, 100.51222861, 100.79862454,
         99.56240503,  98.44162943],
       [ 99.7631035 ,  98.13882279, 100.91795424, 100.4966871 ,
        100.14586499,  99.1584751 ],
       [ 99.57865853,  99.69196094, 100.71509142, 100.85901203,
         98.97894506,  99.90527254],
       [101.83810935,  99.17424822, 102.33799395, 100.85901203,
         99.56240503,  98.95480307],
       [100.06282657, 100.72738636,  99.90364016, 100.13436216,
        101.45864991, 100.62251756],
       [ 99.67088102,  99.38133331,  99.70077735, 100.61746208,
        100.87518995,  98.92285452],
       [ 99.25587985,  98.76007805,  99.29505172, 100.13436216,
         99.85413501, 100.71197351],
       [101.05421825, 100.41675873, 100.51222861,  99.28893732,
        101.31278492, 102.35173302],
       [101.81505373, 100.31321619,  99.49791453, 100.73823705,
         99.70827002, 100.66564811],
       [100.82366204,  98.96716314,  99.70077735,  99.83242472,
         98.83308007, 100.7

In [12]:
q = categorical_transformer.fit_transform(X_train[categorical_features])
q.toarray()[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

In [13]:
preprocessor = ColumnTransformer(transformers=[
                                                ('num', numeric_transformer, numeric_features),
                                                ('cat', categorical_transformer, categorical_features)
                                              ]
                                )
q = preprocessor.fit_transform(X_train)
q[0]

array([100.80060642, 100.00258856, 100.51222861, 100.79862454,
        99.56240503,  98.44162943,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   0.        ,
         0.        ,   0.        ,   0.        ])

In [14]:
rf = Pipeline(steps=[
                     ('preprocessor', preprocessor),
                     ('classifier', RandomForestRegressor())
                    ])

In [15]:
rf.fit(X_train, y_train);



In [16]:
y_pred = rf.predict(X_test)
y_pred

array([ 786.553,  798.082,  973.727,  954.121,  777.589,  875.076,
        744.781, 1128.363,  703.579, 1187.351,  748.905])

In [17]:
y_test.values

array([ 746.54,  861.18,  913.15, 1110.61,  899.47, 1029.87,  716.2 ,
       1234.31,  816.21,  878.41,  708.24])

In [18]:
param_grid = { 
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8]}

In [19]:
CV = GridSearchCV(rf, param_grid, n_jobs= -2)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)    
print(CV.best_score_)



{'classifier__max_depth': 8, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 200}
0.25939753607766847




# -------------------- TESTING -----------------------

In [20]:
df = pd.DataFrame({'col_1': [1,2,3,4,5], 'col_2': ['a B', 'c D', 'e F', 'g H', 'i J'], 'col_3': [10,11,12,13,14]})
df

Unnamed: 0,col_1,col_2,col_3
0,1,a B,10
1,2,c D,11
2,3,e F,12
3,4,g H,13
4,5,i J,14
