In [1]:
import pandas as pd
import scipy.stats as st
import numpy as np

In [2]:
categorical_df = pd.read_csv('categorical.csv')
categorical_df.head()

Unnamed: 0,state,response,coverage,education,employment_status,gender,location_code,marital_status,renew_offer_type,sales_channel,vehicle_class,vehicle_size,policy_category
0,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Offer1,Agent,Two-Door Car,Medsize,Corporate
1,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Offer3,Agent,Four-Door Car,Medsize,Personal
2,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Offer1,Agent,Two-Door Car,Medsize,Personal
3,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Offer1,Call Center,SUV,Medsize,Corporate
4,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Offer1,Agent,Four-Door Car,Medsize,Personal


In [6]:
numerical_df = pd.read_csv('numerical.csv')
numerical_df.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,month_year,day_year,year_year
0,2763.519279,56274,69,32,5,0,1,384.811147,2,24,2011
1,6979.535903,0,94,13,42,0,8,1131.464935,1,31,2011
2,12887.43165,48767,108,18,38,0,2,566.472247,2,19,2011
3,7645.861827,0,106,18,65,0,7,529.881344,1,20,2011
4,2813.692575,43836,73,12,44,0,1,138.130879,2,3,2011


In [8]:
data = pd.concat([categorical_df,numerical_df],axis=1)
data.head()

Unnamed: 0,state,response,coverage,education,employment_status,gender,location_code,marital_status,renew_offer_type,sales_channel,...,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,month_year,day_year,year_year
0,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Offer1,Agent,...,56274,69,32,5,0,1,384.811147,2,24,2011
1,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Offer3,Agent,...,0,94,13,42,0,8,1131.464935,1,31,2011
2,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Offer1,Agent,...,48767,108,18,38,0,2,566.472247,2,19,2011
3,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Offer1,Call Center,...,0,106,18,65,0,7,529.881344,1,20,2011
4,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Offer1,Agent,...,43836,73,12,44,0,1,138.130879,2,3,2011


In [12]:
data.shape

(9134, 24)

# Doing the X and Y split

In [9]:
X = data.drop(['total_claim_amount'],axis=1)
y = data['total_claim_amount']

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=100)

In [11]:
X_train.shape

(4567, 23)

In [13]:
numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)
numericals_train.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month_year,day_year,year_year
4226,5970.816921,25125,81,12,91,1,3,2,5,2011
6100,4799.621411,0,65,11,25,0,4,1,24,2011
3414,3539.97159,0,99,13,87,0,1,1,10,2011
1540,6871.966648,0,196,5,59,2,1,2,25,2011
1367,8024.992989,38702,101,7,32,3,3,2,12,2011


In [14]:
y_train.head()

4226     583.200000
6100     312.000000
3414    1158.858027
1540     940.800000
1367     253.364333
Name: total_claim_amount, dtype: float64

# Using the Standard scaler-

In [15]:
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

In [16]:
pd.DataFrame(numericals_train_standardized).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.295306,-0.40704,-0.356405,-0.312232,1.561741,0.699822,-0.000818,1.087694,-1.212347,0.0
1,-0.468797,-1.234539,-0.813311,-0.411977,-0.813518,-0.419648,0.414047,-0.919376,1.000505,0.0
2,-0.655391,-1.234539,0.157615,-0.212486,1.417786,-0.419648,-0.830546,-0.919376,-0.630018,0.0
3,-0.161817,-1.234539,2.927611,-1.01045,0.4101,1.819292,-0.830546,1.087694,1.116971,0.0
4,0.008983,0.040122,0.214728,-0.810959,-0.561597,2.938761,-0.000818,1.087694,-0.397086,0.0


In [17]:
categoricals_train= X_train.select_dtypes(object)
categoricals_test= X_test.select_dtypes(object)
categoricals_train.head()

Unnamed: 0,state,response,coverage,education,employment_status,gender,location_code,marital_status,renew_offer_type,sales_channel,vehicle_class,vehicle_size,policy_category
4226,California,No,Extended,College,Medical Leave,F,Suburban,Single,Offer1,Agent,Four-Door Car,Medsize,Corporate
6100,California,No,Basic,Bachelor,Unemployed,F,Suburban,Married,Offer1,Agent,Two-Door Car,Small,Personal
3414,Oregon,No,Extended,High School or Below,Unemployed,F,Suburban,Single,Offer1,Web,Four-Door Car,Medsize,Personal
1540,California,No,Premium,Bachelor,Unemployed,F,Suburban,Married,Offer1,Agent,Sports Car,Medsize,Personal
1367,California,No,Basic,Master,Employed,F,Rural,Married,Offer1,Web,SUV,Large,Personal


In [21]:
data['sales_channel'].value_counts()

Agent          3477
Branch         2567
Call Center    1765
Web            1325
Name: sales_channel, dtype: int64

# Using one-hot encoding for the selected columns-

In [48]:
from sklearn.preprocessing import OneHotEncoder

categoricals_train_onehot = categoricals_train[['state', 'response', 'marital_status', 'gender', 'policy_category', 'renew_offer_type', 'sales_channel', 'vehicle_class']]
categoricals_test_onehot = categoricals_test[['state', 'response', 'marital_status', 'gender', 'policy_category', 'renew_offer_type', 'sales_channel', 'vehicle_class']]
                                           
                                             
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train_onehot)
categoricals_train_onehot_encoded = encoder.transform(categoricals_train_onehot).toarray()
categoricals_test_onehot_encoded = encoder.transform(categoricals_test_onehot).toarray()
pd.DataFrame(categoricals_train_onehot_encoded)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4562,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4563,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4564,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4565,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [49]:
categoricals_test_onehot_encoded

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Using ordinal encoding for the other selected columns

In [46]:
from sklearn.preprocessing import OrdinalEncoder     # props to Jessie for this code!!


categoricals_train_ordinal = categoricals_train[['coverage', 'employment_status', 'location_code', 'vehicle_size', 'sales_channel', 'education']]
categoricals_test_ordinal = categoricals_test[['coverage', 'employment_status', 'location_code', 'vehicle_size', 'sales_channel', 'education']]
enc = OrdinalEncoder()
encoder = enc.fit(categoricals_train_ordinal)
categoricals_train_ord_encoded= enc.transform(categoricals_train_ordinal)
categoricals_test_ord_encoded = enc.transform(categoricals_test_ordinal)
pd.DataFrame(categoricals_train_ord_encoded)

Unnamed: 0,0,1,2,3,4,5
0,1.0,2.0,1.0,1.0,0.0,1.0
1,0.0,4.0,1.0,2.0,0.0,0.0
2,1.0,4.0,1.0,1.0,3.0,3.0
3,2.0,4.0,1.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,3.0,4.0
...,...,...,...,...,...,...
4562,2.0,0.0,0.0,1.0,0.0,1.0
4563,0.0,1.0,0.0,2.0,1.0,0.0
4564,0.0,0.0,1.0,2.0,3.0,2.0
4565,0.0,2.0,1.0,0.0,2.0,0.0


In [47]:
categoricals_test_ord_encoded

array([[0., 1., 1., 1., 3., 0.],
       [1., 1., 0., 1., 2., 3.],
       [0., 1., 0., 0., 3., 3.],
       ...,
       [0., 1., 1., 1., 0., 3.],
       [0., 1., 1., 1., 1., 0.],
       [0., 4., 1., 2., 2., 3.]])

# Coverting to dataframes before concating

In [41]:
categoricals_train_onehot_encoded_df = pd.DataFrame(categoricals_train_onehot_encoded)
categoricals_train_ord_encoded_df = pd.DataFrame(categoricals_train_ord_encoded)
categoricals_train_encoded = pd.concat([categoricals_train_onehot_encoded_df, categoricals_train_ord_encoded_df], axis=1)


# Concating both the encoded to a single dataframe

In [51]:
categoricals_test_onehot_encoded_df = pd.DataFrame(categoricals_test_onehot_encoded)
categoricals_test_ord_encoded_df = pd.DataFrame(categoricals_test_ord_encoded)

categoricals_test_encoded = pd.concat([categoricals_test_onehot_encoded_df,categoricals_test_ord_encoded_df], axis=1)

In [42]:
categoricals_train_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,0.1,1.1,2.1,3.1,4.1,5.1
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,4.0,1.0,2.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,4.0,1.0,1.0,3.0,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,2.0,4.0,1.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4562,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0
4563,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0
4564,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,3.0,2.0
4565,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,2.0,0.0


In [53]:
categoricals_test_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,0.1,1.1,2.1,3.1,4.1,5.1
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3.0,1.0,1.0,3.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4562,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0
4563,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4564,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0
4565,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0


# Defining the X_train and X_test

In [52]:
X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)

In [54]:
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

In [55]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-0.295306,-0.407040,-0.356405,-0.312232,1.561741,0.699822,-0.000818,1.087694,-1.212347,0.0,...,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,1.0
1,-0.468797,-1.234539,-0.813311,-0.411977,-0.813518,-0.419648,0.414047,-0.919376,1.000505,0.0,...,0.0,0.0,0.0,1.0,0.0,4.0,1.0,2.0,0.0,0.0
2,-0.655391,-1.234539,0.157615,-0.212486,1.417786,-0.419648,-0.830546,-0.919376,-0.630018,0.0,...,0.0,0.0,0.0,0.0,1.0,4.0,1.0,1.0,3.0,3.0
3,-0.161817,-1.234539,2.927611,-1.010450,0.410100,1.819292,-0.830546,1.087694,1.116971,0.0,...,0.0,0.0,1.0,0.0,2.0,4.0,1.0,1.0,0.0,0.0
4,0.008983,0.040122,0.214728,-0.810959,-0.561597,2.938761,-0.000818,1.087694,-0.397086,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4562,-0.029887,-0.449790,0.357512,0.286241,1.057898,1.819292,-0.000818,-0.919376,-1.445279,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0
4563,7.436537,0.785085,2.642044,-1.509178,-0.633574,0.699822,-0.415682,1.087694,1.349903,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0
4564,-0.353543,-0.869056,-0.584858,1.084205,0.086201,-0.419648,0.414047,-0.919376,0.185244,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,3.0,2.0
4565,-0.574995,-0.470243,0.386068,1.582933,0.446089,-0.419648,-0.830546,1.087694,-0.280620,0.0,...,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,2.0,0.0


# Applying the KNN Neighbours Regressor

In [64]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

KNN = KNeighborsRegressor(n_neighbors=6)
KNN.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=6)

In [65]:
y_pred = KNN.predict(X_test)

# Evaluate the R-squared score of the model
r2score = r2_score(y_test, y_pred)
print('R2 score:', r2score)

R2 score: 0.5560774511692324


In [66]:
score = KNN.score(X_test, y_test)
score

0.5560774511692324

In [67]:
score_train = KNN.score(X_train, y_train)
score_train

0.7095555086741863

# Making linear model and linear regression

In [68]:
from sklearn import linear_model

LR= linear_model.LinearRegression()
LR.fit(X_train,y_train)

LinearRegression()

In [69]:
score1= LR.score(X_train,y_train)
score1

0.630422016718765

In [70]:
score= LR.score(X_test,y_test)
score

0.6032316743831969

In [71]:
predictions = LR.predict(X_test)
r2_score(y_test, predictions)

0.6032316743831969

# Defining function to iterate through various models

In [86]:
def train_and_test_models(models, X_train, y_train, X_test, y_test):
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        print(f"Model: {type(model).__name__}, R2 score: {r2:.3f}")

In [87]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

MLP = MLPRegressor(hidden_layer_sizes=(10,5))
models = [KNeighborsRegressor(), LinearRegression(), MLP]
train_and_test_models(models, X_train, y_train, X_test, y_test)

Model: KNeighborsRegressor, R2 score: 0.551
Model: LinearRegression, R2 score: 0.603
Model: MLPRegressor, R2 score: 0.609




## After using linear regression and MLP regresssor, it appears that my model score has increased which means that I have a better model than before.