# Lab | Comparing regression models

### Used Libraries

In [104]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns 
import math
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor


from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    

#### recording the functions and solutions from week1_lab8, week4_lab1 and week_4lab2

####  Getting Data

In [2]:
data = pd.read_csv('we_fn_use_c_marketing_customer_value_analysis.csv')
data.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [3]:
data = data.copy()

In [4]:
data.info

<bound method DataFrame.info of      Customer       State  Customer Lifetime Value Response  Coverage  \
0     BU79786  Washington              2763.519279       No     Basic   
1     QZ44356     Arizona              6979.535903       No  Extended   
2     AI49188      Nevada             12887.431650       No   Premium   
3     WW63253  California              7645.861827       No     Basic   
4     HB64268  Washington              2813.692575       No     Basic   
...       ...         ...                      ...      ...       ...   
9129  LA72316  California             23405.987980       No     Basic   
9130  PK87824  California              3096.511217      Yes  Extended   
9131  TD14365  California              8163.890428       No  Extended   
9132  UP19263  California              7524.442436       No  Extended   
9133  Y167826  California              2611.836866       No  Extended   

     Education Effective To Date EmploymentStatus Gender  Income  ...  \
0     Bachelor    

#### Global variables

In [5]:
discrete_values = pd.DataFrame()
continuous_values = pd.DataFrame()
categorical_values = pd.DataFrame()

numerical_clean = pd.DataFrame()
categorical_clean = pd.DataFrame()

####  Cleaning/Wrangling/EDA

* Change headers names.
* Deal with NaN values.
* Categorical Features.
* Numerical Features.
* Exploration.

In [6]:
def check_dataypes(data):
    numerical_discrete = data.select_dtypes(np.number)
    categorical = data.select_dtypes(object)
    
    return numerical_discrete,categorical

In [7]:
discrete_values,categorical_values = check_dataypes(data)

In [8]:
def clean_data_basic(discrete_values,categorical_values):
    
    #replace missing numericals with medians
    #replace missing categoricals with mode
    for col in discrete_values.columns:
        discrete_values[col] = discrete_values[col].fillna(np.median(discrete_values[col]))
    for col in categorical_values.columns:
        categorical_values[col] = categorical_values[col].fillna(categorical_values[col].mode())
     
    #standardise header names
    discrete_df = discrete_values.rename(columns=str.lower)
    discrete_df.columns = discrete_df.columns.str.replace(' ', '_')
    discrete_df.drop_duplicates() 
    cat_df = categorical_values.rename(columns=str.lower) 
    cat_df.columns = cat_df.columns.str.replace(' ', '_')
    cat_df.drop_duplicates() 
       
    return discrete_df,cat_df

In [9]:
numerical_clean,categorical_clean = clean_data_basic(discrete_values,categorical_values)

In [10]:
clean_df = pd.concat([numerical_clean, categorical_clean],axis=1)
clean_df = clean_df.drop(columns = ['customer','effective_to_date'],axis=1) #because of its difficult to encode even though it is an object

In [11]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9134 entries, 0 to 9133
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   customer_lifetime_value        9134 non-null   float64
 1   income                         9134 non-null   int64  
 2   monthly_premium_auto           9134 non-null   int64  
 3   months_since_last_claim        9134 non-null   int64  
 4   months_since_policy_inception  9134 non-null   int64  
 5   number_of_open_complaints      9134 non-null   int64  
 6   number_of_policies             9134 non-null   int64  
 7   total_claim_amount             9134 non-null   float64
 8   state                          9134 non-null   object 
 9   response                       9134 non-null   object 
 10  coverage                       9134 non-null   object 
 11  education                      9134 non-null   object 
 12  employmentstatus               9134 non-null   o

#### We can try to replace certain categorical coloumns to ease up the encoding as follows

In [12]:
display(clean_df['policy_type'].unique())
display(clean_df['policy'].unique())

array(['Corporate Auto', 'Personal Auto', 'Special Auto'], dtype=object)

array(['Corporate L3', 'Personal L3', 'Corporate L2', 'Personal L1',
       'Special L2', 'Corporate L1', 'Personal L2', 'Special L1',
       'Special L3'], dtype=object)

In [13]:
clean_df['policy_type'] = clean_df['policy_type'].replace({'Corporate Auto':'special', 'Special Auto':'special','Personal Auto':'Personal'})
clean_df['policy_type'].value_counts()

Personal    6788
special     2346
Name: policy_type, dtype: int64

In [14]:
clean_df['education'] = clean_df['education'].replace({'Master':'post graduate','Doctor':'post graduate'})
clean_df['education'].value_counts()

Bachelor                2748
College                 2681
High School or Below    2622
post graduate           1083
Name: education, dtype: int64

In [15]:
clean_df['marital_status'] = clean_df['marital_status'].replace({'Divorced':'Single'})
clean_df['marital_status'].value_counts()

Married    5298
Single     3836
Name: marital_status, dtype: int64

In [16]:
clean_df['employmentstatus'] = clean_df['employmentstatus'].replace({'Medical Leave':'not available','Disabled':'not available','Unemployed':'not available','Retired':'not available'})
clean_df['employmentstatus'].value_counts()

Employed         5698
not available    3436
Name: employmentstatus, dtype: int64

In [17]:
clean_df['vehicle_class'] = clean_df['vehicle_class'].replace({'Sports Car':'Special','Luxury SUV':'Special','Luxury Car':'Special'})
clean_df['vehicle_class'].value_counts()


Four-Door Car    4621
Two-Door Car     1886
SUV              1796
Special           831
Name: vehicle_class, dtype: int64

# Lab 3 of week4

#### We will start with removing outliers, if you have not already done so. We have discussed different methods to remove outliers. Use the one you feel more comfortable with, define a function for that. Use the function to remove the outliers and apply it to the dataframe.

In [18]:
def remove_outliers(data):
    model_df = data.copy()
    numeric = model_df.select_dtypes(np.number)
    for col in numeric.columns:
        if col != 'total_claim_amount':
            iqr = np.percentile(model_df[col],75) - np.percentile(model_df[col],25)
            upper_limit = np.percentile(model_df[col],75) + 1.5*iqr
            lower_limit = np.percentile(model_df[col],25) - 1.5*iqr
            model_df = model_df[(model_df[col] > lower_limit) & (model_df[col] < upper_limit)]
        return model_df

In [19]:
model_df = remove_outliers(clean_df)
model_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_lifetime_value,8317.0,6279.177892,3240.426385,1898.007675,3746.751625,5470.060561,8162.617053,16407.66461
income,8317.0,37638.476855,30415.335684,0.0,0.0,33837.0,62229.0,99981.0
monthly_premium_auto,8317.0,90.949621,31.047412,61.0,68.0,81.0,107.0,298.0
months_since_last_claim,8317.0,15.034387,10.03885,0.0,6.0,14.0,23.0,35.0
months_since_policy_inception,8317.0,48.063124,27.92025,0.0,24.0,48.0,71.0,99.0
number_of_open_complaints,8317.0,0.393171,0.92714,0.0,0.0,0.0,0.0,5.0
number_of_policies,8317.0,3.037994,2.470996,1.0,1.0,2.0,4.0,9.0
total_claim_amount,8317.0,423.041312,275.674499,0.099007,268.471802,374.4,542.4,2893.239678


#### 1. In this final lab, we will model our data. Import sklearn train_test_split and separate the data.

In [20]:
X = model_df.drop(['total_claim_amount'], axis=1)
y = model_df['total_claim_amount']

In [21]:
#gloabal split data frame declarations
X_train_disc = pd.DataFrame
X_train_conti = pd.DataFrame
X_train_cat = pd.DataFrame

X_test_disc = pd.DataFrame
X_test_conti = pd.DataFrame
X_test_cat = pd.DataFrame

In [22]:
def split_train_test(X,y,test_size,random_state):
    X_train,X_test,y_train,y_test = train_test_split( X, y, test_size=test_size, random_state=random_state)
    
    global X_train_disc
    global X_train_conti
    global X_train_cat
    global X_test_disc
    global X_test_conti
    global X_test_cat
    
    X_train_disc = X_train.select_dtypes(np.int64)
    X_train_conti = X_train.select_dtypes(np.float64)
    X_train_cat = X_train.select_dtypes(object)

    
    X_test_disc = X_test.select_dtypes(np.int64)
    X_test_conti = X_test.select_dtypes(np.float64)
    X_test_cat = X_test.select_dtypes(object)
    
    return X_train,X_test,y_train,y_test

In [23]:
X_train,X_test,y_train,y_test = split_train_test( X, y,0.30,40)

#### 3.Create a copy of the dataframe for the data wrangling.

In [24]:
# see results above

#### 4. Normalize the continuous variables. You can use any one method you want.

we will be using a Power Transformer

In [25]:
pT = PowerTransformer()
pT.fit(X_train_conti)

X_train_continuous_tran_array = pT.transform(X_train_conti)
X_test_continuous_tran_array = pT.transform(X_test_conti)

#converting array of continuous values to Dataframe
X_train_continuous_trans_pd = pd.DataFrame(X_train_continuous_tran_array, columns=X_train_conti.columns,index=X_train_conti.index)
X_test_continuous_trans_pd = pd.DataFrame(X_test_continuous_tran_array, columns=X_test_conti.columns,index=X_test_conti.index)

#### 5.Encode the categorical variables

In [26]:
def encode_cat(data):
    data = data.drop(columns=['state', 'marital_status','policy_type', 'sales_channel','vehicle_class'], axis=1)
    return pd.get_dummies(data, drop_first=True)

In [27]:
X_train_cat_encoded = encode_cat(X_train_cat)
X_test_cat_encoded = encode_cat(X_test_cat)

In [28]:
#processed df for the regression model creation
X_train_reg = pd.concat([X_train_disc, X_train_continuous_trans_pd, X_train_cat_encoded], axis=1)
X_test_reg = pd.concat([X_test_disc, X_test_continuous_trans_pd, X_test_cat_encoded], axis=1)

#### Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.

In [29]:
X_train_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5821 entries, 5526 to 8347
Data columns (total 30 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   income                          5821 non-null   int64  
 1   monthly_premium_auto            5821 non-null   int64  
 2   months_since_last_claim         5821 non-null   int64  
 3   months_since_policy_inception   5821 non-null   int64  
 4   number_of_open_complaints       5821 non-null   int64  
 5   number_of_policies              5821 non-null   int64  
 6   customer_lifetime_value         5821 non-null   float64
 7   response_Yes                    5821 non-null   uint8  
 8   coverage_Extended               5821 non-null   uint8  
 9   coverage_Premium                5821 non-null   uint8  
 10  education_College               5821 non-null   uint8  
 11  education_High School or Below  5821 non-null   uint8  
 12  education_post graduate        

In [30]:
#X_test_reg.info()

##### Nope. we don't seem to have any object types, we are good to go!

#### Try a simple linear regression with all the data to see whether we are getting good results.

In [31]:
lm = LinearRegression()
lm.fit(X_train_reg, y_train)
print(f'Train score: {lm.score(X_train_reg, y_train)}')
print(f'Test score: {lm.score(X_test_reg, y_test)}')

Train score: 0.7555347211131858
Test score: 0.7333079599961732


##### we seem to have good enough accuracy. moving further

#### Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.

In [52]:
def model_trainer(model,X_train_reg,y_train):
    model_reg = model
    model_reg.fit(X_train_reg, y_train)
    return model_reg

In [107]:
def model_validator(fitted_model, X_train, y_train, X_test, y_test):
    
    y_pred_train = fitted_model.predict(X_train)
    train_set_score = r2_score(y_train, y_pred_train)
   
    y_pred_test = fitted_model.predict(X_test)
    test_set_score = r2_score(y_test, y_pred_test)
    return train_set_score,test_set_score

In [108]:
#user input

In [109]:
# pass the name of the model in place of model name
#Example: fitted_model = model_trainer(name_of_the_model,X_train_reg,y_train)

fitted_model = model_trainer(LinearRegression(),X_train_reg,y_train)

In [110]:
model_validator(fitted_model,X_train_reg, y_train,X_test_reg, y_test)
#here X_train_reg and X_test_reg are the cleaned and normalised values that's used in building the model

(0.7555347211131858, 0.7333079599961732)

#### Use the function to check LinearRegressor and KNeighborsRegressor.

#### You can check also the MLPRegressor for this task!

In [120]:
def input_model_name():
    print("the following are the available models" )
    print("1. Linear Regression")
    print("2. KNeighborsRegressor")
    print("3. MLPRegressor")
    
    input_value = int(input("select the model:"))
    if (input_value == 1):
        fitted_model = model_trainer(LinearRegression(),X_train_reg,y_train)
        train_set_score,test_set_score = model_validator(fitted_model,X_train_reg, y_train,X_test_reg, y_test)
        print(f"Training score: {train_set_score}")
        print(f"Test score: {test_set_score}")
     
    if (input_value == 2):
        fitted_model = model_trainer(KNeighborsRegressor(n_neighbors=4),X_train_reg,y_train)
        train_set_score,test_set_score = model_validator(fitted_model,X_train_reg, y_train,X_test_reg, y_test)
        print(f"Training score: {train_set_score}")
        print(f"Test score: {test_set_score}")    
    if (input_value == 3):
        fitted_model = model_trainer(MLPRegressor(),X_train_reg,y_train)
        train_set_score,test_set_score = model_validator(fitted_model,X_train_reg, y_train,X_test_reg, y_test)
        print(f"Training score: {train_set_score}")
        print(f"Test score: {test_set_score}")

In [125]:
input_model_name()

the following are the available models
1. Linear Regression
2. KNeighborsRegressor
3. MLPRegressor
select the model:3
Training score: 0.5943049451598201
Test score: 0.5889357042072666


#### Check and discuss the results.

For the test set slect at 30 percent and random state of 40. The following were the results in the form of R2 score for different Models:
    

* Linear Regressor:
    Training score: 0.7555347211131858
    Test score: 0.7333079599961732

* KNN regressor:
    Training score: 0.6306871577192283
    Test score: 0.383922980791489

* MLP Regressor:
    Training score: 0.5943049451598201
    Test score: 0.5889357042072666