# Building a Regression Model (KMeans Clustering) using Machine Learning 

Applying Logistic Regression, K-NN Regressor, Random Forest Regressor and Multi Layer Perceptron Regressor to each of our clusters. We also calculate the RMSE and MAE value for the train and test data as a performance metric for each of our models to understand the better performing model.

Import necessary packages and define mean_absolute_percentage_error

In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_validation import train_test_split  
from sklearn.utils import check_array


def mean_absolute_percentage_error(y_test,x_predict):
    np.seterr(divide='ignore',invalid='ignore')
    y_test,x_predict=np.array(y_test),np.array(x_predict) 
    return np.mean(np.abs((y_test - x_predict)/y_test))*100




Load Cluster

In [3]:
loandata=pd.read_csv('../cluster2.csv',encoding="ISO-8859-1")



In [4]:
loandata=loandata[loandata.int_rate!=0]

Divide test and train in the 30-70 ratio and separating the int_rate column into y_train, y_test dataframes for building our model

In [5]:
loandataforprediction=loandata[['loan_amnt','int_rate','term','emp_length','home_ownership','annual_inc','verification_status','purpose','addr_state','dti','delinq_2yrs','Risk_Score','inq_last_6mths','open_acc','revol_bal','revol_util','total_acc','mths_since_last_major_derog','funded_amnt_inv','installment','application_type','pub_rec','addr_state']]

loandataforprediction=pd.get_dummies(loandataforprediction, columns=["purpose"])
loandataforprediction=pd.get_dummies(loandataforprediction,columns=["application_type"])
train,test = train_test_split(loandataforprediction, train_size = 0.7)
y_train = train['int_rate']
y_test = test['int_rate']
x_train = train.loc[:, train.columns != 'int_rate']
x_test = test.loc[:, test.columns != 'int_rate']

### Machine Learning algorithms used :

We have applied below machine learning algorithms and evaluated the performance of the data on each model by calculating the MAE and RMSE for test and train data

## Linear Regression

In [5]:
#Multple Linear Regression Case 1:
reg=linear_model.LinearRegression()
reg.fit(x_train,y_train)
predicted_train=reg.predict(x_train)
predicted_test=reg.predict(x_test)
print("Testing mean absolute error : %f" %mean_absolute_error(y_test,predicted_test))
print("Training mean absolute error: %f" %mean_absolute_error(y_train,predicted_train))
rmse=np.sqrt(mean_squared_error(y_test,predicted_test))
print("RMSE Value for Testing")
print(rmse)
rmse=np.sqrt(mean_squared_error(y_train,predicted_train))
print("RMSE Value for Training")
print(rmse)
print("MAPE Value for Testing")
print(mean_absolute_percentage_error(y_test, predicted_test))
print("MAPE Value for Training")
print(mean_absolute_percentage_error(y_train, predicted_train))


Testing mean absolute error : 0.780577
Training mean absolute error: 0.781833
RMSE Value for Testing
1.03688577493
RMSE Value for Training
1.0216601828
MAPE Value for Testing
6.22663423513
MAPE Value for Training
6.22332649453


## KNN Regressor

In [6]:
  
#KNN Regressor


# Create the knn model.
# Look at the five closest neighbors.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data.
knn.fit(x_train, y_train)
# Make point predictions on the test set using the fit model.
predicted_train = knn.predict(x_train)
predicted_test = knn.predict(x_test)
print("Testing mean absolute error : %f" %mean_absolute_error(y_test,predicted_test))
print("Training mean absolute error: %f" %mean_absolute_error(y_train,predicted_train))
rmse=np.sqrt(mean_squared_error(y_test,predicted_test))
print("RMSE Value for Testing")
print(rmse)
rmse=np.sqrt(mean_squared_error(y_train,predicted_train))
print("RMSE Value for Training")
print(rmse)
print("MAPE Value for Testing")
print(mean_absolute_percentage_error(y_test, predicted_test))
print("MAPE Value for Training")
print(mean_absolute_percentage_error(y_train, predicted_train))


Testing mean absolute error : 4.202137
Training mean absolute error: 3.387429
RMSE Value for Testing
5.33109010476
RMSE Value for Training
4.31766467233
MAPE Value for Testing
33.1486911777
MAPE Value for Training
26.6583848685


## Random Forest Regressor 

In [6]:

#RandomForest Regressor
from sklearn.ensemble import RandomForestRegressor

z='auto'
x=100
y=1
model=RandomForestRegressor(n_estimators=x,max_features=z,oob_score=True,n_jobs=-1,random_state=50,min_samples_leaf=y)
model.fit(x_train,y_train)
predicted_train=model.predict(x_train)
predicted_test=model.predict(x_test)
print("Tuning Combination") 
print(x," ",y ," ", z,"")
print("Testing mean absolute error : %f" % mean_absolute_error(y_test,predicted_test))
print("Training mean absolute error: %f" %mean_absolute_error(y_train,predicted_train))
rmse=np.sqrt(mean_squared_error(y_test,predicted_test))
print("RMSE Value for testing")
print(rmse)
rmse=np.sqrt(mean_squared_error(y_train,predicted_train))
print("RMSE Value for training")
print(rmse)
print("MAPE Value for Testing")
print(mean_absolute_percentage_error(y_test, predicted_test))
print("MAPE Value for Training")
print(mean_absolute_percentage_error(y_train, predicted_train))


Tuning Combination
100   1   auto 
Testing mean absolute error : 0.608058
Training mean absolute error: 0.229599
RMSE Value for testing
1.3077769871895115
RMSE Value for training
0.49289877073176924
MAPE Value for Testing
5.60486177159857
MAPE Value for Training
2.1112728737989865


In [7]:
from sklearn.externals import joblib
joblib.dump(model,'KMeans_Cluster_2_Regression.pkl')

['KMeans_Cluster_2_Regression.pkl']

## MLP Regressor 

In [9]:

#Neural Network Case 1:
#List of Alpha Values
from sklearn.neural_network import MLPRegressor
hidden_layers=[]
alpha=[10]


for a in alpha:  
    nn = MLPRegressor(hidden_layer_sizes=(100,),  activation='relu', solver='adam', alpha=a,batch_size='auto',learning_rate='constant',
    learning_rate_init=0.001, max_iter=1000, shuffle=True,random_state=50, tol=0.0001, verbose=False, warm_start=False,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    n = nn.fit(x_train,y_train)
    predicted_trainnn=n.predict(x_train)
    predicted_testnn=n.predict(x_test)
    print("Testing mean absolute error : %f" % mean_absolute_error(y_test,predicted_testnn))
    print("training mean absolute error: %f"% mean_absolute_error(y_train,predicted_trainnn))
    rmse=np.sqrt(mean_squared_error(y_test,predicted_testnn))
    print("RMSE Value For Testing")
    print(rmse)
    rmse=np.sqrt(mean_squared_error(y_train,predicted_trainnn))
    print("RMSE Value For Training")
    print(rmse)
    print("MAPE Value for Testing")
    print(mean_absolute_percentage_error(y_test, predicted_testnn))
    print("MAPE Value for Training")
    print(mean_absolute_percentage_error(y_train, predicted_trainnn))

Testing mean absolute error : 375.494396
training mean absolute error: 373.435812
RMSE Value For Testing
1016.1347516
RMSE Value For Training
612.172483269
MAPE Value for Testing
3047.73570081
MAPE Value for Training
3021.12738845


# Conclusion :

#### We see that the Random Forest Regressor outperforms for this cluster of FICO Score clustering with MAE for test and train as 0.608 and 0.229 