In [32]:
import numpy as np
import pandas as pd
% matplotlib inline

In [33]:
# Load the loan data set from the notebook's working directory.
LOAN_CSV_PATH = "loan_new.csv"
df = pd.read_csv(LOAN_CSV_PATH)

In [34]:
# (rows, columns) of the full data set as loaded from the CSV.
df.shape

(887379, 15)

In [35]:
# Work on a random 20% subsample of the rows.
# A fixed random_state makes the subsample (and every metric computed from it
# further down) repeatable after Restart & Run All. The outputs recorded in
# this notebook came from an unseeded draw, so they will differ slightly.
# NOTE: this cell overwrites `df`; running it twice samples the sample again.
df = df.sample(frac=0.2, random_state=123)

In [36]:
# Shape after subsampling; the row count should be about 20% of the original.
df.shape

(177476, 15)

In [37]:
# Preview the first rows of the subsample (the index keeps the original row labels).
df.head()

Unnamed: 0,funded_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,loan_status,dti,earliest_cr_line,revol_bal,revol_util,total_acc
486732,6400,36 months,12.59,C,C2,6 years,MORTGAGE,25000.0,Verified,Current,34.85,Mar-1994,14776.0,69.7,16.0
374356,14400,60 months,14.99,C,C5,10+ years,MORTGAGE,62000.0,Source Verified,Current,28.94,Sep-1991,15020.0,39.6,34.0
747161,6400,36 months,7.26,A,A4,8 years,RENT,60000.0,Source Verified,Current,14.06,Mar-2000,5475.0,33.8,23.0
8561,2500,36 months,5.42,A,A1,10+ years,OWN,48000.0,Not Verified,Fully Paid,9.2,Jan-1995,1421.0,4.6,17.0
330352,18000,36 months,6.03,A,A1,10+ years,MORTGAGE,160000.0,Verified,Fully Paid,20.93,Nov-1995,26011.0,55.9,33.0


In [38]:
# Collapse employment length into two levels: ">=10" for "10+ years", "<10" otherwise.
# Any missing emp_length compares unequal to "10+ years" and so also becomes "<10".
has_ten_plus_years = df["emp_length"].eq("10+ years")
df["emp_length"] = np.where(has_ten_plus_years, ">=10", "<10")

In [39]:
# Fold the OTHER / NONE / ANY ownership levels into "MORTGAGE";
# every other value (including missing ones) is kept unchanged.
levels_to_merge = ["OTHER", "NONE", "ANY"]
is_merged_level = df["home_ownership"].isin(levels_to_merge)
df["home_ownership"] = np.where(is_merged_level, "MORTGAGE", df["home_ownership"])

In [40]:
# Cap the upper tail of `dti`: values above 40 become 40, missing values stay missing.
df["dti"] = df["dti"].clip(upper=40)

In [41]:
# Cap the upper tail of `revol_bal`: values above 50000 become 50000, missing values stay missing.
df["revol_bal"] = df["revol_bal"].clip(upper=50000)

In [42]:
# Cap the upper tail of `revol_util`: values above 100 become 100, missing values stay missing.
df["revol_util"] = df["revol_util"].clip(upper=100)

In [43]:
# Cap the upper tail of `annual_inc`: values above 150000 become 150000, missing values stay missing.
df["annual_inc"] = df["annual_inc"].clip(upper=150000)

In [44]:
# List the available columns before choosing features and target.
df.columns

Index(['funded_amnt', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_length',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'dti', 'earliest_cr_line', 'revol_bal', 'revol_util', 'total_acc'],
      dtype='object')

In [45]:
# Feature matrix: the nine predictor columns fed to the models.
feature_columns = [
    'funded_amnt',
    'term',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'dti',
    'revol_bal',
    'revol_util',
]
X = df[feature_columns]

In [46]:
# Regression target: the `int_rate` column.
target_column = 'int_rate'
y = df[target_column]

In [47]:
# Columns routed to the numeric branch of the preprocessing step.
numerical_variable = [
    "funded_amnt",
    "annual_inc",
    "dti",
    "revol_bal",
    "revol_util",
]

# Columns routed to the categorical branch of the preprocessing step.
categorical_variable = [
    "term",
    "emp_length",
    "home_ownership",
    "verification_status",
]

In [48]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [49]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ('missing_value_handler', SimpleImputer(strategy="median")),
    ('scaling', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

In [50]:
# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode into a dense array.
#
# scikit-learn >= 1.2 calls the dense-output switch `sparse_output`, and the old
# `sparse` keyword was later removed; older releases only know `sparse`.
# Try the current name first and fall back when it is rejected, so the cell
# runs on both.
#
# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising an error at predict time.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

categorical_transformer = Pipeline(steps=[
    ('missing_value_handler', SimpleImputer(strategy="most_frequent")),
    ('one_hot_encoding', one_hot_encoder)
])

In [51]:
# Route each column group to its own sub-pipeline; columns not listed in either
# group are dropped (ColumnTransformer's default `remainder`).
column_branches = [
    ('numeric_transform', numerical_transformer, numerical_variable),
    ('categorical_transformer', categorical_transformer, categorical_variable),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [52]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [53]:
# Model 1: preprocessing followed by ordinary least-squares regression.
# Note: the same `preprocessor` object is reused by every model pipeline.
linear_steps = [
    ('preprocessing', preprocessor),
    ('linear_model', LinearRegression()),
]
model_1 = Pipeline(steps=linear_steps)

In [54]:
# Model 2: preprocessing followed by ridge regression (alpha = 0.1).
ridge_steps = [
    ('preprocessing', preprocessor),
    ('ridge_model', Ridge(alpha=0.1)),
]
model_2 = Pipeline(steps=ridge_steps)

In [55]:
# Model 3: preprocessing followed by lasso regression (alpha = 0.1).
lasso_steps = [
    ('preprocessing', preprocessor),
    ('lasso_model', Lasso(alpha=0.1)),
]
model_3 = Pipeline(steps=lasso_steps)

In [56]:
from sklearn.neighbors import KNeighborsRegressor

# Model 4: preprocessing followed by k-nearest-neighbours regression (k = 7).
knn_steps = [
    ('preprocessing', preprocessor),
    ('knn_model', KNeighborsRegressor(n_neighbors=7)),
]
model_4 = Pipeline(steps=knn_steps)

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.30, random_state=123)
X_train, X_test, y_train, y_test = split_parts

In [58]:
# Fit the preprocessing + LinearRegression pipeline on the training split.
# The fitted pipeline is returned and shown as the cell output.
model_1.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric_transform', Pipeline(memory=None,
     steps=[('missing_value_handler', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
  ...r_model', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [59]:
# Fit the preprocessing + Ridge pipeline on the training split.
# This also refits the `preprocessor` object shared with the other pipelines.
model_2.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric_transform', Pipeline(memory=None,
     steps=[('missing_value_handler', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
  ...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [60]:
# Fit the preprocessing + Lasso pipeline on the training split.
# This also refits the `preprocessor` object shared with the other pipelines.
model_3.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric_transform', Pipeline(memory=None,
     steps=[('missing_value_handler', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
  ...e=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [61]:
# Fit the preprocessing + KNeighborsRegressor pipeline on the training split.
# This also refits the `preprocessor` object shared with the other pipelines.
model_4.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric_transform', Pipeline(memory=None,
     steps=[('missing_value_handler', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
  ...wski',
          metric_params=None, n_jobs=None, n_neighbors=7, p=2,
          weights='uniform'))])

In [62]:
# Predict `int_rate` for the held-out test rows with the linear-regression pipeline.
y_pred_1 = model_1.predict(X_test)

In [63]:
# Predict `int_rate` for the held-out test rows with the ridge pipeline.
y_pred_2 = model_2.predict(X_test)

In [64]:
# Predict `int_rate` for the held-out test rows with the lasso pipeline.
y_pred_3 = model_3.predict(X_test)

In [65]:
# Predict `int_rate` for the held-out test rows with the k-nearest-neighbours pipeline.
y_pred_4 = model_4.predict(X_test)

In [66]:
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the linear-regression pipeline (square root of the MSE).
mse_1 = mean_squared_error(y_test, y_pred_1)
np.sqrt(mse_1)

3.5942499874718807

In [67]:
# Test-set RMSE of the ridge pipeline (square root of the MSE).
mse_2 = mean_squared_error(y_test, y_pred_2)
np.sqrt(mse_2)

3.594207994961242

In [68]:
# Test-set RMSE of the lasso pipeline (square root of the MSE).
mse_3 = mean_squared_error(y_test, y_pred_3)
np.sqrt(mse_3)

3.6231237571807524

In [69]:
# Test-set RMSE of the k-nearest-neighbours pipeline (square root of the MSE).
mse_4 = mean_squared_error(y_test, y_pred_4)
np.sqrt(mse_4)

3.7078558395337256