In [221]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestRegressor
import joblib
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
import xgboost as xgb

In [222]:
train2_data = pd.read_csv('train.csv')
train2_data = train2_data.dropna()
train2_data.head().T

Unnamed: 0,0,1,2,3,4
loan_number,0,1,2,3,4
emp_length,3.0,10.0,10.0,10.0,10.0
homeownership,OWN,OWN,RENT,MORTGAGE,MORTGAGE
annual_income,30000.0,60000.0,84000.0,131000.0,188500.0
verified_income,Source Verified,Verified,Source Verified,Source Verified,Not Verified
debt_to_income,20.72,1.0,15.23,27.95,7.93
delinq_2y,0,0,0,0,0
total_credit_limit,48400,31500,74900,408405,85595
total_credit_utilized,16832,1251,24189,78141,57993
public_record_bankrupt,0,0,0,0,0


In [223]:
test2_data = pd.read_csv('test.csv')
test2_data.head().T

Unnamed: 0,0,1,2,3,4
loan_number,4000,4001,4002,4003,4004
emp_length,4.0,1.0,2.0,10.0,6.0
homeownership,MORTGAGE,MORTGAGE,RENT,MORTGAGE,OWN
annual_income,75000.0,75000.0,50000.0,140000.0,80000.0
verified_income,Source Verified,Verified,Source Verified,Source Verified,Source Verified
debt_to_income,8.46,33.97,16.37,14.23,6.84
delinq_2y,0,1,0,0,0
total_credit_limit,293947,128979,39814,57595,193672
total_credit_utilized,164954,59745,25131,43891,13334
public_record_bankrupt,0,0,0,1,0


In [224]:
features = ['loan_number', 'debt_to_income', 'delinq_2y', 'total_credit_limit', 'total_credit_utilized', 'public_record_bankrupt', 'term', 'annual_income']
target = 'interest_rate'

In [225]:
train2_data, test2_data = train_test_split(train2_data, test_size=0.2, random_state=42)

In [226]:
train2_data = train2_data.dropna(subset=features + [target])

In [227]:
X_train = train2_data[features]
y_train = train2_data[target]
X_test = test2_data[features]
y_test = test2_data[target]

In [228]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [229]:
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [230]:
y_pred = model.predict(X_test)

In [231]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

RMSE: 4.333892153272571


In [232]:
X_train = sm.add_constant(X_train)
ols_model = sm.OLS(y_train, X_train)
results = ols_model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          interest_rate   R-squared:                       0.216
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     101.4
Date:                Mon, 29 May 2023   Prob (F-statistic):          1.61e-149
Time:                        15:31:47   Log-Likelihood:                -8569.4
No. Observations:                2954   AIC:                         1.716e+04
Df Residuals:                    2945   BIC:                         1.721e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      4

In [234]:
X_train = sm.add_constant(X_train)
model_sm = sm.OLS(y_train, X_train).fit()

In [235]:
r_squared = model_sm.rsquared
adjusted_r_squared = model_sm.rsquared_adj

print("R-squared:", r_squared)
print("Adjusted R-squared:", adjusted_r_squared)

R-squared: 0.2158865966074548
Adjusted R-squared: 0.21375657717548868


In [240]:
X_test_sm = sm.add_constant(X_test)

In [241]:
y_pred_sm = model_sm.predict(X_test)

In [242]:
# Add a constant column to the test data
X_test_sm = sm.add_constant(X_test)

# Use the trained model to predict the target variable
y_pred_sm = model_sm.predict(X_test)


In [243]:
y_pred_sm

429      9.159530
3857     7.672950
603     11.941737
1396    11.031815
888     10.559127
          ...    
2200    15.845935
3129     7.871210
2173    10.172627
1747    10.202511
3941    12.336241
Length: 739, dtype: float64

In [245]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_sm))
print("RMSE:", rmse)

RMSE: 4.398982989022776


In [None]:
features = ['loan_number', 'total_credit_limit', 'total_credit_utilized', 'public_record_bankrupt', 'term', 'annual_income']
target = 'interest_rate'
X = train2_data[features]
y = train2_data[target]