In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [4]:
train_data = pd.read_csv('train.csv')

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   loan_number             4000 non-null   int64  
 1   emp_length              3694 non-null   float64
 2   homeownership           4000 non-null   object 
 3   annual_income           4000 non-null   float64
 4   verified_income         4000 non-null   object 
 5   debt_to_income          3984 non-null   float64
 6   delinq_2y               4000 non-null   int64  
 7   total_credit_limit      4000 non-null   int64  
 8   total_credit_utilized   4000 non-null   int64  
 9   public_record_bankrupt  4000 non-null   int64  
 10  loan_purpose            4000 non-null   object 
 11  application_type        4000 non-null   object 
 12  loan_amount             4000 non-null   int64  
 13  term                    4000 non-null   int64  
 14  issue_month             4000 non-null   

In [7]:
# Define the features and target variable
features = ['loan_number', 'total_credit_limit', 'total_credit_utilized', 'public_record_bankrupt', 'term', 'annual_income']
target = 'interest_rate'

In [8]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data[features], train_data[target], test_size=0.2, random_state=42)

In [9]:
# Perform feature selection using variance inflation factor (VIF)
def calculate_vif(X):
    vif = pd.DataFrame()
    vif["Features"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif

In [10]:
# Check for multicollinearity
vif = calculate_vif(X_train)
high_vif_features = vif[vif['VIF'] > 5]['Features']
X_train.drop(high_vif_features, axis=1, inplace=True)
X_test.drop(high_vif_features, axis=1, inplace=True)

In [11]:
# Perform imputation for missing values
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


In [12]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Create a Random Forest Regressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [14]:

# Train the model
model_rf.fit(X_train_scaled, y_train)

RandomForestRegressor(random_state=42)

In [15]:
# Make predictions on the test data
y_pred = model_rf.predict(X_test_scaled)

In [18]:
# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

RMSE: 4.507279866730875
