In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
#from sklearn.impute import IterativeImputer

In [46]:
data = pd.read_csv('train.csv')

In [47]:
# Select the features and target variable
selected_features = ['emp_length', 'homeownership', 'annual_income', 'verified_income', 'debt_to_income',
                     'delinq_2y', 'total_credit_limit', 'total_credit_utilized', 'public_record_bankrupt',
                     'loan_purpose', 'application_type', 'loan_amount', 'term', 'issue_month']
target_variable = 'interest_rate'

In [48]:
# Drop rows with missing values
data = data[selected_features + [target_variable]].dropna()

In [49]:

# Encode categorical features
categorical_features = ['homeownership', 'verified_income', 'loan_purpose', 'application_type']
for feature in categorical_features:
    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])

In [50]:
# Encode 'issue_month' feature
data['issue_month'] = pd.to_datetime(data['issue_month'], format='%b-%y')
data['issue_month_year'] = data['issue_month'].dt.year
data['issue_month'] = data['issue_month'].dt.month

In [51]:
# Split the data into training and test sets
X = data[selected_features]
y = data[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=200, random_state=42)

In [53]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [54]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

RMSE: 4.120501513142302


In [55]:
test_data = pd.read_csv('test.csv')

In [56]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   loan_number             1000 non-null   int64  
 1   emp_length              900 non-null    float64
 2   homeownership           1000 non-null   object 
 3   annual_income           1000 non-null   float64
 4   verified_income         1000 non-null   object 
 5   debt_to_income          998 non-null    float64
 6   delinq_2y               1000 non-null   int64  
 7   total_credit_limit      1000 non-null   int64  
 8   total_credit_utilized   1000 non-null   int64  
 9   public_record_bankrupt  1000 non-null   int64  
 10  loan_purpose            1000 non-null   object 
 11  application_type        1000 non-null   object 
 12  loan_amount             1000 non-null   int64  
 13  term                    1000 non-null   int64  
 14  issue_month             1000 non-null   o

In [90]:
test_data.isnull().sum()

loan_number                 0
emp_length                100
homeownership               0
annual_income               0
verified_income             0
debt_to_income              2
delinq_2y                   0
total_credit_limit          0
total_credit_utilized       0
public_record_bankrupt      0
loan_purpose                0
application_type            0
loan_amount                 0
term                        0
issue_month                 0
dtype: int64

In [91]:
test_data['emp_length'] = test_data['emp_length'].fillna(test_data['emp_length'].mean())

In [95]:
test_data['debt_to_income'] = test_data['debt_to_income'].fillna(test_data['debt_to_income'].mean())

In [110]:
test_data.isnull().sum()

loan_number               0
emp_length                0
homeownership             0
annual_income             0
verified_income           0
debt_to_income            0
delinq_2y                 0
total_credit_limit        0
total_credit_utilized     0
public_record_bankrupt    0
loan_purpose              0
application_type          0
loan_amount               0
term                      0
issue_month               0
dtype: int64

In [111]:
# Instantiate the GradientBoostingRegressor model
model = GradientBoostingRegressor()

In [112]:
model.fit(X_train, y_train)

GradientBoostingRegressor()

In [113]:
# Predict interest_rate on the test data
y_pred = model.predict(X_test)

In [114]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

RMSE: 4.085002530836454


In [108]:
model.fit(X, y)

GradientBoostingRegressor()

In [115]:
y_pred

array([ 8.53584533,  9.23864922,  9.39802679,  9.41461879,  7.48814044,
       12.52127287, 12.31794612, 11.0248986 , 12.4176866 , 12.15853028,
       10.69113227, 12.22008785, 15.85690856, 12.63175145, 10.83083192,
       10.61375708,  8.58604988, 10.56282403, 14.39192015, 13.76154668,
        9.76294296, 10.3388476 , 11.82655778, 18.97216316, 12.50096936,
       12.82935339, 15.0153035 ,  9.80502659, 19.8903154 , 12.51552811,
       13.81727394, 10.26910559, 13.36939075, 13.27622669, 13.36911491,
       15.15812815, 11.3420117 , 12.56805863, 14.82420387, 10.42551332,
        7.82286337, 12.54648194,  9.08999586,  8.36443563, 13.12368496,
       11.76837662,  9.2659724 , 14.21505842,  9.29110447,  9.49946796,
        6.51464941, 12.03726097, 15.9455565 , 12.9603091 , 11.08459274,
       12.65286702, 14.40552897, 11.98400258,  9.81502378, 10.38150706,
       10.47875283, 13.40293254, 14.44708313, 14.15960971, 10.52325948,
       11.75361554, 14.98240887,  9.14440843, 11.96129637, 13.42

In [117]:
submission_gbx = test_data[["loan_number"]]
submission_gbx

Unnamed: 0,loan_number
0,4000
1,4001
2,4002
3,4003
4,4004
...,...
995,4995
996,4996
997,4997
998,4998


In [120]:
test_predictions = model.predict(test_features_imputed)



ValueError: X has 15 features, but GradientBoostingRegressor is expecting 14 features as input.