In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load the loan dataset straight into a pandas DataFrame.
# NOTE: the CSV is currently hosted on Google Drive; it should be updated to
# match the copy of the dataset kept in GitHub.
url = 'https://drive.google.com/uc?id=1XxbB5g_ohunOsbvm8A1FYH7ZVTqCQxpC'
df = pd.read_csv(filepath_or_buffer=url)
print(df)


# Columns that contain at least one missing value (computed on the full,
# unfiltered frame).
null_features = [feature for feature in df.columns if df[feature].isnull().any()]

# Split those columns by how they will be imputed:
# non-object (numeric) columns -> median, object (categorical) columns -> mode.
nan_numeric = []
nan_categoric = []

for feature in null_features:
    if df[feature].dtype != 'O':
        nan_numeric.append(feature)
    else:
        nan_categoric.append(feature)


# Keep only the rows whose Loan_Status is 'Y' (successful loans).
# Missing values are NOT dropped here; they are imputed below.
# .copy() makes the filtered frame independent of the original, so the column
# assignments that follow do not act on a slice (SettingWithCopyWarning).
df = df[df['Loan_Status'] == 'Y'].copy()

# Impute missing categorical values with the most frequent value.
# Series.mode() returns a Series (several entries when values tie), so take
# the first entry to get a single scalar fill value.
for feature in nan_categoric:
    df[feature] = df[feature].fillna(df[feature].mode().iloc[0])

# Impute missing numeric values with the column median, truncated to an int.
# NOTE(review): LoanAmount, which is used as the regression target later in
# this notebook, is imputed here as well when it has gaps -- consider dropping
# rows with a missing target instead of training on an imputed value.
for feature in nan_numeric:
    df[feature] = df[feature].fillna(int(df[feature].median()))

print(df)

# Convert categorical columns to integer class labels.
# A single encoder object is refitted for each column in turn, so once the
# loop finishes it only holds the classes of the last column processed.
label_encoder = preprocessing.LabelEncoder()
for column in ('Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'):
    df[column] = label_encoder.fit_transform(df[column])

print(df)

# Feature scaling: standardise the listed numeric columns in place.
# NOTE(review): the scaler is fitted on every row, i.e. before the data is
# split into train and test sets, so test rows influence the scaling
# statistics -- confirm this is intended.
scaler = StandardScaler()
numeric_columns = ['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History']
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Split the dataset into features (X) and the target variable (y)
# Note: please notify frontend when columns or column order is changed
X = df.drop(['Loan_ID', 'Loan_Status', 'Dependents', 'LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Unnamed: 0'], axis=1)
print(X.columns)
# Convert input to number matrix for compatibility with serializer
X = X.to_numpy()
# Select the target as a 1-D Series (single brackets). A one-column DataFrame
# (double brackets) is a column vector, which makes scikit-learn regressors
# emit a DataConversionWarning when fitting.
y = df['LoanAmount']
print(y)
# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

     Unnamed: 0   Loan_ID  Gender Married Dependents     Education  \
0             0  LP001002    Male      No          0      Graduate   
1             1  LP001003    Male     Yes          1      Graduate   
2             2  LP001005    Male     Yes          0      Graduate   
3             3  LP001006    Male     Yes          0  Not Graduate   
4             4  LP001008    Male      No          0      Graduate   
..          ...       ...     ...     ...        ...           ...   
495         495  LP002586  Female     Yes          1      Graduate   
496         496  LP002587    Male     Yes          0  Not Graduate   
497         497  LP002588    Male     Yes          0      Graduate   
498         498  LP002600    Male     Yes          1      Graduate   
499         499  LP002602    Male      No          0      Graduate   

    Self_Employed  ApplicantIncome  CoapplicantIncome  LoanAmount  \
0              No             5849                0.0         NaN   
1              No    

In [3]:


# Initialize and train a Huber regressor, a linear model with a loss that is
# robust to outliers (current best = Huber, r2 = 0.5944)
model = linear_model.HuberRegressor()
# fit() expects a 1-D target. Flatten y_train so that a single-column
# DataFrame / column vector does not trigger a DataConversionWarning.
model.fit(X_train, np.ravel(y_train))

# Make predictions on the test set
y_pred = model.predict(X_test)
print(y_pred)

# Evaluate the model's performance on the held-out test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)



[ 89.35879088 146.4058089  157.80590994 137.6012465  103.51027311
 114.20001419 133.59407665 123.61720532 156.74704554 120.91432342
 180.17463688 536.22976724 131.31174049  93.52933796 101.94688238
 127.30315605 137.7433716  172.18480515 160.17842918 117.55932793
  95.44188454  92.79657925 115.90500749 179.54665756 139.9907512
 180.29743402  93.23980183 182.91934609 123.61775759 124.33876215
 109.19393855 114.05479696 120.37931404 117.57090361  99.27808169
 135.92999196 227.64169862 101.48305096  96.51443508 149.04224104
 103.5081416  110.23693018 113.56850041 220.73081862 121.10304445
 132.56359438 189.71299888 134.1778543  121.73947994 134.3699721
 124.44197139 110.66265045 135.40323628  92.33833147 133.34663499
 169.74210616 126.16410787  98.5529342  108.49977516 127.74000361
 168.05001696 138.50764391 174.53780864 148.31712771  89.6953979
 161.13213668 122.48183262 230.82679147 125.33758492]
Mean Squared Error: 2097.573028357564
R-squared: 0.5944538978621194


  y = column_or_1d(y, warn=True)


In [2]:
# Serialize the model to an onnx file
!pip install skl2onnx
import skl2onnx
onx = skl2onnx.to_onnx(model, X=X_train)
skl2onnx.helpers.onnx_helper.save_onnx_model(onx, './model.onnx')

Collecting skl2onnx
  Downloading skl2onnx-1.15.0-py2.py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.7/294.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnx>=1.2.1 (from skl2onnx)
  Downloading onnx-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
Collecting onnxconverter-common>=1.7.0 (from skl2onnx)
  Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.5/84.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf>=3.20.2 (from onnx>=1.2.1->skl2onnx)
  Downloading protobuf-3.20.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
Installing collect

b'\x08\x08\x12\x08skl2onnx\x1a\x061.15.0"\x07ai.onnx(\x002\x00:\xd9\x02\n\'\n\x01X\n\x04coef\x12\nmultiplied\x1a\x06MatMul"\x06MatMul:\x00\n)\n\nmultiplied\n\tintercept\x12\x04resh\x1a\x03Add"\x03Add:\x00\n2\n\x04resh\n\x0cshape_tensor\x12\x08variable\x1a\x07Reshape"\x07Reshape:\x00\x12\x16ONNX(LinearRegression)*N\x08\x08\x08\x01\x10\x0bB\x04coefR@\x19opI%r0@\x0c;\xdcj~\x95\x19@\x1ebI\xfc\x7f\t/\xc0\xec\x9a\xfcl \x8a\xf9\xbf\x00d\xf58aaG@\x1as\x95\xbe\xae\x0f(@&\xec\xfa\x0fj\xc2\x14\xc03\xb76\xa9?O\x06\xc0*\x19\x08\x01\x10\x0bB\tinterceptR\x08\x98<\x03\x109}_@*\x1f\x08\x02\x10\x07:\x0b\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01\x01B\x0cshape_tensorZ\x11\n\x01X\x12\x0c\n\n\x08\x0b\x12\x06\n\x00\n\x02\x08\x08b\x18\n\x08variable\x12\x0c\n\n\x08\x0b\x12\x06\n\x00\n\x02\x08\x01B\x04\n\x00\x10\r'