In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("data/output.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464478 entries, 0 to 464477
Data columns (total 35 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   City                464478 non-null  object 
 1   State               464478 non-null  object 
 2   BankState           464478 non-null  object 
 3   ApprovalDate        464478 non-null  object 
 4   ApprovalFY          464478 non-null  int64  
 5   Term                464478 non-null  int64  
 6   NoEmp               464478 non-null  int64  
 7   CreateJob           464478 non-null  int64  
 8   RetainedJob         464478 non-null  int64  
 9   FranchiseCode       464478 non-null  int64  
 10  UrbanRural          464478 non-null  int64  
 11  RevLineCr           464478 non-null  int64  
 12  LowDoc              464478 non-null  int64  
 13  ChgOffDate          101572 non-null  object 
 14  DisbursementDate    464478 non-null  object 
 15  DisbursementGross   464478 non-nul

In [4]:
df.drop(columns=['ChgOffDate'], inplace=True)

In [5]:
#Encode Data
encoder = OrdinalEncoder()

categorical_cols = ["City", "State", "BankState", "ApprovalDate", "DisbursementDate", "Industry", "Active"]
df[categorical_cols] = encoder.fit_transform(df[categorical_cols])

# Print the updated dataset
df.head()

Unnamed: 0,City,State,BankState,ApprovalDate,ApprovalFY,Term,NoEmp,CreateJob,RetainedJob,FranchiseCode,...,Industry,RealEstate,DaysTerm,Active,Recession,DaysToDisbursement,StateSame,SBA_AppvPct,AppvDisbursed,Is_Existing
0,6568.0,16.0,37.0,2765.0,1997,84,4,0,0,1,...,16.0,0,2520,4785.0,0,730,0,0.8,1,1
1,15225.0,16.0,17.0,2765.0,1997,60,2,0,0,1,...,0.0,0,1800,3444.0,0,92,1,0.8,1,1
2,1575.0,16.0,17.0,2765.0,1997,180,7,0,0,1,...,7.0,0,5400,7240.0,0,306,1,0.75,1,0
3,17347.0,7.0,9.0,2765.0,1997,120,19,0,0,1,...,9.0,0,3600,5256.0,0,122,0,0.75,1,0
4,21143.0,10.0,1.0,2765.0,1997,84,1,0,0,1,...,12.0,0,2520,4542.0,0,487,0,0.8,1,1


In [10]:
#Data Scaling
# create a MinMaxScaler object
scaler = MinMaxScaler()

#Normalize the dataset
normalized_dataset = scaler.fit_transform(df)

#convert the normalized numpy array back to a dataframe
df = pd.DataFrame(normalized_dataset, columns=df.columns)

#Print the updated dataset
df.head()

Unnamed: 0,City,State,BankState,ApprovalDate,ApprovalFY,Term,NoEmp,CreateJob,RetainedJob,FranchiseCode,...,Industry,RealEstate,DaysTerm,Active,Recession,DaysToDisbursement,StateSame,SBA_AppvPct,AppvDisbursed,Is_Existing
0,0.258644,0.313725,0.698113,0.368127,0.604651,0.159393,0.0004,0.0,0.0,1e-05,...,0.842105,0.0,0.159393,0.351735,0.0,0.340813,0.0,0.789474,1.0,1.0
1,0.599551,0.313725,0.320755,0.368127,0.604651,0.113852,0.0002,0.0,0.0,1e-05,...,0.0,0.0,0.113852,0.253161,0.0,0.290758,1.0,0.789474,1.0,1.0
2,0.062023,0.313725,0.320755,0.368127,0.604651,0.341556,0.0007,0.0,0.0,1e-05,...,0.368421,0.0,0.341556,0.532196,0.0,0.307547,1.0,0.736842,1.0,0.0
3,0.683114,0.137255,0.169811,0.368127,0.604651,0.227704,0.0019,0.0,0.0,1e-05,...,0.473684,0.0,0.227704,0.386357,0.0,0.293112,0.0,0.736842,1.0,0.0
4,0.832598,0.196078,0.018868,0.368127,0.604651,0.159393,0.0001,0.0,0.0,1e-05,...,0.631579,0.0,0.159393,0.333872,0.0,0.321748,0.0,0.789474,1.0,1.0


In [15]:
#Split Dataset
X = df.drop(columns=['SBA_Appv'])
y = df["SBA_Appv"]

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Split the train set further into train and validation sets (70% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("Train set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)

validate_rmse_score = []
test_rmse_score = []

Train set shape: (297265, 34)
Validation set shape: (74317, 34)
Test set shape: (92896, 34)


In [21]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error

# Create an instance of DecisionTreeRegressor
#TODO: Parameter, matrix, sample to split branch
#Tunning: 3 type: Grid Search /Permutation, Random Search, Biased Search (select range)
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = dt_reg.predict(X_val)
val_rmse = root_mean_squared_error(y_val, y_val_pred)
print("Validation RMSE:", val_rmse)
validate_rmse_score.append(val_rmse)

# Make predictions on the test set
y_test_pred = dt_reg.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_test_pred)
print("Test RMSE:", test_rmse)
test_rmse_score.append(test_rmse)

Validation RMSE: 0.00011889633758471743
Test RMSE: 0.00017424502575150651


In [17]:
linreg_tc = 5

# Perform cross-validation
linreg_cv = cross_val_score(LinearRegression(), X_train_val, y_train_val, cv=linreg_tc, scoring='neg_root_mean_squared_error')
score_val = np.mean(linreg_cv)  # Average RMSE across cross-validation folds

linreg = LinearRegression().fit(X_train_val, y_train_val)

y_pred_test = linreg.predict(X_test)

score_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("Validation RMSE:", -score_val) 
print("Test RMSE:", score_test)

NameError: name 'X_train_val' is not defined