# Logistic Regression Model

## Phase 1: Data Loading, Merging, and Cleaning

In [1]:
import pandas as pd

# Load the accepted and rejected application tables. (The folder name
# "modelde_kullanilacak" is Turkish for "to be used in the model".)
df_accepted_model = pd.read_csv('../../data/model_data/modelde_kullanilacak/accepted_approval_model.csv')
df_rejected_model = pd.read_csv('../../data/model_data/modelde_kullanilacak/rejected_approval_model.csv')

In [None]:
# Preview the first rows of the rejected-applications table.
df_rejected_model.head()

In [None]:
# Preview the first rows of the accepted-applications table.
df_accepted_model.head()

In [None]:
# Print column names, dtypes and memory usage of the rejected-applications table.
df_rejected_model.info()

In [None]:
# Print column names, dtypes and memory usage of the accepted-applications table.
df_accepted_model.info()

In [2]:
# (rows, columns) of the rejected-applications table.
df_rejected_model.shape

(6330000, 11)

In [3]:
# (rows, columns) of the accepted-applications table.
df_accepted_model.shape

(1809000, 11)

In [4]:
# Convert Application_Date to datetime64 so its parts can be extracted later.
df_rejected_model['Application_Date'] = pd.to_datetime(df_rejected_model['Application_Date'])
# info() prints its own report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df_rejected_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6330000 entries, 0 to 6329999
Data columns (total 11 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Amount_Requested   float64       
 1   Application_Date   datetime64[ns]
 2   DTI                float64       
 3   State              object        
 4   Employment_Length  int64         
 5   inq_last_6mths     float64       
 6   delinq_2yrs        float64       
 7   home_ownership     object        
 8   annual_inc         float64       
 9   Risk_Score         float64       
 10  Approval_Status    int64         
dtypes: datetime64[ns](1), float64(6), int64(2), object(2)
memory usage: 531.2+ MB


None

In [None]:
# Column dtypes of the accepted-applications table at this point in the notebook.
df_accepted_model.info()

In [5]:
# Convert Application_Date to datetime64 so its parts can be extracted later.
df_accepted_model['Application_Date'] = pd.to_datetime(df_accepted_model['Application_Date'])
# info() prints its own report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df_accepted_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1809000 entries, 0 to 1808999
Data columns (total 11 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Amount_Requested   float64       
 1   Application_Date   datetime64[ns]
 2   DTI                float64       
 3   State              object        
 4   Employment_Length  int64         
 5   inq_last_6mths     float64       
 6   delinq_2yrs        float64       
 7   home_ownership     object        
 8   annual_inc         float64       
 9   Risk_Score         float64       
 10  Approval_Status    int64         
dtypes: datetime64[ns](1), float64(6), int64(2), object(2)
memory usage: 151.8+ MB


None

In [6]:
# Stack accepted and rejected applications into one frame; ignore_index=True
# gives the result a fresh 0..n-1 index instead of repeating the source indices.
df_combined_model = pd.concat([df_accepted_model, df_rejected_model], ignore_index=True)
display(df_combined_model.shape)
# info() prints its own report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df_combined_model.info()

(8139000, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8139000 entries, 0 to 8138999
Data columns (total 11 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Amount_Requested   float64       
 1   Application_Date   datetime64[ns]
 2   DTI                float64       
 3   State              object        
 4   Employment_Length  int64         
 5   inq_last_6mths     float64       
 6   delinq_2yrs        float64       
 7   home_ownership     object        
 8   annual_inc         float64       
 9   Risk_Score         float64       
 10  Approval_Status    int64         
dtypes: datetime64[ns](1), float64(6), int64(2), object(2)
memory usage: 683.1+ MB


None

In [None]:
# Preview the first rows of the combined table.
df_combined_model.head()

In [None]:
# (rows, columns) of the combined table.
df_combined_model.shape

In [None]:
#df_combined_model.to_csv('full_model_data.csv', index=False)
#print("DataFrame exported to 'full_model_data.csv' successfully.")

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Missing_flag + Median Fill = The Most Accurate Strategy

- missing_flag → Enables the model to correctly interpret missing information

- median imputation → Keeps numerical features stable and meaningful

- scaling → Ensures all numerical columns remain balanced


In [7]:
# A value of -1 in these columns stands for "missing". Record where it occurs
# in a 0/1 `<column>_missing` indicator before the value itself is imputed.
for source_col in ('inq_last_6mths', 'delinq_2yrs', 'annual_inc'):
    df_combined_model[f'{source_col}_missing'] = df_combined_model[source_col].eq(-1).astype(int)

In [8]:
import numpy as np

# Treat the -1 placeholder as missing and impute it with the median of the
# observed values. The result is assigned back to the column on purpose:
# calling fillna(..., inplace=True) on df[col] is chained assignment, which
# raises a FutureWarning on recent pandas and leaves the frame unchanged under
# Copy-on-Write (the default from pandas 3.0).
# NOTE(review): the median is taken over the full dataset, before the
# train/test split further down, so test rows influence the imputed value.
# Consider fitting the imputation on the training split only.
for col in ('inq_last_6mths', 'delinq_2yrs', 'annual_inc'):
    observed = df_combined_model[col].replace(-1, np.nan)
    df_combined_model[col] = observed.fillna(observed.median())


## Encoding Categorical Columns (One-Hot or Ordinal)

In [9]:
# Split the application date into separate year / month / day-of-month columns.
# pd.to_datetime leaves an already-datetime64 column unchanged.
application_dates = pd.to_datetime(df_combined_model['Application_Date'])
df_combined_model['Application_Date'] = application_dates
for date_part in ('year', 'month', 'day'):
    df_combined_model[f'app_{date_part}'] = getattr(application_dates.dt, date_part)

In [10]:
# The raw timestamp is no longer needed once its parts have been extracted.
df_combined_model = df_combined_model.drop(columns='Application_Date')

In [11]:
# One-hot encode the categorical features. drop_first=True omits the first
# level of each feature, leaving k-1 indicator columns for k levels.
categorical_cols = ['State', 'home_ownership']
df_combined_model = pd.get_dummies(df_combined_model, columns=categorical_cols, drop_first=True)

In [12]:
# Columns excluded from the modelling frame. errors='ignore' makes drop() skip
# any listed name that is not present instead of raising a KeyError.
# NOTE(review): this also removes the *_missing indicator columns created
# earlier in the notebook, so the model only sees the median-imputed values —
# confirm that is intended.
cols_to_drop = [
    'annual_inc_missing', 'delinq_2yrs_missing', 'inq_last_6mths_missing',  # missing-value flags
    'home_ownership_UNKNOWN',
    'app_day',
    'Risk_Score',
    'home_ownership_MORTGAGE',
    'Employment_Length',
]
df_combined_model = df_combined_model.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Preview the frame after encoding and column removal.
df_combined_model.head()

## Train/Test Split

In [13]:
# Target: the approval label. Features: every remaining column except the
# application year/month parts, which are left out of the model inputs.
y = df_combined_model['Approval_Status']
X = df_combined_model.drop(columns=['Approval_Status', 'app_year', 'app_month'])

In [14]:
# Inspect the feature matrix.
X

Unnamed: 0,Amount_Requested,DTI,inq_last_6mths,delinq_2yrs,annual_inc,State_AL,State_AR,State_AZ,State_CA,State_CO,...,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT
0,3600.0,5.91,1.0,0.0,55000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24700.0,16.06,4.0,1.0,65000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20000.0,10.78,0.0,0.0,63000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,35000.0,17.06,0.0,0.0,110000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10400.0,25.37,3.0,1.0,104433.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8138995,35000.0,0.17,0.0,0.0,65000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8138996,7000.0,0.02,0.0,0.0,65000.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8138997,35000.0,0.47,0.0,0.0,65000.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8138998,2500.0,0.22,0.0,0.0,65000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [16]:
# Show every row of the ranking instead of a truncated view.
pd.set_option('display.max_rows', None)

# Pearson correlation of every column with the target, largest first.
# Building the correlation matrix is expensive on a frame this size, so it is
# computed once and reused for both the printed and the rendered copy.
approval_corr = df_combined_model.corr()['Approval_Status'].sort_values(ascending=False)
print(approval_corr)
approval_corr

# Optional: export the ranking as a two-column CSV.
#corr_df = approval_corr.reset_index()
#corr_df.columns = ['Feature', 'Correlation_with_Approval_Status']
#corr_df.to_csv('Approval_Status_Correlations.csv', index=False)

Approval_Status         1.000000
home_ownership_RENT     0.580048
inq_last_6mths          0.493116
home_ownership_OWN      0.299250
delinq_2yrs             0.298982
annual_inc              0.096024
Amount_Requested        0.053896
DTI                     0.037947
app_month               0.030812
State_CA                0.025543
State_MN                0.015686
State_CO                0.013822
State_NY                0.013200
State_CT                0.011215
State_NJ                0.011215
State_WA                0.009369
State_NV                0.008500
home_ownership_OTHER    0.007647
State_MA                0.007451
State_IL                0.007000
State_WV                0.006952
State_OR                0.006619
State_MD                0.006508
State_AZ                0.006177
State_NH                0.003868
State_DC                0.003007
State_VA                0.003000
State_VT                0.002817
State_WY                0.002696
home_ownership_NONE     0.002271
State_RI  

Approval_Status         1.000000
home_ownership_RENT     0.580048
inq_last_6mths          0.493116
home_ownership_OWN      0.299250
delinq_2yrs             0.298982
annual_inc              0.096024
Amount_Requested        0.053896
DTI                     0.037947
app_month               0.030812
State_CA                0.025543
State_MN                0.015686
State_CO                0.013822
State_NY                0.013200
State_CT                0.011215
State_NJ                0.011215
State_WA                0.009369
State_NV                0.008500
home_ownership_OTHER    0.007647
State_MA                0.007451
State_IL                0.007000
State_WV                0.006952
State_OR                0.006619
State_MD                0.006508
State_AZ                0.006177
State_NH                0.003868
State_DC                0.003007
State_VA                0.003000
State_VT                0.002817
State_WY                0.002696
home_ownership_NONE     0.002271
State_RI  

✔️ Why is stratify important?

Rejected vs. Accepted classes may be imbalanced.
Using stratify = y ensures that both train and test sets contain the same proportion of accepted/rejected cases.

Otherwise:

If the model sees no “accepted” cases in training → it cannot learn that class and will predict it poorly

AUC/Accuracy may become misleading

Train/Test distributions will differ ‚Üí resulting in an inconsistent and unreliable model

-----------

## Scaling

⚠️ Scaling is applied only to numerical columns

Scaling one-hot encoded columns is incorrect.

Therefore:

We will identify the numerical columns

Apply the scaler only to these columns

Missing_flag columns will not be scaled

One-hot encoded columns will not be scaled

--------------------

❌ If scaling is done before the train/test split → this causes data leakage.

✔️ Scaling must be done after the train/test split → this is the correct approach.

🔥 Why don’t we scale before splitting the data?

If scaling is applied before the train/test split:

The scaler sees the mean and standard deviation of the entire dataset (train + test).

Information from the test set leaks into the training process ‚Üí the model indirectly knows the test statistics.

🔴 This is called data leakage, and it artificially inflates the measured model performance.
In real-world deployment, the model will then perform worse than those metrics suggest.

In [None]:
from sklearn.preprocessing import StandardScaler

# Only int64/float64 columns are standardised; other dtypes are left as they are.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Learn mean and standard deviation from the training rows only, then apply
# that same transformation to both splits so no test statistics leak in.
scaler = StandardScaler().fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# class_weight='balanced' weights each class inversely to its frequency in
# y_train; max_iter is raised above the library default of 100 to give the
# solver more room to converge.
log_reg = LogisticRegression(class_weight='balanced', max_iter=500)

In [None]:
# Train the classifier on the training split.
log_reg.fit(X_train, y_train)

In [None]:
# Probability of log_reg.classes_[1] (second column of predict_proba), used by
# ROC-AUC, and the hard class labels used by the threshold-based metrics.
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
y_pred = log_reg.predict(X_test)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)

# Evaluate on the held-out split. ROC-AUC uses the predicted probabilities;
# the other metrics use the hard labels.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)  # return as a dict

# One row per headline metric.
metrics_df = pd.DataFrame(
    [('Accuracy', accuracy), ('F1-Score', f1), ('ROC-AUC', roc_auc)],
    columns=['Metric', 'Value'],
)

# One row per class / average, with the row label exposed as a "Class" column.
class_report_df = (
    pd.DataFrame(class_report)
    .transpose()
    .reset_index()
    .rename(columns={'index': 'Class'})
)

# Write both tables to the current working directory.
metrics_df.to_csv('model_metrics.csv', index=False)
class_report_df.to_csv('classification_report.csv', index=False)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Print the headline metrics, then the confusion matrix and the per-class report.
for label, value in [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("F1-Score:", f1_score(y_test, y_pred)),
    ("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba)),
]:
    print(label, value)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))