# Libraries 

In [2]:
import pandas as pd
import numpy as np
# Data preprocessing libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Models libraries
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
# Evaluation libraries
from sklearn.metrics import classification_report, accuracy_score

# Credit card data analysis

In [4]:
df = pd.read_csv("credit_card_train.csv")
df.head()

Unnamed: 0,Num_Children,Gender,Income,Own_Car,Own_Housing,Credit_Card_Issuing
0,1,Male,40690,No,Yes,Denied
1,2,Female,75469,Yes,No,Denied
2,1,Male,70497,Yes,Yes,Approved
3,1,Male,61000,No,No,Denied
4,1,Male,56666,Yes,Yes,Denied


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Num_Children         400000 non-null  int64 
 1   Gender               400000 non-null  object
 2   Income               400000 non-null  int64 
 3   Own_Car              400000 non-null  object
 4   Own_Housing          400000 non-null  object
 5   Credit_Card_Issuing  400000 non-null  object
dtypes: int64(2), object(4)
memory usage: 18.3+ MB


In [6]:
df.describe()

Unnamed: 0,Num_Children,Income
count,400000.0,400000.0
mean,2.000892,72517.9975
std,1.410704,22955.502862
min,0.0,30000.0
25%,1.0,53336.0
50%,2.0,72077.0
75%,3.0,90669.0
max,11.0,119999.0


In [7]:
df.shape

(400000, 6)

In [8]:
df.nunique()

Num_Children              12
Gender                     2
Income                 87525
Own_Car                    2
Own_Housing                2
Credit_Card_Issuing        2
dtype: int64

In [9]:
gender_counts = df['Gender'].value_counts()
gender_counts

Gender
Male      200295
Female    199705
Name: count, dtype: int64

# Data Preprocessing 

In [11]:
categ_cols = ['Gender', 'Own_Car', 'Own_Housing']

preprocessor = ColumnTransformer(transformers=[('df', OneHotEncoder(), categ_cols)], remainder='passthrough')


In [12]:
X = df.drop(columns=['Credit_Card_Issuing'])
y = df['Credit_Card_Issuing'] # our target 0: denied, 1: approved
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_encoded

array([1, 1, 0, ..., 1, 0, 1])

# Data Splitting

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y_encoded, test_size=0.2, random_state=42)

In [16]:
X_test.head(6)

Unnamed: 0,Num_Children,Gender,Income,Own_Car,Own_Housing
23218,4,Male,73649,No,No
20731,0,Female,44380,No,No
39555,0,Female,42337,Yes,No
147506,1,Male,47694,No,No
314215,2,Female,60443,No,No
190913,2,Male,92531,Yes,No


# Model Training

In [18]:
rfc = RandomForestClassifier()
# svm = SVC(kernel='rbf', C=1.0, probability=True)
log_reg = LogisticRegression()
xgb = XGBClassifier(use_label_encoder=False)                       

In [19]:
import joblib

In [20]:
def train_and_evaluate(model_name, model_instance):
    # the pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('standardization', StandardScaler()),
        ('ClassifierModel', model_instance)
    ])
    
    print("Training ",model_name)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Save the trained pipeline to a pkl file (so i can use it with apis)
    pipeline_filename = f'{model_name.lower().replace(" ", "_")}_pipeline.pkl'
    joblib.dump(model, pipeline_filename)
    print(f"Trained pipeline saved as {pipeline_filename}")
    
    y_pred = model.predict(X_test)

    # Convert the predictions back to 'Approved' and 'Denied' if needed
    # y_pred_labels = label_encoder.inverse_transform(y_pred)
    
    # Evaluate performance on the entire test set
    accuracy = accuracy_score(y_test, y_pred)
    print("\nOverall Model Performance:")
    print(f"Accuracy for {model_name}: {accuracy * 100:.2f}%")
    
    # Evaluate the model's performance
    print("Classification Report:")
    print(classification_report(y_test, y_pred))  # Using numeric labels
    print("Predicted labels:", y_pred[:10]) 

    # Check classification reports for males and females (Bias/Fairness Evaluation)
    male_indices = (X_test['Gender'] == 'Male').values  # Adjust as per your data format
    female_indices = (X_test['Gender'] == 'Female').values

    y_pred_male = model.predict(X_test[male_indices])
    y_true_male = y_test[male_indices]
    y_pred_female = model.predict(X_test[female_indices])
    y_true_female = y_test[female_indices]

    print("\nBias/Fairness Evaluation:")
    print(f"Male Classification Report for {model_name}:")
    print(classification_report(y_true_male, y_pred_male))
    print(f"Female Classification Report for {model_name}:")
    print(classification_report(y_true_female, y_pred_female))

    # Compare training and test performance (Variance)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print("\nVariance Check:")
    print(f"Training Accuracy for {model_name}: {train_accuracy * 100:.2f}%")
    print(f"Test Accuracy for {model_name}: {accuracy * 100:.2f}%")

    # Get feature importances from the model (if supported by model used)
    print("\nFeature importances:")
    if hasattr(model.named_steps['ClassifierModel'], 'feature_importances_'):
        importances = model.named_steps['ClassifierModel'].feature_importances_
        feature_names = preprocessor.get_feature_names_out()
        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values(by='Importance', ascending=False)
        print(feature_importance_df)
    else:
        print(f"{model_name} does not support feature importances.")

In [21]:
train_and_evaluate("Random Forests", rfc)

Training  Random Forests
Trained pipeline saved as random_forests_pipeline.pkl

Overall Model Performance:
Accuracy for Random Forests: 96.47%
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     30931
           1       0.97      0.97      0.97     49069

    accuracy                           0.96     80000
   macro avg       0.96      0.96      0.96     80000
weighted avg       0.96      0.96      0.96     80000

Predicted labels: [1 1 1 1 1 0 0 0 1 0]

Bias/Fairness Evaluation:
Male Classification Report for Random Forests:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     22910
           1       0.96      0.96      0.96     17045

    accuracy                           0.97     39955
   macro avg       0.97      0.97      0.97     39955
weighted avg       0.97      0.97      0.97     39955

Female Classification Report for Random Forests:
              pre

In [22]:
# train_and_evaluate("Support Vector Machine", svm)

In [23]:
train_and_evaluate("Logistic Regression", log_reg)

Training  Logistic Regression
Trained pipeline saved as logistic_regression_pipeline.pkl

Overall Model Performance:
Accuracy for Logistic Regression: 97.25%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     30931
           1       0.98      0.98      0.98     49069

    accuracy                           0.97     80000
   macro avg       0.97      0.97      0.97     80000
weighted avg       0.97      0.97      0.97     80000

Predicted labels: [1 1 1 1 1 0 0 0 1 0]

Bias/Fairness Evaluation:
Male Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     22910
           1       0.97      0.97      0.97     17045

    accuracy                           0.97     39955
   macro avg       0.97      0.97      0.97     39955
weighted avg       0.97      0.97      0.97     39955

Female Classification Report for Logistic Regr

In [24]:
train_and_evaluate("XGBoost", xgb)

Training  XGBoost


Parameters: { "use_label_encoder" } are not used.



Trained pipeline saved as xgboost_pipeline.pkl

Overall Model Performance:
Accuracy for XGBoost: 97.24%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     30931
           1       0.98      0.98      0.98     49069

    accuracy                           0.97     80000
   macro avg       0.97      0.97      0.97     80000
weighted avg       0.97      0.97      0.97     80000

Predicted labels: [1 1 1 1 1 0 0 0 1 0]

Bias/Fairness Evaluation:
Male Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     22910
           1       0.97      0.97      0.97     17045

    accuracy                           0.97     39955
   macro avg       0.97      0.97      0.97     39955
weighted avg       0.97      0.97      0.97     39955

Female Classification Report for XGBoost:
              precision    recall  f1-score   support

           0   

## Specify how you considered each factor while training the model?

1. Performance: ensure the model accurately predicts whether a credit card application is approved or denied.

Implementation:

* Used metrics such as accuracy, precision, recall, and F1-score to evaluate the model’s performance on the test set.
* Used standardization to ensure that all numerical features have the same scale, which can significantly improve model performance and training stability. Here’s why standardization is important for your task:

2. Bias: avoid systematic errors or unfair outcomes for specific groups in the dataset (e.g., gender or income level).

Implementation:

* Conducted fairness evaluation by calculating separate classification reports for different demographic groups (e.g., male vs. female applicants).
* Checked for significant differences in metrics (e.g., precision, recall) between groups to identify potential biases.

3. Variance: ensure the model generalizes well and avoids overfitting or underfitting.

Implementation:

* Evaluated the training vs. test accuracy to identify high variance (overfitting) or high bias (underfitting).
* Used cross-validation to ensure the model performs consistently across different data splits.
* Applied regularization in models like Logistic Regression and set a limit on tree depth in models like Random Forest to prevent overfitting.

4. Fairness: ensure the model makes fair predictions across different demographic groups (e.g., gender or income levels).

Implementation:

* After training, evaluated the model separately for male and female applicants using metrics like accuracy, precision, and recall.
* Compared false-positive and false-negative rates for different groups to detect any disparities.

5. Model Interpretability:
make the model’s decision-making process transparent and understandable.

Implementation:

* For interpretable models like Logistic Regression, inspected feature coefficients to understand which features contribute most to the predictions.
* For models like Random Forest, extracted feature importances to determine the most influential factors in decision-making.
* Documented how preprocessing steps (e.g., one-hot encoding) and feature transformations affect model predictions.

## Chosen model based on my analysis

- Accuracy:

Logistic Regression and XGBoost have the highest test accuracy (97.25% and 97.24%, respectively) with Random Forest slightly behind at 96.47%.

- Variance:

Logistic Regression shows minimal variance between training and test accuracies (97.31% vs. 97.25%) that means it generalizes well.
XGBoost also has low variance (97.39% vs. 97.24%).
Random Forest shows a higher gap between training (99.77%) and test accuracy (96.47%) indicating slightly overfitting.

- Bias (Gender Fairness):

All models show similar performance for male and female groups but Logistic Regression and XGBoost have slightly better fairness scores compared to Random Forest.

- Overfitting:

Random Forest has a higher risk of overfitting due to its very high training accuracy compared to test accuracy.

**Best Model: Logistic Regression**
It has the highest test accuracy, minimal variance, excellent fairness, and high interpretability. It is also straightforward to implement and understand which with the task's focus on performance, fairness, and interpretability.

# Hyperparameter tuning the model

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

# Parameter grid 
param_grid = {
    'Logistic Regression__penalty': ['l1', 'l2', 'elasticnet'],
    'Logistic Regression__C': [0.01, 0.1, 1, 10, 100],  # regularization strength
    'Logistic Regression__solver': ['liblinear', 'saga'],  
    'Logistic Regression__max_iter': [100, 200, 500]
}

# GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(Pipeline([('preprocessor', preprocessor),('scaler', StandardScaler()),('Logistic Regression', LogisticRegression())]),
                           param_grid, cv=5, scoring='accuracy',verbose=1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Output the best parameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Train the best model on the full training set
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate on the test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy of Best Logistic Regression Model:", test_accuracy)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Users\yara.maraey\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Users\yara.maraey\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Users\yara.maraey\anaconda\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "D:\Users\yara.maraey\anaconda\Lib\site-package

Best Hyperparameters: {'Logistic Regression__C': 0.1, 'Logistic Regression__max_iter': 100, 'Logistic Regression__penalty': 'l1', 'Logistic Regression__solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.9731
Test Accuracy of Best Logistic Regression Model: 0.97255


In [30]:
coefficients = best_model.named_steps['Logistic Regression'].coef_[0]
feature_names = preprocessor.get_feature_names_out()
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Absolute Importance': np.abs(coefficients)
})

feature_importance_df = feature_importance_df.sort_values(by='Absolute Importance', ascending=False)
print(feature_importance_df)

                   Feature  Coefficient  Absolute Importance
7        remainder__Income   -17.637697            17.637697
0        df__Gender_Female     2.370470             2.370470
5      df__Own_Housing_Yes    -1.689537             1.689537
1          df__Gender_Male    -1.479290             1.479290
4       df__Own_Housing_No     0.956643             0.956643
3          df__Own_Car_Yes    -0.606268             0.606268
2           df__Own_Car_No     0.463366             0.463366
6  remainder__Num_Children     0.009551             0.009551


df__Gender_Female (2.71) is an indication that the model might be favoring females potentially introducing a gender bias. However performing better than the other 2 models

In [32]:
log_reg_tuned = LogisticRegression(C=0.1, penalty="l1", solver="liblinear")

In [33]:
train_and_evaluate("Logistic Regression Tuned", log_reg)

Training  Logistic Regression Tuned
Trained pipeline saved as logistic_regression_tuned_pipeline.pkl

Overall Model Performance:
Accuracy for Logistic Regression Tuned: 97.25%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     30931
           1       0.98      0.98      0.98     49069

    accuracy                           0.97     80000
   macro avg       0.97      0.97      0.97     80000
weighted avg       0.97      0.97      0.97     80000

Predicted labels: [1 1 1 1 1 0 0 0 1 0]

Bias/Fairness Evaluation:
Male Classification Report for Logistic Regression Tuned:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     22910
           1       0.97      0.97      0.97     17045

    accuracy                           0.97     39955
   macro avg       0.97      0.97      0.97     39955
weighted avg       0.97      0.97      0.97     39955

Female Classification 

In [34]:
# test pkl file
pipeline = joblib.load('logistic_regression_tuned_pipeline.pkl')
new_data = pd.DataFrame({
    'Num_Children': [2],
    'Gender': ['Male'],
    'Income': [92531],
    'Own_Car': ['No'],
    'Own_Housing': ['No']
})

predictions = pipeline.predict(new_data)
print("Predictions:", predictions)

Predictions: [0]
