# Libraries 

In [9]:
import pandas as pd
import numpy as np
# Data preprocessing libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Models libraries
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
# Evaluation libraries
from sklearn.metrics import classification_report, accuracy_score

# Credit card data analysis

In [17]:
df = pd.read_csv("credit_card_train.csv")
df.head()

Unnamed: 0,Num_Children,Gender,Income,Own_Car,Own_Housing,Credit_Card_Issuing
0,1,Male,40690,No,Yes,Denied
1,2,Female,75469,Yes,No,Denied
2,1,Male,70497,Yes,Yes,Approved
3,1,Male,61000,No,No,Denied
4,1,Male,56666,Yes,Yes,Denied


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Num_Children         400000 non-null  int64 
 1   Gender               400000 non-null  object
 2   Income               400000 non-null  int64 
 3   Own_Car              400000 non-null  object
 4   Own_Housing          400000 non-null  object
 5   Credit_Card_Issuing  400000 non-null  object
dtypes: int64(2), object(4)
memory usage: 18.3+ MB


In [19]:
df.describe()

Unnamed: 0,Num_Children,Income
count,400000.0,400000.0
mean,2.000892,72517.9975
std,1.410704,22955.502862
min,0.0,30000.0
25%,1.0,53336.0
50%,2.0,72077.0
75%,3.0,90669.0
max,11.0,119999.0


In [22]:
df.shape

(400000, 6)

In [25]:
df.nunique()

Num_Children              12
Gender                     2
Income                 87525
Own_Car                    2
Own_Housing                2
Credit_Card_Issuing        2
dtype: int64

# Data Preprocessing 

In [28]:
categ_cols = ['Gender', 'Own_Car', 'Own_Housing']

preprocessor = ColumnTransformer(transformers=[('df', OneHotEncoder(), categ_cols)], remainder='passthrough')


In [30]:
X = df.drop(columns=['Credit_Card_Issuing'])
y = df['Credit_Card_Issuing'] # our target 0: denied, 1: approved
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_encoded


array([1, 1, 0, ..., 1, 0, 1])

# Data Splitting

In [33]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y_encoded, test_size=0.2, random_state=42)

In [64]:
X_test.head(6)

Unnamed: 0,Num_Children,Gender,Income,Own_Car,Own_Housing
23218,4,Male,73649,No,No
20731,0,Female,44380,No,No
39555,0,Female,42337,Yes,No
147506,1,Male,47694,No,No
314215,2,Female,60443,No,No
190913,2,Male,92531,Yes,No


# Model Training

In [38]:
rfc = RandomForestClassifier()
# svm = SVC(kernel='rbf', C=1.0, probability=True)
log_reg = LogisticRegression()
xgb = XGBClassifier(use_label_encoder=False)                       

In [40]:
import joblib

In [46]:
def train_and_evaluate(model_name, model_instance):
    # the pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('standardization', StandardScaler()),
        ('ClassifierModel', model_instance)
    ])
    
    print("Training ",model_name)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Save the trained pipeline to a file
    pipeline_filename = f'{model_name.lower().replace(" ", "_")}_pipeline.pkl'
    joblib.dump(model, pipeline_filename)
    print(f"Trained pipeline saved as {pipeline_filename}")
    
    y_pred = model.predict(X_test)

    # Convert the predictions back to 'Approved' and 'Denied' if needed
    # y_pred_labels = label_encoder.inverse_transform(y_pred)
    
    # Evaluate performance on the entire test set
    accuracy = accuracy_score(y_test, y_pred)
    print("\nOverall Model Performance:")
    print(f"Accuracy for {model_name}: {accuracy * 100:.2f}%")
    
    # Evaluate the model's performance
    print("Classification Report:")
    print(classification_report(y_test, y_pred))  # Using numeric labels
    print("Predicted labels:", y_pred[:10]) 

    # Bias/Fairness Evaluation: Check classification reports for males and females
    male_indices = (X_test['Gender'] == 'Male').values  # Adjust as per your data format
    female_indices = (X_test['Gender'] == 'Female').values

    y_pred_male = model.predict(X_test[male_indices])
    y_true_male = y_test[male_indices]
    y_pred_female = model.predict(X_test[female_indices])
    y_true_female = y_test[female_indices]

    print("\nBias/Fairness Evaluation:")
    print(f"Male Classification Report for {model_name}:")
    print(classification_report(y_true_male, y_pred_male))
    print(f"Female Classification Report for {model_name}:")
    print(classification_report(y_true_female, y_pred_female))

    # Variance: Compare training and test performance
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print("\nVariance Check:")
    print(f"Training Accuracy for {model_name}: {train_accuracy * 100:.2f}%")
    print(f"Test Accuracy for {model_name}: {accuracy * 100:.2f}%")

    # Get feature importances from the model (if supported by model used)
    if hasattr(model.named_steps['ClassifierModel'], 'feature_importances_'):
        importances = model.named_steps['ClassifierModel'].feature_importances_
        feature_names = preprocessor.get_feature_names_out()
        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values(by='Importance', ascending=False)
        print(feature_importance_df)
    else:
        print(f"{model_name} does not support feature importances.")

In [18]:
train_and_evaluate("Random Forests", rfc)

Training  Random Forests

Overall Model Performance:
Accuracy for Random Forests: 96.47%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95     30931
           1       0.97      0.97      0.97     49069

    accuracy                           0.96     80000
   macro avg       0.96      0.96      0.96     80000
weighted avg       0.96      0.96      0.96     80000

Predicted labels: [1 1 1 1 1 0 0 0 1 0]

Bias/Fairness Evaluation:
Male Classification Report for Random Forests:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     22910
           1       0.96      0.96      0.96     17045

    accuracy                           0.97     39955
   macro avg       0.97      0.97      0.97     39955
weighted avg       0.97      0.97      0.97     39955

Female Classification Report for Random Forests:
              precision    recall  f1-score   support

           0    

In [19]:
# train_and_evaluate("Support Vector Machine", svm)

In [48]:
train_and_evaluate("Logistic Regression", log_reg)

Training  Logistic Regression
Trained pipeline saved as logistic_regression_pipeline.pkl

Overall Model Performance:
Accuracy for Logistic Regression: 97.25%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     30931
           1       0.98      0.98      0.98     49069

    accuracy                           0.97     80000
   macro avg       0.97      0.97      0.97     80000
weighted avg       0.97      0.97      0.97     80000

Predicted labels: [1 1 1 1 1 0 0 0 1 0]

Bias/Fairness Evaluation:
Male Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     22910
           1       0.97      0.97      0.97     17045

    accuracy                           0.97     39955
   macro avg       0.97      0.97      0.97     39955
weighted avg       0.97      0.97      0.97     39955

Female Classification Report for Logistic Regr

In [21]:
train_and_evaluate("XGBoost", xgb)

Training  XGBoost


Parameters: { "use_label_encoder" } are not used.




Overall Model Performance:
Accuracy for XGBoost: 97.24%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     30931
           1       0.98      0.98      0.98     49069

    accuracy                           0.97     80000
   macro avg       0.97      0.97      0.97     80000
weighted avg       0.97      0.97      0.97     80000

Predicted labels: [1 1 1 1 1 0 0 0 1 0]

Bias/Fairness Evaluation:
Male Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     22910
           1       0.97      0.97      0.97     17045

    accuracy                           0.97     39955
   macro avg       0.97      0.97      0.97     39955
weighted avg       0.97      0.97      0.97     39955

Female Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      8021
        

In [66]:
# Load the pipeline
pipeline = joblib.load('logistic_regression_pipeline.pkl')

# Example of using the loaded pipeline for prediction
new_data = pd.DataFrame({
    'Num_Children': [2],
    'Gender': ['Male'],
    'Income': [92531],
    'Own_Car': ['No'],
    'Own_Housing': ['No']
})

predictions = pipeline.predict(new_data)
print("Predictions:", predictions)

Predictions: [0]
