# Logistic Regression Pipeline for German Credit Decisioning Model

Include ALL features - incl discriminative ones.

In next, only include four features with AUC scores >0.1 away from 0.5

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
from ucimlrepo import fetch_ucirepo

In [12]:
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 

metadata = statlog_german_credit_data.metadata
variable_info = statlog_german_credit_data.variables

"""# metadata 
print("metadata:\n")
display(metadata)
  
# variable information 
print("variable information:\n")
display(variable_info) """


'# metadata \nprint("metadata:\n")\ndisplay(metadata)\n\n# variable information \nprint("variable information:\n")\ndisplay(variable_info) '

In [13]:
# data (as pandas dataframes) 
df = statlog_german_credit_data.data

X = df.features 
y = df.targets

In [14]:
# Separate numeric and categorical columns

# Select columns that are integers in variable_info['type] 
numeric_cols = variable_info[variable_info['type'] == 'Integer']['name'].tolist()
categorical_cols = variable_info[variable_info['type'] == 'Categorical']['name'].tolist()
binary_cols = variable_info[variable_info['type'] == 'Binary']['name'].tolist()

# remove class from binary_cols
binary_cols.remove('class')

# Add binary columns to categorical columns
categorical_cols += binary_cols

#numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
#categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['Attribute2', 'Attribute5', 'Attribute8', 'Attribute11', 'Attribute13', 'Attribute16', 'Attribute18']
Categorical columns: ['Attribute1', 'Attribute3', 'Attribute4', 'Attribute6', 'Attribute7', 'Attribute9', 'Attribute10', 'Attribute12', 'Attribute14', 'Attribute15', 'Attribute17', 'Attribute19', 'Attribute20']


In [15]:
# Move Attribute8, Attribute11, Attribute16 and Attribute18 from numeric to categorical
# Convert the columns to categorical
few_count_integer_attributes = ['Attribute8', 'Attribute11', 'Attribute16', 'Attribute18']
for attribute in few_count_integer_attributes:
    if attribute in numeric_cols:
        if attribute not in categorical_cols:       
            # Convert the column to categorical
            X[attribute] = pd.Categorical(X[attribute])
            # Remove the column from numeric_cols
            numeric_cols.remove(attribute)
            # Add the column to categorical_cols
            categorical_cols.append(attribute)
        else:
            continue
            print(f"Column {attribute} is already in categorical_cols")
    else:
        continue
        print(f"Column {attribute} is not in numeric_cols")

# check
#print("Expected numeric columns:",'2, 5, 13')
assert numeric_cols == ['Attribute2', 'Attribute5', 'Attribute13'], "Numeric columns do not match expected values"
#print("Numeric columns after moving:", numeric_cols)

# assert that X.columns is made up of numeric_cols and categorical_cols

# Check if any columns in X are missing from numeric_cols or categorical_cols
missing_cols_2 = set(X.columns) - set(numeric_cols) - set(categorical_cols)
if missing_cols_2:
    print("Columns in X that are not in numeric_cols or categorical_cols:\n", missing_cols_2)

# Check if the columns in X are the same as numeric_cols + categorical_col
assert set(X.columns) == set(numeric_cols + categorical_cols), "X columns do not match numeric and categorical columns"



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[attribute] = pd.Categorical(X[attribute])


## Create pipeline

In [None]:
# Import libraries needed for logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


In [None]:
# Create a column transformer to handle categorical and numeric features separately
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)


In [None]:
# Define the model
model = LogisticRegression(max_iter=1000, random_state=42)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# wait - shouldn't I be using the same split from univ AUC testing in data exploration to prevent data leakage?
