In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

# Splitting Personal Status into Gender and Marital Status
def split_personal_status(status):
    parts = status.split(' ', 1)
    gender = parts[0]
    marital_status = parts[1] if len(parts) > 1 else ''
    return pd.Series([gender, marital_status])

# Feature Engineering 
def engineer_features(df):
    df = df.copy()

    # Droping Customer ID - not required for training
    if 'Customer_ID' in df.columns:
        df = df.drop(columns=['Customer_ID'])

    # Personal Status Splitting
    df[['gender', 'marital_status']] = df['personal_status'].apply(split_personal_status)
    df.drop(columns=['personal_status'], inplace=True)

    # Label Encoding for JOB
    job_map = {
        'high qualif/self emp/mgmt': 4,
        'skilled': 3,
        'unskilled resident': 2,
        'unemp/unskilled non res': 1
    }

    if 'job' in df.columns:
        df['job'] = df['job'].map(job_map)

    if 'credit_amount' in df.columns and 'job' in df.columns:
        df['credit_job_ratio'] = df['credit_amount'] / df['job'].replace(0, 1)

    if 'age' in df.columns:
        df['credit_age_ratio'] = df['credit_amount'] / df['age']

    if 'duration' in df.columns:
        df['monthly_burden'] = df['credit_amount'] / df['duration']

    if 'installment_commitment' in df.columns and 'existing_credits' in df.columns:
        df['debt_burden'] = df['installment_commitment'] * df['existing_credits']

    return df

In [2]:
# Building Preprocessor
def build_preprocessor(df, encoding_method='onehot', drop_first=False, handle_unknown='ignore'):
    """Create preprocessing transformer and extract transformed feature names."""

    # Label Encoding for Binary labelled columns 
    binary_mappings = {
        'gender': {'male': 1, 'female': 0},
        'own_telephone': {'yes': 1, 'none': 0},
        'foreign_worker': {'yes': 1, 'no': 0},
        'class': {'good': 1, 'bad': 0}
    }

    binary_cols = ['own_telephone', 'foreign_worker', 'class', 'gender']
    multi_category_cols = [
        'checking_status', 'credit_history', 'purpose', 'savings_status',
        'employment', 'other_parties', 'property_magnitude',
        'other_payment_plans', 'housing', 'marital_status'
    ]
    numerical_cols = [
        'duration', 'credit_amount', 'installment_commitment',
        'residence_since', 'age', 'existing_credits','job', 'num_dependents',
        'credit_job_ratio', 'credit_age_ratio', 'monthly_burden', 'debt_burden'
    ]

    # Confirming the Columns in the Received Dataframe
    numerical_cols = [col for col in numerical_cols if col in df.columns]
    multi_category_cols = [col for col in multi_category_cols if col in df.columns]
    binary_cols = [col for col in binary_cols if col in df.columns]

    for col in binary_cols:
        df[col] = df[col].map(binary_mappings.get(col, {}))

    # Log Transformation
    for col in ['credit_amount', 'age', 'credit_job_ratio', 'credit_age_ratio', 'monthly_burden']:
        if col in df.columns:
            df[col] = np.log1p(df[col])

    # Standerdization
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    # One Hot Encoding for Categorical Columns
    if encoding_method == 'onehot':
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(drop='first' if drop_first else None,
                                     handle_unknown=handle_unknown,
                                     sparse_output=False))
        ])
    else:
        raise ValueError("Only onehot encoding is supported.")

    # Building Preprocessor 
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, multi_category_cols)
    ], remainder='passthrough')

    return preprocessor, numerical_cols, multi_category_cols, binary_cols

In [3]:
# Full Pipeline For preprocessing New Data
def fit_full_pipeline(df, top_k=60, p_thresh=0.05):
    """Fit the full pipeline and return transformed data + reusable preprocessor."""
    df = engineer_features(df)

    # Preprocessor with Column Transformation
    preprocessor, num_cols, cat_cols, bin_cols = build_preprocessor(df)

    X_transformed = preprocessor.fit_transform(df)
    feature_names = num_cols.copy()

    onehot = preprocessor.named_transformers_['cat']['onehot']
    if hasattr(onehot, 'get_feature_names_out'):
        feature_names += list(onehot.get_feature_names_out(cat_cols))
    else:
        feature_names += list(onehot.get_feature_names(cat_cols))

    feature_names += bin_cols

    transformed_df = pd.DataFrame(X_transformed, columns=feature_names, index=df.index)

    # Chi-square feature selection
    X = transformed_df.drop(columns=['class'])
    y = transformed_df['class']

    neg_cols = X.loc[:, (X < 0).any()].columns
    pos_cols = X.loc[:, (X >= 0).all()].columns

    k_best = min(top_k, len(pos_cols))
    chi2_selector = SelectKBest(chi2, k=k_best)
    chi2_selector.fit(X[pos_cols], y)

    chi2_df = pd.DataFrame({
        "Feature": pos_cols,
        "Chi2 Score": chi2_selector.scores_,
        "P-Value": chi2_selector.pvalues_
    })

    sig_features = chi2_df[chi2_df["P-Value"] < p_thresh]["Feature"].tolist()
    selected_features = list(neg_cols) + sig_features

    final_df = pd.concat([X[selected_features], y], axis=1)

    # Store feature list in preprocessor
    preprocessor.chi_square_selected_features = selected_features
    preprocessor.all_transformed_features = feature_names

    return final_df, preprocessor

In [4]:
# For Inference 
def apply_preprocessing(new_df, preprocessor):
    """Apply previously-fitted preprocessing on new data."""
    new_df = engineer_features(new_df)
    # print(new_df.columns)
    print(new_df.to_json(orient= 'records'))
    print(new_df.shape, new_df.columns)

    binary_mappings = {
        'gender': {'male': 1, 'female': 0},
        'own_telephone': {'yes': 1, 'none': 0},
        'foreign_worker': {'yes': 1, 'no': 0},
        'class': {'good': 1, 'bad': 0}
    }

    binary_cols = ['own_telephone', 'foreign_worker', 'class', 'gender']
    binary_cols = [col for col in binary_cols if col in new_df.columns]

    for col in binary_cols:
        new_df[col] = new_df[col].map(binary_mappings.get(col, {}))

    for col in ['credit_amount', 'age', 'credit_job_ratio', 'credit_age_ratio', 'monthly_burden']:
        if col in new_df:
            new_df[col] = np.log1p(new_df[col])

    X_transformed = preprocessor.transform(new_df)
    print(X_transformed)


    transformed_df = pd.DataFrame(X_transformed, columns=preprocessor.all_transformed_features, index=new_df.index)

    print(transformed_df)


    selected_cols = preprocessor.chi_square_selected_features
    return transformed_df[selected_cols]


# Example usage:
if __name__ == "__main__":
    raw_data = pd.read_csv("raw_data1 (1).csv")
    final_df, preprocessor = fit_full_pipeline(raw_data)

In [6]:
preprocessor

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [8]:
final_df.describe()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,job,num_dependents,credit_job_ratio,credit_age_ratio,...,employment_4<=X<7,employment_<1,property_magnitude_no known property,property_magnitude_real estate,other_payment_plans_bank,housing_for free,housing_own,housing_rent,marital_status_div/dep/mar,class
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.136868e-16,4.209966e-16,1.012523e-16,-1.776357e-16,1.847411e-16,-9.414691000000001e-17,5.684342e-17,-1.0658140000000001e-17,9.734435e-16,-4.209966e-16,...,0.174,0.172,0.154,0.282,0.139,0.108,0.713,0.179,0.31,0.7
std,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,...,0.379299,0.377569,0.361129,0.450198,0.34612,0.310536,0.452588,0.383544,0.462725,0.458487
min,-1.402415,-2.918436,-1.764514,-1.672459,-1.91861,-0.704926,-2.914492,-0.4282896,-2.703813,-2.869881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7386675,-0.7338481,-0.8701833,-0.7659773,-0.7634271,-0.704926,0.1469492,-0.4282896,-0.7655209,-0.701062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.2408572,-0.05118791,0.02414692,0.1405047,-0.09684687,-0.704926,0.1469492,-0.4282896,-0.05902786,-0.05010471,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,0.2569531,0.6421359,0.9184772,1.046987,0.709409,1.027079,0.1469492,-0.4282896,0.6604698,0.7077011,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
max,4.239436,2.6199,0.9184772,1.046987,2.664741,4.491089,1.67767,2.334869,3.788073,2.87922,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
raw_data = pd.read_csv("raw_data1 (1).csv")
processed_df = apply_preprocessing(raw_data, preprocessor)

[{"checking_status":"<0","duration":6,"credit_history":"critical\/other existing credit","purpose":"radio\/tv","credit_amount":1169,"savings_status":"no known savings","employment":">=7","installment_commitment":4,"other_parties":"none","residence_since":4,"property_magnitude":"real estate","age":67,"other_payment_plans":"none","housing":"own","existing_credits":2,"job":3,"num_dependents":1,"own_telephone":"yes","foreign_worker":"yes","class":"good","gender":"male","marital_status":"single","credit_job_ratio":389.6666666667,"credit_age_ratio":17.447761194,"monthly_burden":194.8333333333,"debt_burden":8},{"checking_status":"0<=X<200","duration":48,"credit_history":"existing paid","purpose":"radio\/tv","credit_amount":5951,"savings_status":"<100","employment":"1<=X<4","installment_commitment":2,"other_parties":"none","residence_since":2,"property_magnitude":"real estate","age":22,"other_payment_plans":"none","housing":"own","existing_credits":1,"job":3,"num_dependents":1,"own_telephone":

In [11]:
processed_df.describe()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,job,num_dependents,credit_job_ratio,credit_age_ratio,...,savings_status_no known savings,employment_4<=X<7,employment_<1,property_magnitude_no known property,property_magnitude_real estate,other_payment_plans_bank,housing_for free,housing_own,housing_rent,marital_status_div/dep/mar
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.136868e-16,4.209966e-16,1.012523e-16,-1.776357e-16,1.847411e-16,-9.414691000000001e-17,5.684342e-17,-1.0658140000000001e-17,9.734435e-16,-4.209966e-16,...,0.183,0.174,0.172,0.154,0.282,0.139,0.108,0.713,0.179,0.31
std,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,...,0.38686,0.379299,0.377569,0.361129,0.450198,0.34612,0.310536,0.452588,0.383544,0.462725
min,-1.402415,-2.918436,-1.764514,-1.672459,-1.91861,-0.704926,-2.914492,-0.4282896,-2.703813,-2.869881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7386675,-0.7338481,-0.8701833,-0.7659773,-0.7634271,-0.704926,0.1469492,-0.4282896,-0.7655209,-0.701062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.2408572,-0.05118791,0.02414692,0.1405047,-0.09684687,-0.704926,0.1469492,-0.4282896,-0.05902786,-0.05010471,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.2569531,0.6421359,0.9184772,1.046987,0.709409,1.027079,0.1469492,-0.4282896,0.6604698,0.7077011,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
max,4.239436,2.6199,0.9184772,1.046987,2.664741,4.491089,1.67767,2.334869,3.788073,2.87922,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
final_df.describe()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,job,num_dependents,credit_job_ratio,credit_age_ratio,...,employment_4<=X<7,employment_<1,property_magnitude_no known property,property_magnitude_real estate,other_payment_plans_bank,housing_for free,housing_own,housing_rent,marital_status_div/dep/mar,class
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.136868e-16,4.209966e-16,1.012523e-16,-1.776357e-16,1.847411e-16,-9.414691000000001e-17,5.684342e-17,-1.0658140000000001e-17,9.734435e-16,-4.209966e-16,...,0.174,0.172,0.154,0.282,0.139,0.108,0.713,0.179,0.31,0.7
std,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,...,0.379299,0.377569,0.361129,0.450198,0.34612,0.310536,0.452588,0.383544,0.462725,0.458487
min,-1.402415,-2.918436,-1.764514,-1.672459,-1.91861,-0.704926,-2.914492,-0.4282896,-2.703813,-2.869881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7386675,-0.7338481,-0.8701833,-0.7659773,-0.7634271,-0.704926,0.1469492,-0.4282896,-0.7655209,-0.701062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.2408572,-0.05118791,0.02414692,0.1405047,-0.09684687,-0.704926,0.1469492,-0.4282896,-0.05902786,-0.05010471,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,0.2569531,0.6421359,0.9184772,1.046987,0.709409,1.027079,0.1469492,-0.4282896,0.6604698,0.7077011,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
max,4.239436,2.6199,0.9184772,1.046987,2.664741,4.491089,1.67767,2.334869,3.788073,2.87922,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
import joblib

# Save preprocessor to a .pkl file
joblib.dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']

In [17]:
# Saving the preprocesssed data to train
final_df.to_csv("final_dataset.csv", index = False)