Pre-processing the data before implementing the ML models

In [8]:
import pandas as pd

# Loading the dataset
data = pd.read_csv('data/bank-full.csv')

# Basic Info
data_info = {
    "columns" : data.columns.tolist(),
    "head" : data.head(),
}

data_info

{'columns': ['age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"'],
 'head':   age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
 0  58;"management";"married";"tertiary";"no";2143...                                                                                                  
 1  44;"technician";"single";"secondary";"no";29;"...                                                                                                  
 2  33;"entrepreneur";"married";"secondary";"no";2...                                                                                                  
 3  47;"blue-collar";"married";"unknown";"no";1506...                                                                                                  
 4  33;"unknown";"single";"unknown";"no";1;"no";"n...             

In [9]:
# Organizing the dataset into columns
col_data = pd.read_csv('data/bank-full.csv', delimiter=';', quotechar='"')

# Displaying basic info after seperating into columns
col_data_info = {
    "columns": col_data.columns.tolist(),
    "head": col_data.head(),
}

col_data_info

{'columns': ['age',
  'job',
  'marital',
  'education',
  'default',
  'balance',
  'housing',
  'loan',
  'contact',
  'day',
  'month',
  'duration',
  'campaign',
  'pdays',
  'previous',
  'poutcome',
  'y'],
 'head':    age           job  marital  education default  balance housing loan  \
 0   58    management  married   tertiary      no     2143     yes   no   
 1   44    technician   single  secondary      no       29     yes   no   
 2   33  entrepreneur  married  secondary      no        2     yes  yes   
 3   47   blue-collar  married    unknown      no     1506     yes   no   
 4   33       unknown   single    unknown      no        1      no   no   
 
    contact  day month  duration  campaign  pdays  previous poutcome   y  
 0  unknown    5   may       261         1     -1         0  unknown  no  
 1  unknown    5   may       151         1     -1         0  unknown  no  
 2  unknown    5   may        76         1     -1         0  unknown  no  
 3  unknown    5   may    

In [10]:
# Checking for missing Values
missing_values_count = col_data.isnull().sum()
missing_values_count

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [12]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
input_path = "data/bank-full.csv"  # Adjust the path to your dataset
col_data = pd.read_csv(input_path)

# Print column names for debugging
print("Columns in dataset:", col_data.columns)

# Step 1: Replace "unknown" and -1 placeholders
col_data.replace("unknown", pd.NA, inplace=True)  # Replace "unknown" with NaN
if 'pdays' in col_data.columns:
    col_data['pdays'] = col_data['pdays'].replace(-1, pd.NA)  # Replace -1 with NaN

# Step 2: Remove unnecessary columns
if 'duration' in col_data.columns:
    col_data.drop(columns=['duration'], inplace=True)

# Step 3: Process categorical columns
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'poutcome', 'y'
]

label_encoders = {}
for col in categorical_columns:
    if col in col_data.columns:  # Ensure the column exists
        col_data[col] = col_data[col].astype(str).str.lower().str.strip()
        le = LabelEncoder()
        col_data[col] = le.fit_transform(col_data[col].fillna("missing"))  # Handle NaNs with "missing"
        label_encoders[col] = le

# Step 4: Process numerical columns
numerical_columns = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']

# Check for the existence of numerical columns and fill missing values
for col in numerical_columns:
    if col in col_data.columns:
        col_data[col] = col_data[col].fillna(col_data[col].mean())

# Scale numerical columns using StandardScaler
scaler = StandardScaler()
for col in numerical_columns:
    if col in col_data.columns:
        col_data[col] = scaler.fit_transform(col_data[[col]])

# Step 5: Export the preprocessed dataset
output_path = "data/bank-full-preprocessed.csv"
col_data.to_csv(output_path, index=False)

print(f"Preprocessed dataset saved to: {output_path}")

Columns in dataset: Index(['age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"'], dtype='object')
Preprocessed dataset saved to: data/bank-full-preprocessed.csv
