In [15]:
import os
import warnings
from pathlib import Path

import joblib
import mysql.connector
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/usage warnings from pandas and
# scikit-learn; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

 
def load_data():
    """Load the whole ``customers`` table from MySQL into a DataFrame.

    Connection settings default to the original local setup (``root`` on
    ``localhost``, empty password, database ``placement``) and can be
    overridden without editing the notebook through the environment
    variables ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD`` and
    ``MYSQL_DATABASE``.

    Returns:
        pandas.DataFrame: every row and column of ``customers``.

    Raises:
        Exception: any error raised while connecting or querying is printed
            and then re-raised, so a failed load stops the notebook here
            instead of handing ``None`` to the following cells.
    """
    conn = None
    try:
        conn = mysql.connector.connect(
            host=os.environ.get("MYSQL_HOST", "localhost"),
            user=os.environ.get("MYSQL_USER", "root"),
            password=os.environ.get("MYSQL_PASSWORD", ""),
            database=os.environ.get("MYSQL_DATABASE", "placement"),
        )
        query = "SELECT * FROM customers;"
        df = pd.read_sql(query, conn)
    except Exception as e:
        print("Error loading data:", e)
        # Re-raise: returning None here would only surface later as an
        # unrelated AttributeError on `df.head()`.
        raise
    finally:
        # Release the connection even when the query fails.
        if conn is not None:
            conn.close()

    print("Data Loaded Successfully!")
    return df

# Pull the customers table into memory and preview its first rows.
df = load_data()
df.head()


Data Loaded Successfully!


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,SatisfactionScore,CardType,PointEarned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [16]:

# Row number, customer id and surname are dropped before modelling.
drop_cols = ["RowNumber", "CustomerId", "Surname"]

# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=drop_cols, errors="ignore")

df.head()


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,SatisfactionScore,CardType,PointEarned
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [17]:

# Report how many values are missing in each column before imputing.
print(df.isnull().sum())

# Two-pass imputation: numeric gaps take the column median first, then any
# remaining gaps take the most frequent value of the median-filled frame.
df = (
    df
    .fillna(df.median(numeric_only=True))
    .pipe(lambda filled: filled.fillna(filled.mode().iloc[0]))
)


CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Complain             0
SatisfactionScore    0
CardType             0
PointEarned          0
dtype: int64


In [18]:

# Columns that will be one-hot encoded.
categorical_cols = ["Geography", "Gender", "Complain", "CardType"]

# Everything else except the target is treated as numeric; the original
# column order is preserved.
numeric_cols = df.columns.drop(categorical_cols + ["Exited"])

print("Categorical Columns:", categorical_cols)
print("Numeric Columns:", numeric_cols)


Categorical Columns: ['Geography', 'Gender', 'Complain', 'CardType']
Numeric Columns: Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'SatisfactionScore',
       'PointEarned'],
      dtype='object')


In [19]:

# Separate the target column ("Exited") from the feature matrix.
y = df["Exited"]
X = df.drop("Exited", axis=1)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (10000, 14)
y shape: (10000,)


In [20]:
# One-hot encode the categorical columns (dropping the first level of each)
# and standardise the numeric ones. Columns not listed in either group are
# discarded by ColumnTransformer's default remainder setting.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first"), categorical_cols),
    ("num", StandardScaler(), numeric_cols),
])
print("Preprocessing Pipeline Ready!")


Preprocessing Pipeline Ready!


In [21]:

# Hold out 20% of the rows for testing. The fixed seed makes the split
# reproducible, and stratifying on y keeps the target proportions equal
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (8000, 14)
Test shape: (2000, 14)


In [22]:

# Fit the encoder/scaler on the training split only, then reuse the fitted
# transformer on the test split so no test statistics enter the fit.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Both splits must end up with the same number of output columns.
print("Processed Train Shape:", X_train_processed.shape)
print("Processed Test Shape:", X_test_processed.shape)


Processed Train Shape: (8000, 17)
Processed Test Shape: (2000, 17)


In [23]:

# Output locations, relative to the notebook's working directory.
MODEL_DIR = Path("../models")
PROCESSED_DIR = Path("../data/processed")

# joblib.dump does not create missing parent directories; make them up front
# so a fresh checkout does not fail with FileNotFoundError.
for out_dir in (MODEL_DIR, PROCESSED_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted transformer and the processed splits for later use.
joblib.dump(preprocessor, MODEL_DIR / "preprocessor.pkl")
joblib.dump(X_train_processed, PROCESSED_DIR / "X_train.pkl")
joblib.dump(X_test_processed, PROCESSED_DIR / "X_test.pkl")
joblib.dump(y_train, PROCESSED_DIR / "y_train.pkl")
joblib.dump(y_test, PROCESSED_DIR / "y_test.pkl")

print("Preprocessing Complete & Saved!")


Preprocessing Complete & Saved!
