ETL Pipeline on Uploaded data.csv

In [2]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [5]:
# Extract step: read the raw CSV from the working directory into a DataFrame
# and report its (rows, columns) dimensions.
print('📥 Loading dataset...')
data = pd.read_csv(filepath_or_buffer='data.csv')
print(f"Initial Shape: {data.shape}")

# Leave the first five rows as the cell's displayed result.
data.head(n=5)

📥 Loading dataset...
Initial Shape: (891, 5)


Unnamed: 0,age,salary,department,class,target
0,22.0,7.25,male,Third,0
1,38.0,71.2833,female,First,1
2,26.0,7.925,female,Third,1
3,35.0,53.1,female,First,1
4,35.0,8.05,male,Third,0


BASIC DATA CHECKS

In [None]:


# Count missing values per column. The recorded output of this cell begins
# with this table, so the statement is needed for a fresh Run All to
# reproduce it.
print("Missing value : \n", data.isnull().sum())

# Check data types
print("\nData Types:\n", data.dtypes)

# Preview statistical summary (include="all" also covers the object columns)
data.describe(include="all")

Missing value : 
 age           177
salary          0
department      0
class           0
target          0
dtype: int64

Data Types:
 age           float64
salary        float64
department     object
class          object
target          int64
dtype: object


Unnamed: 0,age,salary,department,class,target
count,714.0,891.0,891,891,891.0
unique,,,2,3,
top,,,male,Third,
freq,,,577,491,
mean,29.699118,32.204208,,,0.383838
std,14.526497,49.693429,,,0.486592
min,0.42,0.0,,,0.0
25%,20.125,7.9104,,,0.0
50%,28.0,14.4542,,,0.0
75%,38.0,31.0,,,1.0


In [10]:
# Fill the gaps in `age` with the mean of the observed ages.
# NOTE(review): this mean is taken over every row before the train/test
# split, and the numeric pipeline defined further down mean-imputes as well.
# Confirm the up-front fill is still wanted.
age_mean = data['age'].mean()
data['age'] = data['age'].fillna(age_mean)

# Re-count nulls per column to confirm nothing is missing any more.
print("Missing value : \n", data.isnull().sum())


Missing value : 
 age           0
salary        0
department    0
class         0
target        0
dtype: int64


In [11]:
# Drop rows that repeat an earlier row in every column, keeping the first
# occurrence, and rebind `data` to the de-duplicated frame.
# NOTE(review): no identifier column is visible in this frame, so rows are
# compared on the feature and target fields alone. Confirm that matching
# rows are true duplicates rather than distinct records.
data = data.drop_duplicates(keep="first")
print(f"Shape after dropping duplicates: {data.shape}")

Shape after dropping duplicates: (768, 5)


In [None]:
# Column names routed to each preprocessing branch.
numeric_features = ["age", "salary"]
categorical_features = ["department", "class"]

# Echo both lists so the selection is visible in the cell output.
for heading, columns in (
    ("Numeric Features:", numeric_features),
    ("Categorical Features:", categorical_features),
):
    print(heading, columns)

Numeric Features: ['age', 'salary']
Categorical Features: ['department', 'class']


PREPROCESSING PIPELINES

In [40]:
# Numeric branch: fill gaps with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column list to its branch inside a single transformer.
column_branches = [
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

SPLIT FEATURES & TARGET

In [41]:
TARGET_COLUMN = "target"  # Change this if your dataset uses a different name

# Separate the label from the features. When the label column is absent,
# fall back to treating a copy of the whole frame as features.
if TARGET_COLUMN not in data.columns:
    X = data.copy()
    y = None
    print("⚠ No target column found. Processing entire dataset as features.")
else:
    y = data[TARGET_COLUMN]
    X = data.drop(columns=[TARGET_COLUMN])

In [42]:
# Train-test split only if target is available
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
else:
    X_train, X_test = X, None

In [43]:
# Plain-text dump of the training features; pandas elides the middle rows of
# a long frame and appends the "[rows x columns]" footer.
print(X_train)

           age   salary department   class
62   45.000000  83.4750       male   First
711  29.699118  26.5500       male   First
382  32.000000   7.9250       male   Third
321  27.000000   7.8958       male   Third
250  29.699118   7.2500       male   Third
..         ...      ...        ...     ...
73   26.000000  14.4542       male   Third
113  20.000000   9.8250     female   Third
292  36.000000  12.8750       male  Second
491  21.000000   7.2500       male   Third
109  29.699118  24.1500     female   Third

[614 rows x 4 columns]


APPLY TRANSFORMATIONS

In [44]:
# Fit the preprocessor on the training rows only, then apply that same fitted
# transformer to the held-out rows.
X_train_processed = preprocessor.fit_transform(X_train)

if X_test is not None:
    X_test_processed = preprocessor.transform(X_test)
else:
    # No held-out set exists (no target column was found), so define the name
    # anyway to keep downstream cells from raising a NameError.
    X_test_processed = None