In [1]:
import os

In [2]:
%pwd

'd:\\Data Science\\Customer Churn Prediction with MLOps\\research'

In [3]:
# Move from the notebook folder (research/) up to the project root so the
# relative config/artifact paths used below resolve. Guarded so that
# re-running this cell does not climb one more directory each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\Data Science\\Customer Churn Prediction with MLOps'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""
    # Directory the stage writes its outputs into (train.csv / test.csv).
    root_dir: Path
    # Location of the raw CSV file that the stage reads.
    data_path: Path

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.

        After loading, the directory named by ``artifacts_root`` in the
        main configuration is passed to ``create_directories``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Reads the ``data_transformation`` section of the main configuration,
        passes its ``root_dir`` to ``create_directories`` and returns the
        section's ``root_dir`` / ``data_path`` values wrapped in a
        ``DataTransformationConfig``.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import joblib
from mlProject import logger
from dataclasses import dataclass

In [46]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from mlProject import logger

class DataTransformation:
    """Preprocess the raw churn dataset and write train/test CSV splits.

    The input frame must contain the columns listed in the three
    ``*_FEATURES`` attributes below plus ``customerID`` (dropped) and
    ``Churn`` (the target, mapped from 'No'/'Yes' to 0/1).

    Note: further transformation or EDA steps (for example other scalers
    or PCA) can be added to the preprocessing pipeline built here before
    the data is handed to the model.

    NOTE(review): the fitted preprocessor is not persisted, so the same
    transformation cannot be re-applied to new data later. Consider saving
    it if downstream stages need it.
    """

    # Column groups; each group is routed to its own preprocessing pipeline.
    NUMERIC_FEATURES = ['tenure', 'MonthlyCharges', 'TotalCharges']
    BINARY_CATEGORICAL_FEATURES = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
    MULTI_CATEGORICAL_FEATURES = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                                  'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                                  'Contract', 'PaymentMethod']

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (``root_dir`` and ``data_path`` are read later)."""
        self.config = config

    def _split_features_target(self, data):
        """Return ``(X, y)`` from the raw frame without modifying ``data``."""
        # assign() builds a new frame, so the caller's DataFrame is left untouched.
        # errors='coerce' turns non-numeric TotalCharges entries into NaN; those
        # are filled later by the numeric pipeline's median imputer.
        prepared = data.assign(
            TotalCharges=pd.to_numeric(data['TotalCharges'], errors='coerce')
        )

        X = prepared.drop(columns=['customerID', 'Churn'])  # identifier and target are not features
        y = prepared['Churn'].map({'No': 0, 'Yes': 1})

        # map() silently yields NaN for any label other than 'No'/'Yes';
        # surface that instead of writing NaN targets unnoticed.
        unmapped = int(y.isna().sum())
        if unmapped:
            logger.warning(
                f"{unmapped} rows have a 'Churn' value other than 'No'/'Yes'; their target is NaN"
            )

        return X, y

    def _build_preprocessor(self):
        """Create the (unfitted) ColumnTransformer used for all feature columns."""
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),  # Fill missing values
            ('scaler', StandardScaler())  # Standardize numerical data
        ])

        # handle_unknown='ignore' lets a fitted encoder transform rows that hold a
        # category absent from the data it was fitted on (encoded as all zeros)
        # instead of raising.
        binary_transformer = Pipeline(steps=[
            ('encoder', OneHotEncoder(drop='if_binary', dtype=int, handle_unknown='ignore'))
        ])

        multi_transformer = Pipeline(steps=[
            ('encoder', OneHotEncoder(drop='first', dtype=int, handle_unknown='ignore'))
        ])

        # sparse_threshold=0 forces a dense array; the one-hot encoders emit sparse
        # matrices and a sparse result cannot be wrapped in a plain DataFrame.
        return ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.NUMERIC_FEATURES),
                ('bin', binary_transformer, self.BINARY_CATEGORICAL_FEATURES),
                ('multi', multi_transformer, self.MULTI_CATEGORICAL_FEATURES)
            ],
            sparse_threshold=0,
        )

    @staticmethod
    def _to_frame(preprocessor, values, index):
        """Wrap transformed values in a DataFrame that keeps the source row index."""
        # Keeping the original index is what makes the later column-wise concat
        # with the target line up row by row.
        return pd.DataFrame(
            values,
            columns=preprocessor.get_feature_names_out(),
            index=index,
        )

    def transform_data(self, data):
        """Fit the preprocessing pipeline on ``data`` and return it transformed.

        Args:
            data: Raw DataFrame containing the feature columns, ``customerID``
                and ``Churn``. It is not modified.

        Returns:
            Tuple ``(X_transformed_df, y)``: the transformed features as a
            DataFrame indexed like ``data``, and the target mapped to 0/1.
        """
        X, y = self._split_features_target(data)

        preprocessor = self._build_preprocessor()
        X_transformed = preprocessor.fit_transform(X)

        return self._to_frame(preprocessor, X_transformed, X.index), y

    def train_test_splitting(self, test_size=0.25, random_state=42):
        """Split the dataset, preprocess both parts and write train.csv / test.csv.

        The rows are split first and the preprocessing pipeline is fitted on
        the training rows only, so imputation and scaling statistics never
        see the test rows.

        Args:
            test_size: Fraction of rows held out for the test set.
            random_state: Seed passed to ``train_test_split``.
        """
        # Load raw data
        data = pd.read_csv(self.config.data_path)
        X, y = self._split_features_target(data)

        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Fit on the training rows only, then apply the fitted transform to the test rows.
        preprocessor = self._build_preprocessor()
        X_train = self._to_frame(
            preprocessor, preprocessor.fit_transform(X_train_raw), X_train_raw.index
        )
        X_test = self._to_frame(
            preprocessor, preprocessor.transform(X_test_raw), X_test_raw.index
        )

        # Features and target share the same index, so axis=1 concat aligns row by row.
        train_data = pd.concat([X_train, y_train], axis=1)
        test_data = pd.concat([X_test, y_test], axis=1)

        train_path = os.path.join(self.config.root_dir, "train.csv")
        test_path = os.path.join(self.config.root_dir, "test.csv")

        train_data.to_csv(train_path, index=False)
        test_data.to_csv(test_path, index=False)

        logger.info("Data transformed and split into training and test sets")
        logger.info(f"Train shape: {train_data.shape}")
        logger.info(f"Test shape: {test_data.shape}")

        print("Train shape:", train_data.shape)
        print("Test shape:", test_data.shape)


In [47]:
# Run the data-transformation stage end to end. Any exception propagates
# unchanged; wrapping the calls in `try/except ... raise e` added nothing
# but an extra traceback frame.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_splitting()

[2025-02-06 15:30:05,761: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-02-06 15:30:05,762: INFO: common: yaml file: params.yaml loaded successfully]
[2025-02-06 15:30:05,765: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-02-06 15:30:05,766: INFO: common: created directory at: artifacts]
[2025-02-06 15:30:05,766: INFO: common: created directory at: artifacts/data_transformation]
[2025-02-06 15:30:05,913: INFO: 3404203253: Data transformed and split into training and test sets]
[2025-02-06 15:30:05,914: INFO: 3404203253: Train shape: (5282, 30)]
[2025-02-06 15:30:05,914: INFO: 3404203253: Test shape: (1761, 30)]
Train shape: (5282, 30)
Test shape: (1761, 30)


In [None]:
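# NOTE(review): shape printout pasted from an earlier run, kept for reference only.
# Its row counts total 9668, more than the 7043 rows (5282 + 1761) reported by the
# run above, so it likely came from a run where features and target were
# concatenated on mismatched indices.
# Train shape: (6621, 30)
# Test shape: (3047, 30)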