In [26]:
import os

In [27]:
%pwd

'/Users/suyash/Desktop/End to End'

In [4]:
os.chdir('../')
%pwd

'/Users/suyash/Desktop/End to End'

In [28]:
import pandas as pd

In [29]:
from dataclasses import dataclass
from pathlib import Path    


@dataclass
class DataSplitterConfig:  
    """Settings consumed by the data-splitting stage.

    Instances are built by ``ConfigurationManager.get_data_split_config`` from
    the ``data_splitter`` section of the config file plus two column lists
    taken from the schema file.
    """

    # Directory of this stage's artifacts; created before the config is built.
    root_dir: Path
    # Destination CSV for the processed training rows.
    train_data: Path
    # Destination CSV for the processed test rows.
    test_data: Path
    # Passed to train_test_split as `test_size`.
    split_ratio: float
    # Passed to train_test_split as `random_state`.
    random_state: int
    # NOTE: despite the name, this is handed to pd.read_csv, so it is the
    # path of the input CSV file rather than a directory.
    unzip_data_dir: Path 
    categorical_columns: list  # column names that get one-hot encoded
    numerical_columns: list   # column names that get standard-scaled

In [30]:
from src.IPL_Predictor.constants import *
from src.IPL_Predictor.utils.common import read_yaml, create_directories
from src.IPL_Predictor import logger

In [31]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files, then create the artifacts root.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
            schema_filepath: location of the schema YAML.
        """
        # Loaded in a fixed order: config, then params, then schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_data_split_config(self) -> DataSplitterConfig:
        """Build the ``DataSplitterConfig`` for the data-splitting stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataSplitterConfig`` whose path/ratio/seed fields come from the
            ``data_splitter`` section of the config and whose column lists come
            from the schema.
        """
        section = self.config.data_splitter
        create_directories([section.root_dir])

        # Fields copied one-to-one from the config section.
        copied_fields = (
            "root_dir",
            "train_data",
            "test_data",
            "split_ratio",
            "random_state",
            "unzip_data_dir",
        )
        settings = {field: getattr(section, field) for field in copied_fields}

        # Column lists live in the schema; the keys (including the
        # 'Categorial' spelling) are looked up exactly as written here.
        settings["categorical_columns"] = self.schema['Categorial_COLUMNS']
        settings["numerical_columns"] = self.schema['Numerical_COLUMNS']

        return DataSplitterConfig(**settings)

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd


In [33]:
class Data_Splitting:
    """Stratified train/test split followed by scaling and one-hot encoding.

    The split happens before any fitting, so the scaler and encoder only ever
    see training rows. The processed frames (features plus the untouched
    target column) are written to the CSV paths named in the config.
    """

    def __init__(self, config: DataSplitterConfig, schema=None):
        """Store the stage settings.

        Args:
            config: paths, split ratio, seed and the feature column lists.
            schema: object exposing ``TARGET_COLUMN``. The parameter keeps its
                ``None`` default for compatibility, but ``split_data`` cannot
                run without a schema and raises ``ValueError`` if it is missing.
        """
        self.config = config
        self.schema = schema

    def split_data(self):
        """Split, preprocess and save the dataset.

        Returns:
            tuple: ``(train_df, test_df)`` -- processed feature columns
            (numerical first, then one-hot columns) plus the target column.

        Raises:
            ValueError: if no schema was supplied.
            Exception: any error from reading, splitting, transforming or
                writing is logged and re-raised unchanged.
        """
        try:
            # Fail fast with a clear message instead of an AttributeError on
            # None after the CSV has already been loaded.
            if self.schema is None:
                raise ValueError(
                    "Data_Splitting requires a schema exposing TARGET_COLUMN"
                )

            target_column = self.schema.TARGET_COLUMN
            # Plain-list copies so concatenation below behaves the same
            # whatever list-like type the config loader produced.
            numerical_columns = list(self.config.numerical_columns)
            categorical_columns = list(self.config.categorical_columns)
            feature_columns = numerical_columns + categorical_columns

            df = pd.read_csv(self.config.unzip_data_dir)
            logger.info(f"Read data with shape: {df.shape}")

            # Split first (stratified on the target) so nothing is fitted on
            # test rows.
            train_set, test_set = train_test_split(
                df,
                test_size=self.config.split_ratio,
                random_state=self.config.random_state,
                stratify=df[target_column],
            )

            # NOTE: OneHotEncoder keeps its default handle_unknown='error', so
            # a category that appears only in the test split makes transform()
            # raise.
            preprocessor = ColumnTransformer(
                transformers=[
                    ("num", StandardScaler(), numerical_columns),
                    ("cat", OneHotEncoder(drop='first', sparse_output=False), categorical_columns),
                ],
                remainder='drop',
            )

            train_processed = preprocessor.fit_transform(train_set[feature_columns])
            test_processed = preprocessor.transform(test_set[feature_columns])
            # NOTE(review): the fitted preprocessor is discarded here; if a
            # later stage must transform new rows identically it needs to be
            # persisted -- confirm whether that happens elsewhere.

            # Ask the fitted encoder for its output names rather than
            # rebuilding them by hand; this yields the same "<column>_<category>"
            # names but stays correct if the `drop` setting ever changes.
            if categorical_columns:
                encoder = preprocessor.named_transformers_['cat']
                cat_features = list(encoder.get_feature_names_out(categorical_columns))
            else:
                cat_features = []
            output_columns = numerical_columns + cat_features

            train_df = pd.DataFrame(train_processed, columns=output_columns)
            test_df = pd.DataFrame(test_processed, columns=output_columns)

            # The processed frames have a fresh 0..n-1 index, so attach the
            # target by position (.values) rather than by index label.
            train_df[target_column] = train_set[target_column].values
            test_df[target_column] = test_set[target_column].values

            # Create the parent folder of BOTH outputs; skip when a path has
            # no directory part, because os.makedirs('') raises.
            for output_path in (self.config.train_data, self.config.test_data):
                parent_dir = os.path.dirname(output_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)

            train_df.to_csv(self.config.train_data, index=False)
            test_df.to_csv(self.config.test_data, index=False)

            logger.info(f"Preprocessed data saved to {self.config.root_dir}")
            logger.info(f"Processed train shape: {train_df.shape}, Processed test shape: {test_df.shape}")

            return train_df, test_df

        except Exception as e:
            logger.error(f"Error in data splitting: {str(e)}")
            # Bare raise re-raises the active exception with its traceback intact.
            raise

In [34]:
# Run the data-splitting stage end to end: load the YAML configuration, build
# the splitter and write the processed train/test CSVs.
try:
    config = ConfigurationManager()
    data_split_config = config.get_data_split_config()
    data_splitter = Data_Splitting(
        config=data_split_config,
        schema=config.schema  # split_data reads TARGET_COLUMN from the schema
    )
    data_splitter.split_data()
except Exception as e:
    logger.exception(e)
    # Bare raise re-raises the active exception with its traceback intact.
    raise

[2025-09-24 01:02:24,215: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-09-24 01:02:24,216: INFO: common: yaml file: params.yaml loaded successfully]
[2025-09-24 01:02:24,217: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-09-24 01:02:24,218: INFO: common: created directory at : artifacts]
[2025-09-24 01:02:24,218: INFO: common: created directory at : artifacts/data_splitting]
[2025-09-24 01:02:24,376: INFO: 1287289255: Read data with shape: (250331, 14)]
[2025-09-24 01:02:29,570: INFO: 1287289255: Preprocessed data saved to artifacts/data_splitting]
[2025-09-24 01:02:29,572: INFO: 1287289255: Processed train shape: (200264, 71), Processed test shape: (50067, 71)]
