In [2]:
import os

In [3]:
# Show the kernel's current working directory before the next cell changes it.
%pwd

'/Users/suyash/Desktop/End to End/research'

In [4]:
# Move up to the parent folder so that the `src` package and the relative
# config paths resolve. The guard makes the cell idempotent: an unconditional
# chdir would climb one more level on every re-run of this cell.
# NOTE(review): assumes `src/` exists only in the project root — confirm.
if not os.path.isdir('src'):
    os.chdir('../')

In [5]:
import pandas as pd

In [25]:
from dataclasses import dataclass
from pathlib import Path    


@dataclass
class DataSplitterConfig:
    """Settings for the train/test splitting stage.

    The field order defines the positional constructor signature, so it must
    not be rearranged.
    """

    # Directory holding the artifacts of this stage.
    root_dir: Path
    # Destination CSV file for the training split.
    train_data: Path
    # Destination CSV file for the test split.
    test_data: Path
    # Handed to train_test_split as `test_size`.
    split_ratio: float
    # Seed handed to train_test_split as `random_state`.
    random_state: int
    # Despite the "_dir" suffix, this is read as a CSV file by the splitter.
    unzip_data_dir: Path
    # Column names taken from the schema entry 'Categorial_COLUMNS'.
    categorical_columns: list
    # Column names taken from the schema entry 'Numerical_COLUMNS'.
    numerical_columns: list

In [20]:
from src.IPL_Predictor.constants import *
from src.IPL_Predictor.utils.common import read_yaml, create_directories
from src.IPL_Predictor import logger

In [26]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # NOTE(review): read_yaml is assumed to return an object supporting both
        # attribute access and item access, as used below — confirm in utils.common.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_data_split_config(self) -> DataSplitterConfig:
        """Build the configuration for the data-splitting stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataSplitterConfig populated from the `data_splitter` section of the
            config file and the column lists of the schema file.
        """
        config = self.config.data_splitter
        create_directories([config.root_dir])

        # DataSplitterConfig annotates its location fields as Path, but the
        # values read from YAML are not Path objects; convert them here so the
        # dataclass actually holds what it declares.
        data_splitter_config = DataSplitterConfig(
            root_dir=Path(config.root_dir),
            train_data=Path(config.train_data),
            test_data=Path(config.test_data),
            split_ratio=config.split_ratio,
            random_state=config.random_state,
            unzip_data_dir=Path(config.unzip_data_dir),
            # The key spelling ('Categorial') must match the schema file.
            categorical_columns=self.schema['Categorial_COLUMNS'],
            numerical_columns=self.schema['Numerical_COLUMNS']
        )
        return data_splitter_config

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd


In [28]:
class Data_Splitting:
    """Splits the source CSV into a train CSV and a test CSV."""

    def __init__(self, config: DataSplitterConfig):
        """Keep the stage configuration for later use by split_data."""
        self.config = config

    def split_data(self):
        """Read the source CSV, split it and write both parts to disk.

        The rows are divided with train_test_split using the configured
        `split_ratio` (as test_size) and `random_state`. Both parts are written
        without the index column, and their shapes are logged.

        Returns:
            tuple: (train_set, test_set) DataFrames, so callers can inspect or
            assert on the result without re-reading the files.
        """
        df = pd.read_csv(self.config.unzip_data_dir)

        # No preprocessing object is built here: the split works on the raw
        # frame, so the configured column lists are not needed at this stage.
        train_set, test_set = train_test_split(
            df,
            test_size=self.config.split_ratio,
            random_state=self.config.random_state
        )

        # Make sure the destination folders exist, so the class also works
        # when the directories were not prepared beforehand.
        for destination in (self.config.train_data, self.config.test_data):
            Path(destination).parent.mkdir(parents=True, exist_ok=True)

        train_set.to_csv(self.config.train_data, index=False)
        test_set.to_csv(self.config.test_data, index=False)

        logger.info(f"Train and Test data saved in {self.config.root_dir}")
        logger.info(f"Train data shape: {train_set.shape} and Test data shape: {test_set.shape}")

        return train_set, test_set

In [29]:
# Run the stage end to end: load the configuration, build the splitter and
# write the train/test CSV files.
try:
    config = ConfigurationManager()
    data_split_config = config.get_data_split_config()
    data_splitter = Data_Splitting(config=data_split_config)
    data_splitter.split_data()
except Exception as e:
    # Record the traceback in the project log, then re-raise the active
    # exception unchanged so the failure stays visible in the notebook.
    logger.exception(e)
    raise

[2025-09-24 00:46:54,925: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-09-24 00:46:54,926: INFO: common: yaml file: params.yaml loaded successfully]
[2025-09-24 00:46:54,928: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-09-24 00:46:54,928: INFO: common: created directory at : artifacts]
[2025-09-24 00:46:54,928: INFO: common: created directory at : artifacts/data_splitting]
[2025-09-24 00:46:55,764: INFO: 1632313070: Train and Test data saved in artifacts/data_splitting]
[2025-09-24 00:46:55,765: INFO: 1632313070: Train data shape: (200264, 14) and Test data shape: (50067, 14)]
