In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression , Ridge


# Read the raw CSV once and keep `data` pristine; all work happens on the copy.
data = pd.read_csv('Churn_Modelling.csv')
df = data.copy()

# Candidate target column names, tried in this order of priority.
common_targets = ['Exited', 'Profit', 'Purchased', 'Target', 'Class', 'Label', 'y','Balance']

# Take the first candidate that exists in the frame.
target_col = None
for name in common_targets:
    if name not in df.columns:
        continue
    target_col = name
    break

# No known name matched: fall back to the last column of the frame.
target_col = df.columns[-1] if target_col is None else target_col

y = df[target_col]

# -----------------------------
# Identifier-like columns to exclude from the features
# (only names that actually occur in the frame are kept)
# -----------------------------
id_patterns = ['RowNumber', 'CustomerId', 'CustomerID', 'Surname', 'Id', 'ID', 'index']
id_columns = [pattern for pattern in id_patterns if pattern in df.columns]

# -----------------------------
# Feature matrix: the frame minus the target and the identifier columns
# -----------------------------
drop_cols = [target_col, *id_columns]
X = df.drop(columns=drop_cols, errors='ignore')



# -----------------------------
# Column type detection
# -----------------------------
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every integer/float width (int32, float32, ...), not just
# int64/float64, so narrower numeric columns are no longer silently left out
# of the ColumnTransformer. select_dtypes counts timedeltas as numbers, hence
# the explicit exclusion.
numerical_cols = X.select_dtypes(include=['number'], exclude=['timedelta']).columns.tolist()


# -----------------------------
# Pipelines
# -----------------------------
# Categorical branch: fill missing values with the most frequent category,
# then map categories to integer codes; categories unseen during fit get -1.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ))
])

# Numerical branch: fill missing values with the median, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# -----------------------------
# Preprocessor
# -----------------------------
# Register a branch only when it has columns to work on.
transformers = []
if categorical_cols:
    transformers.append(('cat', categorical_pipeline, categorical_cols))
if numerical_cols:
    transformers.append(('num', numerical_pipeline, numerical_cols))

# Fail here with a clear message instead of deep inside model.fit when the
# feature matrix has no usable column at all.
if not transformers:
    raise ValueError("No valid features found for preprocessing")

preprocessor = ColumnTransformer(transformers=transformers)



# Decide task type.
# - Label-like targets (object / string / categorical dtype) are always
#   classification: a regressor cannot be fitted on them.
# - Boolean or integer targets are classification only when they take at most
#   10 distinct values.
# - Anything else (e.g. float targets) is treated as regression.
is_classification = (
    y.dtype.kind in 'OSU'
    or (y.dtype.kind in 'biu' and y.nunique() <= 10)
)





if is_classification:
    print("Task detected: CLASSIFICATION")

    # An L1-penalised logistic regression drives the feature selection;
    # an L2-penalised one is the final classifier.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectFromModel(
            LogisticRegression(penalty='l1', solver='liblinear', C=1.0,
                               max_iter=1000, random_state=42)
        )),
        ('classifier', LogisticRegression(penalty='l2', max_iter=1000, random_state=42)),
    ])
else:
    print("Task detected: REGRESSION")

    # Ridge drives the feature selection (median importance as threshold)
    # and is also the final regressor.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectFromModel(Ridge(alpha=1.0), threshold='median')),
        ('regressor', Ridge(alpha=1.0)),
    ])

# Fit preprocessing, feature selection and the final estimator in one call.
model.fit(X, y)



def get_transformed_X(model, X):
    """Return ``X`` after the pipeline's preprocessing and feature selection.

    ``model`` must expose ``named_steps`` containing already-fitted
    ``'preprocessor'`` and ``'feature_selection'`` steps; the final estimator
    is not applied.
    """
    stages = model.named_steps
    preprocessed = stages['preprocessor'].transform(X)
    return stages['feature_selection'].transform(preprocessed)


# Feature matrix as seen by the final estimator: preprocessed, then feature-selected.
Xt = get_transformed_X(model=model, X=X)


Task detected: CLASSIFICATION


In [2]:
# Peek at the first row of the preprocessed, feature-selected matrix.
Xt[0]


array([ 0.        ,  0.        , -0.32622142,  0.29351742, -1.04175968,
       -1.22584767, -0.91158349,  0.64609167,  0.97024255,  0.02188649])

In [3]:
# Output column names of the fitted preprocessor, in the order of its output columns.
feature_names = model['preprocessor'].get_feature_names_out()


In [13]:
# Display the preprocessor's output feature names.
feature_names

array(['cat__Geography', 'cat__Gender', 'num__CreditScore', 'num__Age',
       'num__Tenure', 'num__Balance', 'num__NumOfProducts',
       'num__HasCrCard', 'num__IsActiveMember', 'num__EstimatedSalary'],
      dtype=object)

In [14]:
# The fitted selector's boolean support mask picks out the names of the
# features it retained.
selector = model['feature_selection']
selected_features = feature_names[selector.get_support()]


In [15]:
# Display the names of the features retained by the selector.
selected_features

array(['cat__Geography', 'cat__Gender', 'num__CreditScore', 'num__Age',
       'num__Tenure', 'num__Balance', 'num__NumOfProducts',
       'num__HasCrCard', 'num__IsActiveMember', 'num__EstimatedSalary'],
      dtype=object)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer


def build_preprocessor(X):
    """
    Build an unfitted ColumnTransformer for the columns of ``X``.

    Works with:
    - only numerical columns
    - only categorical columns
    - mixed datasets

    Categorical columns (``object`` / ``category`` dtype) are imputed with the
    most frequent value and ordinal-encoded; categories unseen at fit time are
    encoded as ``-1``. Numerical columns (integers and floats of any width)
    are imputed with the median and standardised. Columns of any other dtype
    are not assigned to a transformer.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame whose column dtypes decide the routing.

    Returns
    -------
    ColumnTransformer
        Transformer with a ``'cat'`` and/or ``'num'`` branch.

    Raises
    ------
    ValueError
        If ``X`` has neither categorical nor numerical columns.
    """

    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    # 'number' matches every integer/float width (int32, float32, ...), not
    # just int64/float64, so narrower numeric columns are no longer silently
    # left out. select_dtypes counts timedeltas as numbers, hence the explicit
    # exclusion.
    numerical_cols = X.select_dtypes(include=['number'], exclude=['timedelta']).columns.tolist()

    transformers = []

    if categorical_cols:
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(
                handle_unknown='use_encoded_value',
                unknown_value=-1
            ))
        ])
        transformers.append(('cat', categorical_pipeline, categorical_cols))

    if numerical_cols:
        numerical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        transformers.append(('num', numerical_pipeline, numerical_cols))

    # Nothing to route: fail early with a clear message.
    if not transformers:
        raise ValueError("No valid features found for preprocessing")

    return ColumnTransformer(transformers)
