In [35]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression , Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Load the churn table; `data` keeps the frame as read, `df` is the working copy.
data = pd.read_csv('Churn_Modelling.csv')
df = data.copy()

# Identifier columns are not used as features. errors='ignore' below means
# any of them that is absent from the frame is silently skipped.
id_columns = ['RowNumber', 'CustomerId', 'Surname']

# Feature matrix: remove 'Exited' and 'Balance' (each is used as a target by a
# later cell), then remove the identifier columns.
X = (
    df
    .drop(columns=['Exited', 'Balance'])
    .drop(columns=id_columns, errors='ignore')
)

# Column groups are derived from X only, so targets and IDs can never appear.
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)







# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer code. Categories not seen during fit encode as -1.
# NOTE(review): integer codes impose an order on the categories; if these
# columns are nominal, a one-hot encoding may suit the linear models better.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)



# Numerical branch: median-impute missing values, then standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)


# Route each column group through its own branch. Output columns follow the
# order listed here: categorical block first, numerical block second.
column_branches = [
    ('cat', categorical_pipeline, categorical_cols),
    ('num', numerical_pipeline, numerical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)




In [36]:
# Classification target.
y_class = df['Exited']

# Preprocess -> L1-based feature selection -> L2 logistic regression.
#
# The pipeline receives its own clone of `preprocessor`. Pipeline.fit fits its
# steps in place (no cloning when caching is disabled), so passing the shared
# instance would let any other pipeline built on `preprocessor` overwrite this
# model's fitted transformer when it is fitted. With the clone, the
# module-level `preprocessor` stays an unfitted template; the fitted
# transformer is model_classifier.named_steps['preprocessor'].
model_classifier = Pipeline([
    ('preprocessor', clone(preprocessor)),
    # Features are kept or dropped based on the coefficients of an
    # L1-penalized logistic regression (liblinear supports the L1 penalty).
    ('feature_selection', SelectFromModel(
        LogisticRegression(
            penalty='l1',
            solver='liblinear',
            C=1.0,
            max_iter=1000,
            random_state=42
        )
    )),
    # Final estimator, trained on the selected features only.
    ('classifier', LogisticRegression(
        penalty='l2',
        max_iter=1000,
        random_state=42
    ))
])

# Last expression of the cell: fits the pipeline and displays it.
model_classifier.fit(X, y_class)




In [37]:
# Regression target.
y_reg = df['Balance']

# Preprocess -> Ridge-based feature selection -> Ridge regression.
#
# The pipeline receives its own clone of `preprocessor`. Pipeline.fit fits its
# steps in place (no cloning when caching is disabled), so passing the shared
# instance would refit the very transformer object that any other pipeline
# built on `preprocessor` is holding. With the clone, the module-level
# `preprocessor` stays an unfitted template; the fitted transformer is
# model_regression.named_steps['preprocessor'].
model_regression = Pipeline([
    ('preprocessor', clone(preprocessor)),
    # Keep the features whose Ridge coefficient magnitude reaches the median.
    ('feature_selection', SelectFromModel(
        Ridge(alpha=1.0),
        threshold='median'
    )),
    # Final estimator, trained on the selected features only.
    ('regressor', Ridge(alpha=1.0))
])

# Last expression of the cell: fits the pipeline and displays it.
model_regression.fit(X, y_reg)


In [38]:
def get_transformed_X(model, X):
    """Return X after the model's preprocessing and feature-selection steps.

    Parameters
    ----------
    model : fitted pipeline exposing ``named_steps`` with the entries
        'preprocessor' and 'feature_selection'.
    X : input features, passed to the preprocessor's ``transform``.

    Returns
    -------
    The output of the 'feature_selection' step applied to the preprocessed X,
    i.e. the matrix the pipeline's final estimator is given.
    """
    steps = model.named_steps
    preprocessed = steps['preprocessor'].transform(X)
    return steps['feature_selection'].transform(preprocessed)


In [39]:
# Feature matrices as seen by each pipeline's final estimator.
Xt_class = get_transformed_X(model=model_classifier, X=X)
Xt_reg = get_transformed_X(model=model_regression, X=X)


In [40]:
# Show the first row of the classifier's selected feature matrix.
Xt_class[0]


array([ 0.        ,  0.        , -0.32622142,  0.29351742, -1.04175968,
       -0.91158349,  0.64609167,  0.97024255,  0.02188649])

In [41]:
# Show the regression model's selected feature matrix.
# NOTE(review): this displays the whole array; a slice such as Xt_reg[:5]
# would give a shorter preview.
Xt_reg

array([[ 0.        ,  0.29351742, -0.91158349,  0.64609167,  0.02188649],
       [ 2.        ,  0.19816383, -0.91158349, -1.54776799,  0.21653375],
       [ 0.        ,  0.29351742,  2.52705662,  0.64609167,  0.2406869 ],
       ...,
       [ 0.        , -0.27860412, -0.91158349, -1.54776799, -1.00864308],
       [ 1.        ,  0.29351742,  0.80773656,  0.64609167, -0.12523071],
       [ 0.        , -1.04143285, -0.91158349,  0.64609167, -1.07636976]])