# make_column_transformer
The make_column_transformer function in scikit-learn is a convenience function for creating a ColumnTransformer without explicitly naming each transformer; the names are generated automatically from the transformer types. This simplifies the creation of a ColumnTransformer by reducing boilerplate code.

## How It Works
1. Initialization: make_column_transformer is called with one tuple per transformer, passed as separate positional arguments (not wrapped in a list). Each tuple contains:
- The transformer object (e.g., StandardScaler, OneHotEncoder).
- The column(s) to which the transformer should be applied.
2. Fitting and Transforming: The function returns a regular ColumnTransformer, so the result has the usual fit, transform, and fit_transform methods. It fits each transformer on its specified columns, transforms those columns, and concatenates the results into a single feature matrix.

## Simple Example

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

# Fetch the Titanic data and keep two numeric and two categorical features
data = fetch_openml(name='titanic', version=1, as_frame=True)
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex']
X = data.data[numeric_features + categorical_features]
y = data.target

# Numeric columns: fill gaps with the column mean, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Build the ColumnTransformer; no transformer names have to be supplied
preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    (categorical_pipeline, categorical_features),
)

# Chain preprocessing and the classifier so they are fitted as one estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Train on the training split
model.fit(X_train, y_train)

# Predict labels for the held-out rows
predictions = model.predict(X_test)


## Complex Example

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

# Load the Adult census dataset and keep three numeric and three categorical features
data = fetch_openml(name='adult', version=2, as_frame=True)
numeric_features = ['age', 'education-num', 'hours-per-week']
categorical_features = ['workclass', 'occupation', 'sex']
X = data.data[numeric_features + categorical_features]
y = data.target

# Numeric columns: median-impute, expand to degree-2 polynomial terms, standardize.
# include_bias=False drops the constant column PolynomialFeatures adds by default;
# after StandardScaler it would be an all-zero feature carrying no information.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Create the ColumnTransformer using make_column_transformer
preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    (categorical_pipeline, categorical_features),
)

# Combine preprocessor and classifier in a pipeline.
# random_state is fixed so repeated runs give the same forest, matching the seeded split.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=0)),
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model
model.fit(X_train, y_train)

# Predict
predictions = model.predict(X_test)


## Very Complex Example

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import fetch_openml

# Load dataset
data = fetch_openml(name='bank-marketing', version=1, as_frame=True)

# OpenML publishes this dataset with anonymous column names (V1..V16), so
# selecting 'age', 'balance', ... directly raises KeyError. Restore the original
# UCI column names (in file order) unless descriptive names are already present.
bank_columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
                'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
features = data.data.copy()
if 'age' not in features.columns:
    features.columns = bank_columns

numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
X = features[numeric_features + categorical_features]
y = data.target

# Create the ColumnTransformer using make_column_transformer
preprocessor = make_column_transformer(
    # Numeric columns: mean-impute, then standardize
    (Pipeline([('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())]), numeric_features),
    # Categorical columns: fill gaps with the most frequent value, then one-hot encode
    (Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
)

# Combine preprocessor, feature selection and classifier in a pipeline.
# chi2 only accepts non-negative features and fails on standardized data
# ("Input X must be non-negative"), so rely on SelectKBest's default scorer,
# the ANOVA F-test (f_classif), which works with negative values.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(k=10)),
    ('classifier', GradientBoostingClassifier())
])

# Grid search for hyperparameter tuning; keys use the '<step>__<param>' convention
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.1, 0.01],
    'classifier__max_depth': [3, 5]
}

# 5-fold cross-validation over all 8 combinations, using every available core
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Predict
predictions = best_model.predict(X_test)


# Testing make_column_transformer with Pipeline

In [None]:
# File: test_preprocessing.py
import unittest
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Sample data with missing values in every column
data = pd.DataFrame({
    'age': [25, np.nan, 35, np.nan, 40],
    'fare': [7.25, 71.83, 8.05, np.nan, 15.5],
    'embarked': ['S', 'C', np.nan, 'Q', 'S'],
    'sex': ['male', 'female', 'female', np.nan, 'male']
})

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Create the ColumnTransformer using make_column_transformer.
# sparse_threshold=0 forces a dense ndarray so the slicing and np.isnan checks
# below do not depend on the density heuristic choosing a dense result.
preprocessor = make_column_transformer(
    (numerical_transformer, ['age', 'fare']),
    (categorical_transformer, ['embarked', 'sex']),
    sparse_threshold=0
)

transformed_data = preprocessor.fit_transform(data)

# Expected layout: 2 scaled numeric columns, 3 one-hot columns for 'embarked'
# (C, Q, S) and 2 one-hot columns for 'sex' (female, male).
assert transformed_data.shape == (len(data), 7), "Unexpected output shape"

# Test numerical imputation
assert not np.any(np.isnan(transformed_data[:, :2])), "Numerical imputation failed"

# Test categorical imputation: after imputing, every row must have exactly one
# active category in each one-hot group (both 'embarked' and 'sex').
embarked_onehot = transformed_data[:, 2:5]
sex_onehot = transformed_data[:, 5:7]
assert np.allclose(embarked_onehot.sum(axis=1), 1), "Categorical imputation failed"
assert np.allclose(sex_onehot.sum(axis=1), 1), "Categorical imputation failed"

# Test numerical scaling: both numeric columns should have zero mean and unit variance
age_scaled = transformed_data[:, 0]
fare_scaled = transformed_data[:, 1]
assert np.isclose(np.mean(age_scaled), 0, atol=1e-6), "Numerical scaling failed"
assert np.isclose(np.mean(fare_scaled), 0, atol=1e-6), "Numerical scaling failed"
assert np.allclose(transformed_data[:, :2].std(axis=0), 1, atol=1e-6), "Numerical scaling failed"

print("All tests passed successfully.")

# Evaluating Models Using Precision, Recall, and F1-Score

## Simple Example

In [None]:
# File: main_model_simple.py

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report

# Fetch the Titanic data and keep two numeric and two categorical features
data = fetch_openml(name='titanic', version=1, as_frame=True)
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex']
X = data.data[numeric_features + categorical_features]
y = data.target

# Numeric columns: fill gaps with the column mean, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Build the ColumnTransformer; no transformer names have to be supplied
preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    (categorical_pipeline, categorical_features),
)

# Chain preprocessing and the classifier so they are fitted as one estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Train on the training split
model.fit(X_train, y_train)

# Predict on the held-out rows and print per-class precision, recall and F1-score
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))


## Complex Example

In [None]:
# File: main_model_complex.py

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report

# Load the Adult census dataset and keep three numeric and three categorical features
data = fetch_openml(name='adult', version=2, as_frame=True)
numeric_features = ['age', 'education-num', 'hours-per-week']
categorical_features = ['workclass', 'occupation', 'sex']
X = data.data[numeric_features + categorical_features]
y = data.target

# Numeric columns: median-impute, expand to degree-2 polynomial terms, standardize.
# include_bias=False drops the constant column PolynomialFeatures adds by default;
# after StandardScaler it would be an all-zero feature carrying no information.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Create the ColumnTransformer using make_column_transformer
preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    (categorical_pipeline, categorical_features),
)

# Combine preprocessor and classifier in a pipeline.
# random_state is fixed so the reported metrics are reproducible, matching the seeded split.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=0)),
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model
model.fit(X_train, y_train)

# Predict and evaluate: print per-class precision, recall and F1-score
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))


## Very Complex Example

In [None]:
# File: main_model_very_complex.py

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report

# Load dataset
data = fetch_openml(name='bank-marketing', version=1, as_frame=True)

# OpenML publishes this dataset with anonymous column names (V1..V16), so
# selecting 'age', 'balance', ... directly raises KeyError. Restore the original
# UCI column names (in file order) unless descriptive names are already present.
bank_columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
                'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
features = data.data.copy()
if 'age' not in features.columns:
    features.columns = bank_columns

numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
X = features[numeric_features + categorical_features]
y = data.target

# Create the ColumnTransformer using make_column_transformer
preprocessor = make_column_transformer(
    # Numeric columns: mean-impute, then standardize
    (Pipeline([('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())]), numeric_features),
    # Categorical columns: fill gaps with the most frequent value, then one-hot encode
    (Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
)

# Combine preprocessor, feature selection and classifier in a pipeline.
# chi2 only accepts non-negative features and fails on standardized data
# ("Input X must be non-negative"), so rely on SelectKBest's default scorer,
# the ANOVA F-test (f_classif), which works with negative values.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(k=10)),
    ('classifier', GradientBoostingClassifier())
])

# Grid search for hyperparameter tuning; keys use the '<step>__<param>' convention
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.1, 0.01],
    'classifier__max_depth': [3, 5]
}

# 5-fold cross-validation over all 8 combinations, using every available core
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Predict and evaluate: print per-class precision, recall and F1-score
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions))
