# Classifier
- Pipeline
- make_pipeline
- VotingClassifier
- ColumnTransformer
- make_column_transformer
- make_column_selector

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# --- Data ---------------------------------------------------------------
# Iris as a DataFrame so the dtype-based column selectors below can work.
data = load_iris()
X, y = pd.DataFrame(data.data, columns=data.feature_names), data.target

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing ------------------------------------------------------
# Columns are routed by dtype: numeric ones are standardised, object ones
# are one-hot encoded. Iris features are all numeric, so the object-dtype
# selector is expected to match no columns for this dataset.
numeric_transformer, categorical_transformer = StandardScaler(), OneHotEncoder()
numeric_columns = make_column_selector(dtype_include=np.number)
categorical_columns = make_column_selector(dtype_include=object)
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_columns),
    (categorical_transformer, categorical_columns),
)

# --- Models -------------------------------------------------------------
clf1 = RandomForestClassifier(n_estimators=50, random_state=42)
clf2 = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
# probability=True is needed so SVC can take part in soft voting.
clf3 = SVC(kernel='rbf', probability=True, random_state=42)

# Soft voting averages the class probabilities of the three models.
estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3)]
voting_clf = VotingClassifier(estimators=estimators, voting='soft')

# --- Train and evaluate -------------------------------------------------
# Preprocessing and ensemble are chained so both are fitted on the training
# split only; fit() returns the pipeline, so predict() can be chained.
pipeline = make_pipeline(preprocessor, voting_clf)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


## Another dataset: customizing the steps in the pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Titanic dataset
url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
data = pd.read_csv(url)

# Prepare the dataset
# Drop free-text / identifier-like columns and rows with a missing target.
# Titanic CSV variants differ in which columns they carry, so columns that
# are absent are skipped instead of raising KeyError.
# NOTE(review): this CS109 copy appears to ship without 'Ticket', 'Cabin'
# and 'Embarked' -- confirm against the downloaded header.
data = data.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')
data = data.dropna(subset=['Survived'])

# Separate features and target
X = data.drop(columns='Survived')
y = data['Survived']

# Fill missing values
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and has no effect when
# pandas copy-on-write is enabled.
if 'Age' in X.columns:
    X['Age'] = X['Age'].fillna(X['Age'].median())
# Only impute the port of embarkation when the column exists and has at
# least one observed value to take the mode from.
if 'Embarked' in X.columns and X['Embarked'].notna().any():
    X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessing steps
numeric_transformer = StandardScaler()
# handle_unknown='ignore' keeps predict() from failing if the test split
# contains a category that was never seen during fit.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create the ColumnTransformer: numeric columns are scaled, object (string)
# columns are one-hot encoded.
preprocessor = make_column_transformer(
    (numeric_transformer, make_column_selector(dtype_include=np.number)),
    (categorical_transformer, make_column_selector(dtype_include=object))
)

# Define the classifiers
clf1 = RandomForestClassifier(n_estimators=50, random_state=42)
clf2 = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
# probability=True is required for SVC to take part in soft voting.
clf3 = SVC(kernel='rbf', probability=True, random_state=42)

# Create the VotingClassifier (soft voting averages predicted probabilities)
voting_clf = VotingClassifier(
    estimators=[
        ('rf', clf1),
        ('lr', clf2),
        ('svc', clf3)
    ],
    voting='soft'
)

# Create the final pipeline
pipeline = make_pipeline(preprocessor, voting_clf)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


# Regressor
- Pipeline
- make_pipeline
- VotingRegressor 
- ColumnTransformer
- make_column_transformer
- make_column_selector

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the California housing dataset
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# The California housing dataset only has numerical features, so a synthetic
# categorical column is added purely to exercise the OneHotEncoder branch.
# It is drawn from a seeded generator so the run is reproducible, and it is
# added BEFORE the split so no column is assigned into the frames returned by
# train_test_split (which can trigger pandas' SettingWithCopyWarning).
rng = np.random.default_rng(42)
X['SyntheticCat'] = rng.choice(['A', 'B', 'C'], size=len(X))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessing steps
numeric_transformer = StandardScaler()
# handle_unknown='ignore' keeps predict() from failing on a category that
# was not present in the training split.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create the ColumnTransformer: numeric columns are scaled, object (string)
# columns are one-hot encoded.
preprocessor = make_column_transformer(
    (numeric_transformer, make_column_selector(dtype_include=np.number)),
    (categorical_transformer, make_column_selector(dtype_include=object))
)

# Define the regressors
reg1 = RandomForestRegressor(n_estimators=50, random_state=42)
reg2 = LinearRegression()
reg3 = SVR(kernel='rbf')

# Create the VotingRegressor (averages the predictions of the three models)
voting_reg = VotingRegressor(
    estimators=[
        ('rf', reg1),
        ('lr', reg2),
        ('svr', reg3)
    ]
)

# Create the final pipeline
pipeline = make_pipeline(preprocessor, voting_reg)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')


## Another dataset: customizing the steps in the pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the Boston Housing dataset
# `sklearn.datasets.load_boston` was removed in scikit-learn 1.2 (importing it
# raises ImportError), so the data is read from the original StatLib source,
# following the replacement snippet given in scikit-learn's removal notice.
# NOTE: scikit-learn dropped this dataset because of ethical concerns about
# the 'B' feature; it is kept here for demonstration only.
boston_url = 'http://lib.stat.cmu.edu/datasets/boston'
boston_feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
# The file starts with a 22-line text header, then stores every record on two
# physical lines: 11 values on the first, 3 on the second (2 features + target).
raw_df = pd.read_csv(boston_url, sep=r'\s+', skiprows=22, header=None)
boston_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
X = pd.DataFrame(boston_data, columns=boston_feature_names)
y = raw_df.values[1::2, 2]

# Simulate a categorical column for demonstration: casting CHAS to str gives
# it object dtype, so the selector below routes it to the OneHotEncoder.
X['CHAS'] = X['CHAS'].astype(str)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessing steps
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Create the ColumnTransformer: numeric columns are scaled, object (string)
# columns are one-hot encoded.
preprocessor = make_column_transformer(
    (numeric_transformer, make_column_selector(dtype_include=np.number)),
    (categorical_transformer, make_column_selector(dtype_include=object))
)

# Define the regressors
reg1 = RandomForestRegressor(n_estimators=50, random_state=42)
reg2 = LinearRegression()
reg3 = SVR(kernel='rbf')

# Create the VotingRegressor (averages the predictions of the three models)
voting_reg = VotingRegressor(
    estimators=[
        ('rf', reg1),
        ('lr', reg2),
        ('svr', reg3)
    ]
)

# Create the final pipeline
pipeline = make_pipeline(preprocessor, voting_reg)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')
