## Q1. End-to-End Pipeline with Feature Selection, Imputation, Scaling, and Random Forest

In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# Fetch the Titanic dataset from OpenML as a pandas DataFrame.
data = fetch_openml("titanic", version=1, as_frame=True)

# Build the modelling frame in one chain: remove the columns that are not
# used as features, then cast the 'survived' target to an integer dtype.
df = (
    data.frame
    .drop(columns=['name', 'ticket', 'boat', 'body', 'home.dest'])
    .astype({'survived': 'int'})
)

# Target vector and feature matrix (every remaining column except the target).
y = df['survived']
X = df.drop(columns='survived')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Partition the feature columns by dtype. Selecting on the generic 'number'
# kind (instead of the exact 'int64'/'float64' dtypes) keeps numeric columns
# stored with any other width (e.g. int32, float32) in the numeric branch, and
# sending every remaining column to the categorical branch guarantees that no
# feature falls into neither list -- such a column would be silently discarded
# by ColumnTransformer, whose default is remainder='drop'.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

# Numeric branch: fill missing values with the column mean, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' lets the encoder accept categories
# at predict time that were not present in the training split.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])
# Feature-selection stage: a random forest wrapped in SelectFromModel, with the
# selection threshold left at the library default.
selector_forest = RandomForestClassifier(n_estimators=100, random_state=42)
feature_selector = SelectFromModel(selector_forest)

# Full model: preprocess -> select features -> classify with a second,
# independently seeded random forest.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('feature_selection', feature_selector),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Fit every stage on the training split only, then predict the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the fraction of held-out rows classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.79


## Q2. Ensemble Model Using Voting Classifier (Random Forest + Logistic Regression)

In [6]:
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier

# Load the iris dataset as a feature matrix and a label vector.
# NOTE: this cell rebinds X, y, X_train, X_test, y_train, y_test and y_pred,
# so the Titanic variables from the Q1 cell are overwritten once it runs.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for evaluation, with a fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base learners combined by the ensemble.
clf1 = LogisticRegression(max_iter=1000, random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)

# Soft voting averages the predicted class probabilities of the two models.
# With exactly two estimators, hard (majority) voting has no real majority:
# whenever the models disagree the vote is tied, and the tie is resolved by
# class label order rather than by either model's confidence. Both estimators
# expose predict_proba, so soft voting is available here.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
    ],
    voting='soft',
)

# Fit the ensemble on the training split and evaluate on the held-out split.
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"Voting Classifier Accuracy: {acc:.2f}")


Voting Classifier Accuracy: 1.00
