In [None]:
"""
Q1. You are working on a machine learning project where you have a dataset containing numerical and
categorical features. You have identified that some of the features are highly correlated and there are
missing values in some of the columns. You want to build a pipeline that automates the feature
engineering process and handles the missing values.
Design a pipeline that includes the following steps:
Use an automated feature selection method to identify the important features in the dataset.
Create a numerical pipeline that includes the following steps:
Impute the missing values in the numerical columns using the mean of the column values.
Scale the numerical columns using standardisation.
Create a categorical pipeline that includes the following steps:
Impute the missing values in the categorical columns using the most frequent value of the column.
One-hot encode the categorical columns.
Combine the numerical and categorical pipelines using a ColumnTransformer.
Use a Random Forest Classifier to build the final model.
Evaluate the accuracy of the model on the test dataset.
Note: Your solution should include code snippets for each step of the pipeline, and a brief explanation of
each step. You should also provide an interpretation of the results and suggest possible improvements for
the pipeline.
"""

In [None]:
"""
1. Feature Selection: Use an automated feature selection method such as SelectKBest or Recursive Feature Elimination (RFE) to identify the important features in the dataset.
"""

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Feature-selection stage: a one-step pipeline around a univariate selector
# that scores each column with f_classif and keeps the k=10 best-scoring ones.
feature_selection_pipeline = Pipeline(steps=[
    ('select_k_best', SelectKBest(f_classif, k=10)),
])

# Learn the column scores from the training split, then reduce that same split
# to the selected columns.
# NOTE(review): X_train / y_train are not assigned in any cell above this one
# (the first assignment in this notebook is in the Q2 cell) -- confirm where
# they are expected to come from on a fresh kernel.
# NOTE(review): the selector runs on the raw training data here; the question
# describes missing values and categorical columns, which f_classif presumably
# cannot score -- verify whether selection should follow preprocessing.
X_train_selected = feature_selection_pipeline.fit(X_train, y_train).transform(X_train)


In [None]:
"""
2. Numerical Pipeline: Create a numerical pipeline that imputes missing values in the numerical columns with the mean of the column values and scales the numerical columns using standardization.
"""

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Cell-local imports: the dtype-based column selector and numpy (for the dtype
# it filters on) are not imported by any earlier cell.
import numpy as np
from sklearn.compose import make_column_selector

# Numerical branch: replace missing values with the column mean, then
# standardise each column (StandardScaler: zero mean, unit variance).
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Select the numerical columns by dtype rather than by a hand-written list of
# names: a name list is only valid if every entry is a real column label (a
# literal `...` entry is the Ellipsis object, not a label). The selector
# resolves the actual column names at fit time, the same approach the Q2 cell
# uses for its ColumnTransformer.
numerical_features = make_column_selector(dtype_include=np.number)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features)
    ])

# Fit the numerical branch on the training data and transform it.
# NOTE(review): column selection by dtype (like selection by name) needs a
# pandas DataFrame -- confirm X_train_selected is still a DataFrame after the
# feature-selection step.
X_train_processed = preprocessor.fit_transform(X_train_selected)


In [None]:
"""
3. Categorical Pipeline: Create a categorical pipeline that imputes missing values in the categorical columns with the most frequent value of the column and one-hot encodes the categorical columns.
"""

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Cell-local import: the dtype-based column selector is not imported by any
# earlier cell.
from sklearn.compose import make_column_selector

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Select the categorical columns by dtype rather than by a hand-written list
# of names: a name list is only valid if every entry is a real column label (a
# literal `...` entry is the Ellipsis object, not a label). Object dtype is
# the same criterion the Q2 cell uses for its categorical branch.
categorical_features = make_column_selector(dtype_include=object)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_features)
    ])

# Fit the categorical branch on the training data and transform it.
# NOTE(review): column selection by dtype (like selection by name) needs a
# pandas DataFrame -- confirm X_train_selected is still a DataFrame after the
# feature-selection step.
X_train_processed = preprocessor.fit_transform(X_train_selected)


In [None]:
"""
4. Combine the Numerical and Categorical Pipelines: Combine the numerical and categorical pipelines using a ColumnTransformer.
"""

In [None]:
# One preprocessor for the whole table: each column group is routed through
# its own branch ('num' -> numerical_pipeline, 'cat' -> categorical_pipeline).
# This rebinds `preprocessor`, so the combined version is the one in scope
# for the cells that follow.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Fit both branches on the selected training features and keep the result
# as the model's training matrix.
X_train_processed = preprocessor.fit_transform(X=X_train_selected)


In [None]:
"""
5. Random Forest Classifier: Use a Random Forest Classifier to build the final model.
"""

In [None]:
# Random forest with 100 trees, each capped at depth 10. random_state is
# fixed so the forest's internal randomness is repeatable and the reported
# accuracy does not change from one run of the notebook to the next
# (42 matches the seed used for the train/test split in the Q2 cell).
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train on the preprocessed training features and their labels.
rf_classifier.fit(X_train_processed, y_train)


In [None]:
"""
6. Model Evaluation: Evaluate the accuracy of the model on the test dataset.
"""

In [None]:
# Send the held-out features through the already-fitted steps in the same
# order used for training. Only transform() is called, so nothing is re-fitted
# on the test split.
X_test_selected = feature_selection_pipeline.transform(X=X_test)
X_test_processed = preprocessor.transform(X=X_test_selected)

# Score the trained classifier on the processed test split and report it.
accuracy = rf_classifier.score(X=X_test_processed, y=y_test)
print(f"Accuracy: {accuracy}")


In [None]:
"""
Q2. Build a pipeline that includes a random forest classifier and a logistic regression classifier, and then
use a voting classifier to combine their predictions. Train the pipeline on the iris dataset and evaluate its
accuracy.
"""

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the iris dataset, as the task statement asks. as_frame=True returns a
# pandas DataFrame (feature columns plus a 'target' column), which the
# dtype-based column selectors below require.
data = load_iris(as_frame=True).frame

# Split the data into features and target
X = data.drop('target', axis=1)
y = data['target']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Numerical columns: mean-impute missing values, then standardise
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: impute with the most frequent value, then one-hot
# encode; handle_unknown='ignore' avoids errors on categories unseen at fit time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route columns to a branch by dtype. Columns are resolved at fit time, so a
# branch whose selector matches no columns simply contributes nothing.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include=object))
])

# Random forest: 100 trees of depth <= 10, seeded so results are repeatable
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Logistic regression with default settings
lr = LogisticRegression()

# Majority-vote ensemble over the two classifiers.
# NOTE(review): with only two voters a hard vote can tie; scikit-learn's docs
# describe ties as resolved by ascending class-label order -- consider
# voting='soft' if that matters for this task.
voting_clf = VotingClassifier(estimators=[
    ('rfc', rfc),
    ('lr', lr)
], voting='hard')

# Full pipeline: preprocessing followed by the voting ensemble, so the
# preprocessing is fitted on the training split only
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('voting_clf', voting_clf)
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Evaluate the accuracy of the pipeline on the test data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
