# Q1

In [36]:
# Importing all libraries necessary:-
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [37]:
# Load seaborn's "tips" example dataset as the working DataFrame.
df = sns.load_dataset("tips")

In [38]:
# Replace the 'time' column with integer class labels; 'time' is the
# prediction target further down, so it is label-encoded rather than one-hot encoded.
encoder = LabelEncoder().fit(df['time'])
df['time'] = encoder.transform(df['time'])

In [39]:
# Preview the first rows to confirm that 'time' now holds integer codes.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.5,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4


In [40]:
# Separate the target ('time') from the remaining predictor columns.
y = df['time']
X = df.drop(columns='time')

In [41]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Feature groups: each list is routed to its own preprocessing pipeline below.
numerical_cols = ['total_bill', 'tip', 'size']
categorical_cols = ['sex', 'smoker', 'day']

In [43]:
# Numerical preprocessing: fill missing values with the column mean,
# then standardise each feature.
num_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [44]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# that was not present when the encoder was fitted (e.g. a value that only
# occurs in the test split) instead of raising an error at transform time.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [45]:
# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [46]:
# All of the feature engineering is applied automatically in this step.
# The preprocessor is fitted on the training split only, and the same fitted
# transformation is then applied to both splits.
# NOTE(review): X_train / X_test are overwritten with the transformed output, so
# re-running this cell on its own will likely fail (columns are selected by
# name) -- re-run the train/test split cell first.
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

# Building the random forest classifier model:-

In [48]:
# Create the random forest classifier.
# A fixed random_state makes the fitted forest (and therefore the reported
# accuracy) reproducible across runs, matching the seeded train/test split.
rfc = RandomForestClassifier(random_state=42)

In [49]:
# Train the random forest on the preprocessed training features and labels.
rfc.fit(X_train,y_train)

In [50]:
# Predict the encoded 'time' label for each row of the preprocessed test set.
y_pred=rfc.predict(X_test)

In [51]:
# Share of correct test-set predictions, expressed as a percentage.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"The accuracy of our random forest classifier model is :{accuracy:.2f} %")

The accuracy of our random forest classifier model is :95.92 %


# Interpretations: The pipeline handles missing values and applies the appropriate preprocessing to each feature type: numerical features are imputed and scaled, and categorical features are imputed and one-hot encoded. The Random Forest Classifier is used as the final model to make predictions. The accuracy score on the test data provides an indication of how well the model performs on unseen data.

# Possible improvements:

# 1) Hyperparameter tuning: 
We can perform hyperparameter tuning for the Random Forest Classifier to optimize its performance.

# 2) Feature selection:
The model currently uses every available feature without any feature selection; we can experiment with different feature selection techniques to improve the model's efficiency and generalization.

# 3) Different models: 
We can experiment with other classification algorithms to see if they perform better on our dataset.

# Q2

In [68]:
# Importing important libraries:-
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [69]:
# Load the Iris dataset: features as a DataFrame with named columns, plus the target vector
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# Hold out 20% of the samples for testing (seeded for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [70]:
# Display the feature DataFrame built from iris.data / iris.feature_names.
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [71]:
# Numerical preprocessing for the Iris features: mean imputation followed by
# standardisation. Pipeline, SimpleImputer and StandardScaler come from the
# Q1 import cell, so that cell must have been run first.
num_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [72]:
# Apply the numerical pipeline to every column of X.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, list(X.columns)),
    ]
)

In [73]:
# Fit the preprocessing on the training split only, then apply the same fitted
# transformation to both splits.
# NOTE(review): X_train / X_test are overwritten with the transformed output, so
# re-running this cell on its own will likely fail -- re-run the split cell first.
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [74]:
# Random forest base estimator; the fixed random_state makes the ensemble's
# predictions reproducible across runs, matching the seeded train/test split.
rfc = RandomForestClassifier(random_state=42)

# Logistic regression base estimator (default settings).
lrc = LogisticRegression()

In [75]:
# Combine the two base estimators into a single ensemble that predicts by
# hard voting, i.e. on the predicted class labels.
# NOTE(review): with only two estimators a hard vote can end in a tie; check the
# VotingClassifier documentation for how ties are resolved, or add a third model.
voting_classifier = VotingClassifier(
    [('rf', rfc), ('lr', lrc)],
    voting='hard',
)


In [77]:
# Train the ensemble on the training split and predict the held-out test split
y_pred = voting_classifier.fit(X_train, y_train).predict(X_test)

# Fraction of test samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 1.00
