## Introduction
This notebook demonstrates a supervised machine learning approach to classifying iris flower species.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Data Loading
The dataset is loaded and inspected to understand its structure and features.

In [None]:
from sklearn.datasets import load_iris

# Load the bundled Iris dataset as pandas objects and keep the combined
# DataFrame (the four measurement columns plus the `target` column).
iris = load_iris(as_frame=True)
df = iris["frame"]

In [None]:
# Preview the first five rows to check the column layout.
df.head(n=5)

In [None]:
# Summary statistics per numeric column (count, mean, std, min, quartiles, max).
df.describe(percentiles=[0.25, 0.5, 0.75])

## Exploratory Data Analysis
Basic exploration of the data to identify patterns and understand the relationships between features.

In [None]:
# Scatter matrix of every pair of columns, coloured by the `target` class label.
sns.pairplot(data=df, hue="target")
plt.show()

In [None]:
# Split the frame's underlying array into a feature matrix and a label vector.
# NOTE: `X` is rebound to a DataFrame in the Data Preparation section, so
# re-run this cell before re-running the per-class averaging cell.
data = df.to_numpy()
X = data[:, :4]  # first four columns: the measurements
Y = data[:, 4]   # fifth column: the class label

In [None]:
# Average of every feature within every class.
columns = ['Sepal length', 'Sepal width', 'Petal length', 'Petal width', 'Class_labels']

# Distinct class labels, computed once rather than once per feature.
class_labels = np.unique(Y)

# Y_Data[i, j] is the mean of feature i over the samples whose label is
# class_labels[j]; values are stored as float32.
Y_Data = np.array(
    [
        [np.average(X[Y == label, feature]) for label in class_labels]
        for feature in range(X.shape[1])
    ],
    dtype='float32',
)

# Transpose to (n_classes, n_features) so each row holds one class's averages.
# This replaces a reshape to a hard-coded (4, 3) followed by an axis swap, so it
# no longer assumes exactly four features and three classes.
Y_Data_reshaped = Y_Data.T

# One x position per feature; the last entry of `columns` names the label
# column and is therefore excluded.
X_axis = np.arange(len(columns) - 1)
width = 0.25  # width of a single bar in the grouped bar chart

In [None]:
# Grouped bar chart: for every feature, one bar per species showing its average.
species_names = ['Setosa', 'Versicolour', 'Virginica']

# Row k of Y_Data_reshaped holds the averages for the k-th class; each species
# is shifted by one bar width so the bars of a group sit side by side.
for offset, (species, averages) in enumerate(zip(species_names, Y_Data_reshaped)):
    plt.bar(X_axis + offset * width, averages, width, label=species)

# Centre each tick label under its group of bars instead of under the first bar.
plt.xticks(X_axis + width * (len(species_names) - 1) / 2, columns[:4])
plt.xlabel("Features")
plt.ylabel("Value in cm.")
plt.title("Average feature value per species")
# Anchor the legend outside the axes so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.3,1))
plt.show()

## Data Preparation
Preparing the feature and target variables for model training.

In [None]:
# Target vector and feature frame used for modelling.
y = df["target"]
X = df.drop(columns=["target"])

# Column groups by type: the numeric list names the four measurement columns,
# and the categorical list is left empty.
num_cols = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
cat_cols = []

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Numeric features: replace missing values with the column mean.
num_transformer = SimpleImputer(strategy='mean')

# Categorical features: fill missing values with a constant, then one-hot
# encode; categories not seen during fitting are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('oneHot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. The categorical branch is now
# registered as well, so columns added to `cat_cols` are actually processed;
# while `cat_cols` is empty that branch selects no columns.
preprocessor = ColumnTransformer(transformers=[
    ('numerical', num_transformer, num_cols),
    ('categorical', categorical_transformer, cat_cols),
])


## Model Training
scikit-learn is used to train a supervised classification model (a random forest).

In [None]:
# Random forest with 100 trees; the fixed seed makes the fitted model reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the validation labels. Refitting the encoder on the
# validation labels could assign different integers to the same class whenever
# the two splits do not contain the same set of classes. `transform` raises a
# ValueError if the validation split holds a label never seen in training.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_valid = le.transform(y_valid)

In [None]:
# Chain preprocessing and the classifier so both are fitted in one call.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit the whole pipeline on the training split.
pipeline.fit(X_train, y_train)

## Model Evaluation
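The trained model is evaluated on the validation set using mean absolute error. Note that MAE treats the integer class labels as ordered values, so it is only a rough proxy for classification quality.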

In [None]:
from sklearn.metrics import accuracy_score, mean_absolute_error

# Predict class labels for the held-out validation rows.
predictions = pipeline.predict(X_valid)

# Accuracy (fraction of correctly classified samples) is the natural headline
# metric for a classifier, because the class labels are categories.
accuracy = accuracy_score(y_valid, predictions)
print("Accuracy:", accuracy)

# MAE is still reported, but it treats the integer class codes as if they were
# ordered quantities, so read it only as a rough indicator. Arguments are passed
# in scikit-learn's (y_true, y_pred) order.
mae = mean_absolute_error(y_valid, predictions)
print("Mean_absolute_error:", mae)

## Conclusion
The results show that classical machine learning models perform well on structured datasets like Iris.