# Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris

# 1. Data Collection and Preprocessing

In [None]:
# Load the Iris dataset bundled with scikit-learn. The returned object is
# used by the later cells for the features, targets, and their names.
data = load_iris()

In [None]:
# List the fields available on the loaded dataset object. Use the public
# keys() accessor instead of calling the __dir__ dunder method directly.
print(data.keys())

In [None]:
# Print the first 10 rows of the feature matrix
print(data.data[:10])
# Print the feature (column) names
print(data.feature_names)

## Exercise: Can you print out the first 10 rows of the target? 

In [None]:
# Print first 10 targets (FILL IN next line)

# Print the class/species names (FILL IN next line)


## Data Integration

In [None]:
# Create a pandas DataFrame to store the features, with one column per
# feature name, then append the target labels as a 'species' column
df = pd.DataFrame(data.data, columns=data.feature_names)
df['species'] = data.target

# 2. Exploratory Data Analysis (EDA)

In [None]:
# Display the first few rows using the head function
display(df.head())

In [None]:
# Display the last few rows using the tail function
display(df.tail())

In [None]:
# Display the first 10 rows using slice notation
display(df[:10])

## Exercise. Can you display rows 70 - 80? 

In [None]:
# FILL IN here to display rows 70 - 80


In [None]:
# Display summary statistics for every column of df
display(df.describe())

In [None]:
# Show only the 'min' row of the summary statistics (the double brackets
# keep the result a DataFrame rather than a Series)
df.describe().loc[['min']]

In [None]:
# Show the minimum value of the features only: iloc[:, :-1] drops the last
# column, which is 'species' at this point in the notebook
df.iloc[:, :-1].describe().loc[['min']]

## Exercise
The five point summary of each feature comprises the minimum, lower quartile, median, upper quartile and maximum. Can you modify the code in the previous two cells to print the 5-point summary of only the features?

In [None]:
# FILL IN Print the five point summary of only the features


# 3. EDA with Visualization

## 3.1 Univariate Histograms

In [None]:
# Draw one histogram per feature on a 2x2 grid of subplots.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()  # flatten the grid so each axis pairs with one feature
feature_names = df.columns[:4]
for ax, feature in zip(axes, feature_names):
    ax.hist(df[feature], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f"Histogram of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

## 3.2 Scatter Plots (Bi-Variate)

In [None]:
# Plain scatter plot of sepal width against sepal length (no colouring)
plt.figure(figsize=(8,6))
plt.scatter(df['sepal length (cm)'], df['sepal width (cm)'])
plt.show()

In [None]:
# Same scatter plot, now coloured by the species label and with axis labels.
x_col = 'sepal length (cm)'
y_col = 'sepal width (cm)'
plt.figure(figsize=(8,6))
plt.scatter(df[x_col], df[y_col], c=df["species"], cmap="viridis")
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()

In [None]:
# Scatter plot coloured by species, with a hand-built legend.
plt.figure(figsize=(8,6))
sc = plt.scatter(df["sepal length (cm)"], df["sepal width (cm)"],
                 c=df["species"], cmap="viridis")
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')

# Build one proxy marker per class by sampling the scatter's colormap at
# three evenly spaced positions (0, 0.5 and 1).
cmap = sc.get_cmap()
legend_labels = [
    plt.Line2D([0], [0], marker='o', color='w', markersize=10,
               markerfacecolor=cmap(i/2))
    for i in range(3)
]
plt.legend(legend_labels, data.target_names, title="Species")

plt.show()

In [None]:
# Seaborn version: hue= colours the points by the numeric species code and
# seaborn builds the legend entries itself.
plt.figure(figsize=(8,6))
ax = sns.scatterplot(data=df, x="sepal length (cm)", y="sepal width (cm)",
                     hue="species", palette="viridis")
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Sepal Width (cm)')
ax.legend(title="Species")
plt.show()

In [None]:
# Seaborn scatter plot with the numeric species codes replaced by their
# names, so the legend shows readable labels.
species_names = {0: "setosa", 1: "versicolor", 2: "virginica"}
species_labels = df["species"].map(species_names)
plt.figure(figsize=(8,6))
ax = sns.scatterplot(data=df, x="sepal length (cm)", y="sepal width (cm)",
                     hue=species_labels, palette="viridis")
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Sepal Width (cm)')
ax.legend(title="Species")
plt.show()

## Exercise: Create a scatter plot of petal width versus petal length

In [None]:
### FILL IN the code in this cell

## 3.2.1 Box and Violin Plots

In [None]:
# Matplotlib boxplots, one box per feature.
plt.figure(figsize=(12, 6))

# Select the feature columns by name so the plot is unaffected by any label
# columns in df (e.g. 'species', or 'cluster' if this cell is re-run later).
feature_cols = list(data.feature_names)
plt.boxplot([df[col] for col in feature_cols])
# Label the boxes through the tick API (boxes are drawn at x = 1..n). This
# avoids boxplot's `labels` keyword, which newer Matplotlib releases
# deprecate in favour of `tick_labels`, and also works on older releases.
plt.xticks(range(1, len(feature_cols) + 1), feature_cols)

# Labels and title
plt.xlabel("Features")
plt.ylabel("Value")
plt.title("Boxplots of Iris Features (Matplotlib)")
plt.grid()

# Show the plot
plt.show()

In [None]:
# Seaborn boxplots: reshape the feature columns into long form
# (one row per feature/value pair) before plotting.
df_melted = df.iloc[:, :-1].melt(var_name="Feature", value_name="Value")
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_melted, x="Feature", y="Value")
ax.set_xlabel("Features")
ax.set_ylabel("Value")
ax.set_title("Boxplots of Iris Features (Seaborn)")
ax.grid()
plt.show()

In [None]:
# Seaborn boxplots split by species: 'species' is kept as an identifier
# column while every other column is melted into long form.
df_melted = df.melt(id_vars="species", var_name="Feature", value_name="Value")
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_melted, x="Feature", y="Value", hue="species", palette="viridis")
ax.set_xlabel("Features")
ax.set_ylabel("Value")
ax.set_title("Boxplots of Iris Features (Seaborn)")
ax.legend(title="Species")
plt.show()

In [None]:
# Violin plots of the feature columns (all columns except the last one).
plt.figure(figsize=(12, 6))
df_melted = df.iloc[:, :-1].melt(var_name="Feature", value_name="Value")
sns.violinplot(data=df_melted, x="Feature", y="Value")
plt.xlabel("Features")
plt.ylabel("Value")
# The title now names the plot type actually drawn (it previously said "Boxplots").
plt.title("Violin Plots of Iris Features (Seaborn)")
plt.show()

## 3.3 Correlation Maps and Pair-Plots (Multi-Variate)

In [None]:
# Pairplot to visualize feature relationships: points are coloured by
# 'species' and the diagonal panels show kernel density estimates
sns.pairplot(df, hue='species', diag_kind='kde')
plt.show()

In [None]:
# Correlation heatmap. df.corr() covers every column of df, so the numeric
# 'species' label is included alongside the features; each cell is
# annotated with its value to two decimals.
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), annot=True, fmt='.2f')
plt.title("Feature Correlation Heatmap")
plt.show()

## Exercise. Can you restrict the correlation map to show only species with a 'coolwarm' colormap? 

In [None]:
# FILL IN the code in this cell


# 4. Unsupervised Learning with k-means

In [None]:
# Normalize features with standard scaler (the first four columns of df)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.iloc[:, :4])
# Apply K-Means with 2 clusters (the exercise below asks for 3)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to dataframe
df['cluster'] = kmeans_labels

# Visualize clusters: column 0 on x against column 2 on y, coloured by cluster
plt.figure(figsize=(8,6))
sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 2], hue=df['cluster'], palette='viridis')
plt.xlabel(data.feature_names[0])
plt.ylabel(data.feature_names[2])
plt.title("K-Means Clustering on Iris Dataset")
plt.show()

## Exercise: Can you run the k-means algorithm to generate 3 clusters?

In [None]:
# Standardize the first four columns of df (the features) again
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.iloc[:, :4])
### MODIFY the next line correctly
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
###
kmeans_labels = kmeans.fit_predict(X_scaled)

# Overwrite the 'cluster' column with the new assignments
df['cluster'] = kmeans_labels

# Visualize clusters: column 0 on x against column 2 on y, coloured by cluster
plt.figure(figsize=(8,6))
sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 2], hue=df['cluster'], palette='viridis')
plt.xlabel(data.feature_names[0])
plt.ylabel(data.feature_names[2])
plt.title("K-Means Clustering on Iris Dataset")
plt.show()

In [None]:
# Show three 10-row slices of df (rows 0-9, 50-59 and 100-109) so the
# 'species' and 'cluster' columns can be compared side by side
display(df[:10])
display(df[50:60])
display(df[100:110])

In [None]:
# Compare the K-Means cluster assignments (left) with the true species
# labels (right) on the sepal measurements.
# Set up a 1-row, 2-column subplot
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Take the cluster count from the data so the title always matches the
# number of clusters actually stored in df['cluster'].
n_found = df["cluster"].nunique()

# Left plot: K-Means Clusters
sns.scatterplot(ax=axes[0], data=df, x="sepal length (cm)", y="sepal width (cm)",
                hue=df["cluster"], palette="viridis", edgecolor='k')
axes[0].set_title(f"K-Means Clustering ({n_found} Clusters)")
axes[0].set_xlabel("Sepal Length (cm)")
axes[0].set_ylabel("Sepal Width (cm)")
axes[0].legend(title="Cluster")

# Right plot: True Species Labels
sns.scatterplot(ax=axes[1], data=df, x="sepal length (cm)", y="sepal width (cm)",
                hue=df["species"], palette="viridis", edgecolor='k')
axes[1].set_title("True Labels (Species)")
axes[1].set_xlabel("Sepal Length (cm)")
axes[1].set_ylabel("Sepal Width (cm)")
axes[1].legend(title="Species")

# Show the plots
plt.tight_layout()
plt.show()

## Exercise: Replace sepal width/length with petal width/length

In [None]:
### FILL IN the code in this cell

In [None]:
centroids = kmeans.cluster_centers_  # Get cluster centroids (standardized units)

# K-Means was fitted on the standardized features, so the centroids must be
# mapped back to centimetres before they are drawn on the raw-data axes.
centroids_cm = scaler.inverse_transform(centroids)
# The scaler was fitted on the first four columns of df, so a column's
# position in df is also its position in the centroid array.
petal_len_idx = df.columns.get_loc("petal length (cm)")
petal_wid_idx = df.columns.get_loc("petal width (cm)")

# Set up a 1-row, 2-column subplot
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left plot: K-Means Clusters
sns.scatterplot(ax=axes[0], data=df, x="petal length (cm)", y="petal width (cm)",
                hue=df["cluster"], palette="viridis", edgecolor='k')
axes[0].scatter(centroids_cm[:, petal_len_idx], centroids_cm[:, petal_wid_idx],
                marker='X', s=200, c='red', edgecolor='k', label="Centroids")
# Report the real number of clusters instead of a hard-coded 3.
axes[0].set_title(f"K-Means Clustering ({len(centroids)} Clusters)")
axes[0].set_xlabel("petal Length (cm)")
axes[0].set_ylabel("petal Width (cm)")
axes[0].legend(title="Cluster")

# Right plot: True Species Labels
sns.scatterplot(ax=axes[1], data=df, x="petal length (cm)", y="petal width (cm)",
                hue=df["species"], palette="viridis", edgecolor='k')
axes[1].set_title("True Labels (Species)")
axes[1].set_xlabel("petal Length (cm)")
axes[1].set_ylabel("petal Width (cm)")
axes[1].legend(title="Species")

plt.tight_layout()
plt.show()

In [None]:

# Same comparison as before, but each centroid is drawn in the colour of
# its own cluster.

# K-Means was fitted on the standardized features: convert the centroids
# back to centimetres, then pick the petal columns by their position in df
# (the scaler was fitted on the first four columns of df).
centers_cm = scaler.inverse_transform(kmeans.cluster_centers_)
petal_len_idx = df.columns.get_loc("petal length (cm)")
petal_wid_idx = df.columns.get_loc("petal width (cm)")

# One colour per cluster (sized from the fitted model) and one per species.
cluster_colors = sns.color_palette("viridis", n_colors=len(centers_cm))
species_colors = sns.color_palette("viridis", n_colors=3)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.scatterplot(ax=axes[0], data=df, x="petal length (cm)", y="petal width (cm)",
                hue=df["cluster"], palette=cluster_colors, edgecolor='k')
# Centroid i belongs to cluster label i, which seaborn draws in palette colour i.
for centroid, colour in zip(centers_cm, cluster_colors):
    axes[0].scatter(centroid[petal_len_idx], centroid[petal_wid_idx],
                    marker='X', s=200, c=[colour], edgecolor='k', label=None)

axes[0].set_title(f"K-Means Clustering ({len(centers_cm)} Clusters)")
axes[0].set_xlabel("petal Length (cm)")
axes[0].set_ylabel("petal Width (cm)")
axes[0].legend(title="Cluster")

sns.scatterplot(ax=axes[1], data=df, x="petal length (cm)", y="petal width (cm)",
                hue=df["species"].map({0: "setosa", 1: "versicolor", 2: "virginica"}),
                palette=species_colors, edgecolor='k')

axes[1].set_title("True Labels (Species)")
axes[1].set_xlabel("petal Length (cm)")
axes[1].set_ylabel("petal Width (cm)")
axes[1].legend(title="Species")

plt.tight_layout()
plt.show()

## Take-home: 
1. For visualization, reduce the feature set with PCA and visualize with "new" features.
2. Run k-means on the reduced feature set.

# 5. Supervised Learning

In [None]:
## Supervised Learning: Decision Tree Classifier
# Separate the features from the target, then hold out 20% of the samples
# for testing (random_state is fixed so the split is repeatable).
X = df.iloc[:, :4]  # Features
y = df['species']  # Target
print('X shape:', X.shape)
print('y shape:', y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each split.
n_train, n_train_features = X_train.shape
n_test, n_test_features = X_test.shape
print(f"The training set contains {n_train} samples each comprising {n_train_features} features.")
print(f"There are {len(y_train)} labels associated with the train set.")
print(f"The test set contains {n_test} samples comprising {n_test_features} features.")
print(f"There are {len(y_test)} labels associated with the test set.")

In [None]:
# Train Decision Tree Classifier on the training split
# (random_state is fixed so repeated runs build the same tree)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Predictions for the held-out test samples
y_pred = clf.predict(X_test)

In [None]:
# Model evaluation: overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.2f}")

# Confusion matrix drawn as an annotated heatmap, with the species names
# on both axes
class_names = data.target_names
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d',
                 xticklabels=class_names, yticklabels=class_names)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# Classification report, labelled with the species names
report = classification_report(y_test, y_pred, target_names=class_names)
print("Classification Report:\n", report)

In [None]:
# Draw the fitted decision tree, with nodes filled by colour.
plt.figure(figsize=(20, 15))
# data.target_names is a NumPy array; pass a plain list so the call is also
# accepted by scikit-learn releases that require a list for class_names.
plot_tree(clf, feature_names=data.feature_names, class_names=list(data.target_names), filled=True)
plt.show()