In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

## --- 1. Data Loading and Inspection ---
print("## 1. Data Loading and Inspection")

# Fetch the Iris dataset bundled with scikit-learn
iris = load_iris()

# Build the inspection table in one expression: the measurements become the
# feature columns, and the integer targets are mapped to readable species names.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species_name=iris.target_names[iris.target]
)

# Quick look at the data and at how many samples each class has
print("First 5 rows of the dataset:")
print(df.head())
print("\nTarget Class Distribution:")
print(df['species_name'].value_counts())

## --- 2. Data Preprocessing (Handling missing values & Encoding) ---
print("\n## 2. Data Preprocessing")

# a) Missing-value check. This branch only *reports*: nothing is imputed or
#    dropped here, and the feature matrix below comes from iris.data, not df.
if df.isnull().sum().sum() > 0:
    print("WARNING: Missing values found. Handling them (e.g., imputation or removal)...")
    # In a real-world scenario, you might impute the numeric columns, e.g.:
    #   df.fillna(df.median(numeric_only=True))
else:
    print("No missing values found in the Iris dataset.")

# b) Encode Labels (Target variable: species_name)
# Machine learning models require numerical labels.
# Fit the encoder on the species *names* so that le.classes_ holds the names and
# le.inverse_transform() turns an encoded label back into a readable species.
# Fitting on iris.target (already 0/1/2) would make the encoder an identity map
# and inverse_transform() would just hand back integers.
# LabelEncoder assigns codes in sorted class order; the Iris species names are
# already alphabetical, so the resulting codes coincide with iris.target.
le = LabelEncoder()
y_encoded = le.fit_transform(df['species_name'])
X = iris.data  # Features

# c) Split Data into Training and Testing sets
# 70/30 split; stratify keeps the class proportions equal in both subsets and
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

## --- 3. Model Training (Decision Tree Classifier) ---
print("\n## 3. Model Training")

# Build the classifier with a fixed seed and fit it on the training split in a
# single step; fit() returns the estimator itself, so the name is bound to the
# trained model.
print("Training Decision Tree Classifier...")
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print("Training complete.")

## --- 4. Model Evaluation ---
print("\n## 4. Model Evaluation")

# Predict the species on the held-out test set
y_pred = dt_classifier.predict(X_test)

# Calculate Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
# 'macro' = unweighted mean of the per-class scores, so every class counts
# equally (appropriate here because the Iris classes are balanced).
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')

# Display Results
print("Decision Tree Classifier Results:")
print(f"  Accuracy: *{accuracy:.4f}* (Overall correct predictions)")
print(f"  Precision (Macro): *{precision_macro:.4f}* (Ability to not label as positive a sample that is negative)")
print(f"  Recall (Macro): *{recall_macro:.4f}* (Ability to find all the positive samples)")

# Per-class breakdown. classification_report is imported together with the
# other sklearn.metrics helpers in the import block at the top of the cell.
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

## --- 5. Example Prediction (Optional but useful for deliverable) ---
# Use the first test-set row as the demo sample; slicing (rather than plain
# indexing) keeps the 2-D (1, n_features) shape that predict() expects.
sample_index = 0
sample_features = X_test[sample_index:sample_index + 1]

# Predict the encoded class for that single row
sample_prediction_encoded = dt_classifier.predict(sample_features)[0]

# Map both the true and the predicted encoded label back through the fitted
# encoder `le` in a single call.
sample_true_label, sample_prediction_name = le.inverse_transform(
    [y_test[sample_index], sample_prediction_encoded]
)

print("\n--- Example Prediction ---")
print(f"Features: {sample_features[0]}")
print(f"True Species: *{sample_true_label}*")
print(f"Predicted Species: *{sample_prediction_name}*")

## 1. Data Loading and Inspection
First 5 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species_name  
0       setosa  
1       setosa  
2       setosa  
3       setosa  
4       setosa  

Target Class Distribution:
species_name
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

## 2. Data Preprocessing
No missing values found in the Iris dataset.
X_train shape: (105, 4), X_test shape: (45, 4)

## 3. Model Training
Training Decision Tree Classifier...
Training complete.

## 4. Model Evaluation
Decision Tree Classifi

In [None]:
# Necessary imports for data preparation and model
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# --- Data Preparation (to define X_train, y_train) ---
# Load the built-in Iris dataset
iris = load_iris()

# Encode Labels (Target variable: species_name)
# Fit the encoder on the species *names* (looked up from the integer targets) so
# that le.classes_ / le.inverse_transform() yield readable names. Fitting on
# iris.target, which is already 0/1/2, would make the encoder an identity map.
# LabelEncoder assigns codes in sorted class order; the Iris species names are
# already alphabetical, so the resulting codes coincide with iris.target.
le = LabelEncoder()
y_encoded = le.fit_transform(iris.target_names[iris.target])
X = iris.data  # Features

# Split Data into Training and Testing sets
# 70/30 split; stratify keeps class proportions equal in both subsets and
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# --- Model Training ---
# Initialize the Decision Tree Classifier (fixed seed for reproducible trees)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the model using the training data
dt_classifier.fit(X_train, y_train)