<p style="text-align:center">
    <a href="https://tukkalearn.vercel.app" target="_blank">
    <img src="https://raw.githubusercontent.com/itzDM/publicAssets/refs/heads/main/opengraph-image.png" width="250"  alt="Tukka Learn">
    </a>
</p>


In [None]:
import pandas as pd
import numpy as np

# Load the Iris dataset from the tukkaLearn datasets repository
IRIS_CSV_URL = 'https://raw.githubusercontent.com/tukkaLearn/datasets/refs/heads/main/Iris.csv'
df = pd.read_csv(IRIS_CSV_URL)

# df.shape is (rows, columns); both go into the load banner.
n_rows, n_cols = df.shape
print(f"Iris dataset loaded: {n_rows} flowers, {n_cols} features")

# Last expression of the cell: preview of the first rows.
df.head()

## BEGINNER EXERCISES


In [None]:
# Beginner exercises: basic inspection of the loaded Iris frame.
# Every conclusion printed here is derived from df rather than hard-coded,
# so the text cannot disagree with the data that was actually loaded.

print("1. First 5 rows:")
print(df.head())

n_samples, n_columns = df.shape
print(f"\n2. Shape: {df.shape} → {n_samples} samples, {n_columns} columns")

print(f"\n3. Column names: {list(df.columns)}")

print("\n4. Samples per Species:")
species_counts = df['Species'].value_counts()
print(species_counts)
# Balanced means every species has the same number of rows.
if species_counts.nunique() == 1:
    print(f"→ Perfectly balanced! {species_counts.iloc[0]} each → ideal for classification")
else:
    print("→ Classes are NOT perfectly balanced → consider stratified sampling")

print(f"\n5. Average sepal length: {df['SepalLengthCm'].mean():.2f} cm")

print(f"\n6. Petal length — Min: {df['PetalLengthCm'].min():.1f}, Max: {df['PetalLengthCm'].max():.1f}")

wide_sepal = df[df['SepalWidthCm'] > 3.5]
# Report which species the wide-sepal flowers actually belong to instead of
# asserting a single species; drop zero counts in case Species is categorical.
wide_by_species = wide_sepal['Species'].value_counts()
wide_by_species = wide_by_species[wide_by_species > 0]
if len(wide_by_species) == 1:
    wide_note = f"All are {wide_by_species.index[0]}!"
else:
    wide_note = ", ".join(
        f"{species}: {count}" for species, count in wide_by_species.items()
    ) or "none"
print(f"\n7. Flowers with SepalWidthCm > 3.5: {len(wide_sepal)} → {wide_note}")

## Beginner Insights


In [None]:
# Takeaways from the beginner exercises: a banner, a rule, then one bullet
# per line. The lines are kept in a list and emitted in order.
beginner_insight_lines = [
    "BEGINNER INSIGHTS:",
    "=" * 60,
    "• Perfect balance (50 per Species) → Collected deliberately for study",
    "• Avg sepal length ~5.8cm → Typical iris size",
    "• Setosa has wide sepals → Easy to distinguish visually",
    "• Petal width varies hugely → Best feature for classification",
    "• Setosa: small petals + wide sepals → Compact, primitive form?",
    "• Virginica: long everything → Largest, most evolved?",
]
for insight_line in beginner_insight_lines:
    print(insight_line)

## INTERMEDIATE EXERCISES


In [None]:
# Intermediate exercises: per-species aggregates, derived features,
# correlation and sorting on the Iris frame.

# Raw measurement columns. Listing them explicitly keeps the per-species
# averages in step 1 identical when this cell is re-run after step 3 has added
# derived columns to df, and leaves out any non-measurement numeric column.
measurement_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# 1. Mean by Species
print("Average measurements by Species:")
print(df.groupby('Species')[measurement_cols].mean().round(2))

# 2. Widest average petal width
widest = df.groupby('Species')['PetalWidthCm'].mean().idxmax()
print(f"\nSpecies with widest petals: {widest}")

# 3. Add derived features: sepal area, petal area and sepal length/width ratio
#    (these columns are added to df itself and are used by later cells)
df['sepal_area'] = df['SepalLengthCm'] * df['SepalWidthCm']
df['petal_area'] = df['PetalLengthCm'] * df['PetalWidthCm']
df['sepal_ratio'] = df['SepalLengthCm'] / df['SepalWidthCm']

# 4. Correlation (Series.corr defaults to Pearson)
corr = df['SepalLengthCm'].corr(df['PetalLengthCm'])
print(f"\nCorrelation SepalLengthCm vs PetalLengthCm: {corr:.3f} → Very strong!")

# 5. Sort by petal length descending
print("\nTop 5 longest petals:")
print(df.sort_values('PetalLengthCm', ascending=False).head())

# 6. Species of the single flower with the longest sepal
longest_species = df.loc[df['SepalLengthCm'].idxmax(), 'Species']
print(f"\nSpecies with longest sepal: {longest_species}")

## Intermediate Insights


In [None]:
# Intermediate insights. The sepal-ratio statement is derived from the data:
# the mean sepal length/width ratio is computed per species from the raw
# columns and the species with the LOWEST ratio (widest sepals relative to
# their length) is reported, rather than hard-coding a claim.
sepal_ratio_by_species = (
    (df['SepalLengthCm'] / df['SepalWidthCm']).groupby(df['Species']).mean()
)
lowest_ratio_species = sepal_ratio_by_species.idxmin()
lowest_ratio = sepal_ratio_by_species.min()

print("INTERMEDIATE INSIGHTS:")
print("="*60)
print("• Virginica has largest petal area → Showy flowers for pollination?")
print(f"• {lowest_ratio_species} has lowest sepal_ratio ({lowest_ratio:.2f}) → Short, wide sepals")
print("• Strong correlation → Petal & sepal grow together")
print("• Virginica dominates top of size rankings → Largest Species")

## ADVANCED EXERCISES


In [None]:


# Spread of sepal length inside each species
sepal_length_var = df.groupby('Species')['SepalLengthCm'].var()
print("\nVariance of SepalLengthCm by Species:")
print(sepal_length_var.round(3))

# Sepal-width outliers using the 1.5 * IQR fence rule
sepal_width = df['SepalWidthCm']
Q1, Q3 = (sepal_width.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
# Strict comparisons: values exactly on a fence are not counted as outliers.
outliers = df[sepal_width.lt(lower_fence) | sepal_width.gt(upper_fence)]
print(f"\nSepal width outliers: {len(outliers)}")

# Min-max scaling of the four measurements, written into a copy of df
numeric_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
df_normalized = df.copy()
col_min = df[numeric_cols].min()
col_range = df[numeric_cols].max() - col_min
df_normalized[numeric_cols] = (df[numeric_cols] - col_min) / col_range

# Five flowers with the largest petal area
top_petal_area = df[['Species', 'petal_area']].nlargest(5, 'petal_area')
print("\nTop 5 flowers by petal area:")
print(top_petal_area)

# One indicator column per Species value
df_encoded = pd.get_dummies(df, columns=['Species'])
encoded_shape = df_encoded.shape
print(f"\nOne-hot encoded shape: {encoded_shape}")

## Advanced Insights & Biological Interpretation


In [None]:
# Advanced insights and closing summary (text output only).
# The sepal-vs-petal length correlation quoted in item 6 is computed from df
# here so the printed figure always matches the data.
sepal_petal_corr = df['SepalLengthCm'].corr(df['PetalLengthCm'])

print("\n" + "="*80)
print("ADVANCED BIOLOGICAL & STATISTICAL INSIGHTS")
print("="*80)

print("1. Virginica has highest natural variation → More adaptable Species?")
print("2. Setosa has almost NO correlation in petal size → Fixed genetic form")
print("3. PETAL features are BEST for identification — especially PetalWidthCm")
print("   → Setosa: tiny petals | Versicolor: medium | Virginica: large")
print("4. Virginica = largest + showiest flowers → Attracts more pollinators?")
print("5. Setosa = primitive form? Small petals, wide sepals → Less evolved?")
print(f"6. Strong correlation ({sepal_petal_corr:.2f}) between sepal & petal length → Allometric growth")
print("7. Dataset perfectly balanced → Created by Fisher for statistical testing!")

print("\nCLASSIFICATION POWER RANKING:")
print("1. PetalWidthCm     → Almost perfect separator")
print("2. PetalLengthCm    → Excellent")
print("3. SepalLengthCm    → Good")
print("4. SepalWidthCm     → Weakest (Setosa extreme)")

print("\nFINAL CONCLUSION:")
print("Iris dataset is PERFECT because:")
print("• 3 Species with clear boundaries")
print("• Setosa completely separable")
print("• Versicolor & Virginica slightly overlap → realistic challenge")
print("• Perfect balance + clean data")
print("\n→ That's why it's the 'Hello World' of Machine Learning!")

## Final Summary Table


In [None]:
# Per-species mean of each raw measurement and of the derived area columns
# (sepal_area / petal_area must already exist on df), rounded to 2 decimals.
summary = df.groupby('Species').agg({
    'SepalLengthCm': 'mean',
    'SepalWidthCm': 'mean',
    'PetalLengthCm': 'mean',
    'PetalWidthCm': 'mean',
    'sepal_area': 'mean',
    'petal_area': 'mean'
}).round(2)

# Size rank derived from the data instead of a hard-coded list that silently
# relies on the alphabetical group order: species are ranked by combined mean
# sepal + petal area, 1 = largest. method='first' keeps the ranks integral
# even if two species tie.
combined_area = summary['sepal_area'] + summary['petal_area']
summary['Size Rank'] = combined_area.rank(ascending=False, method='first').astype(int)
summary

<hr>
<div style="text-align:center">
  <h3 style="color:orange">|| राम नाम सत्य है ||</h3>
  <h4>Author : सीता राम जी </h4>
   <h5 style="color:skyblue"><i>© All Rights Reserved</i></h5>
</div>
