# Decision Tree Implementation and Visualization
## CODTECH Internship - Task 1

**Objective:** Build and visualize a Decision Tree model using scikit-learn to classify outcomes.

**Dataset:** Iris Dataset (Classification)

---

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix,
    roc_curve,
    auc
)
from sklearn.preprocessing import label_binarize

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Tree export
from sklearn.tree import export_text, export_graphviz

# Plot appearance: seaborn white-grid theme and a 12x8 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Silence every warning category for the remainder of the session
import warnings
warnings.filterwarnings(action='ignore')

print("✓ All libraries imported successfully!")

## 2. Load and Explore the Dataset

In [None]:
# Fetch the bundled Iris data and assemble features + labels in one DataFrame
iris = load_iris()
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target                           # integer class code
df['species'] = df['target'].map(species_by_code)    # readable class label

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

In [None]:
# Structural overview (df.info) followed by descriptive statistics (df.describe)
section_rule = "=" * 50

print("Dataset Information:")
print(section_rule)
df.info()

print("\nStatistical Summary:")
print(section_rule)
df.describe()

In [None]:
# Count the samples per species once and reuse the result for text and plot
species_counts = df['species'].value_counts()

print("Class Distribution:")
print(species_counts)

# Bar chart of the per-species counts
fig, ax = plt.subplots(figsize=(8, 6))
species_counts.plot(kind='bar', ax=ax, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
ax.set_title('Distribution of Iris Species', fontsize=16, fontweight='bold')
ax.set_xlabel('Species', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## 3. Data Visualization

In [None]:
# Pairwise scatter plots of the four measurements, coloured by species.
# `vars` restricts the grid to the feature columns so the integer `target`
# label column is not drawn as if it were a fifth feature.
# No plt.figure() call precedes this: pairplot builds its own figure, so a
# pre-created one would only be rendered as an empty canvas.
sns.pairplot(
    df,
    vars=iris.feature_names,
    hue='species',
    palette='Set2',
    diag_kind='kde',
    height=2.5,
)
# The pairplot figure is the current figure at this point, so the title lands on it
plt.suptitle('Pairplot of Iris Features by Species', y=1.02, fontsize=16, fontweight='bold')
plt.show()

In [None]:
# Pairwise correlation between the feature columns, drawn as an annotated heatmap
correlation_matrix = df[iris.feature_names].corr()

fig, ax = plt.subplots(figsize=(10, 8))
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
plt.show()

## 4. Prepare Data for Modeling

In [None]:
# Feature matrix and integer-coded target vector
X = df[iris.feature_names]
y = df['target']

# Hold out 20% of the rows for testing; stratify on y and fix the seed
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
print(f"\nFeatures: {X.columns.tolist()}")
print(f"Target classes: {iris.target_names}")

## 5. Build Decision Tree Model

In [None]:
# Hyperparameters for the classifier, collected in one place
tree_params = {
    'criterion': 'gini',       # split criterion: 'gini' or 'entropy'
    'max_depth': 4,            # upper bound on tree depth
    'min_samples_split': 2,    # minimum samples needed to split a node
    'min_samples_leaf': 1,     # minimum samples required in a leaf
    'random_state': 42,
}
dt_classifier = DecisionTreeClassifier(**tree_params)

# Fit on the training partition only
dt_classifier.fit(X_train, y_train)

print("✓ Decision Tree model trained successfully!")
fitted_depth = dt_classifier.get_depth()
leaf_count = dt_classifier.get_n_leaves()
print(f"\nTree Depth: {fitted_depth}")
print(f"Number of Leaves: {leaf_count}")
print(f"Number of Features: {dt_classifier.n_features_in_}")

## 6. Model Predictions and Evaluation

In [None]:
# Predict on both partitions so training and held-out accuracy can be compared
y_train_pred = dt_classifier.predict(X_train)
y_test_pred = dt_classifier.predict(X_test)

# Accuracy on each partition
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Signed gap: only training accuracy exceeding test accuracy points towards
# overfitting. An absolute difference would also flag a model that happens to
# score better on the test set than on the training set.
accuracy_gap = train_accuracy - test_accuracy

print("Model Performance:")
print("=" * 50)
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy:  {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"\nOverfitting Check (train - test): {accuracy_gap:.4f}")
if accuracy_gap < 0.05:
    print("✓ Model is well-generalized (low overfitting)")
else:
    print("⚠ Model may be overfitting")

In [None]:
# 5-fold cross-validated accuracy over the full feature matrix and target
cv_scores = cross_val_score(estimator=dt_classifier, X=X, y=y, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2   # reported as +/- two standard deviations

print("\nCross-Validation Results (5-Fold):")
print("=" * 50)
print(f"Individual Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")

In [None]:
# Classification report for the test-set predictions, labelled with species names
report_text = classification_report(
    y_test,
    y_test_pred,
    target_names=iris.target_names,
)

print("\nClassification Report:")
print("=" * 50)
print(report_text)

## 7. Confusion Matrix Visualization

In [None]:
# Confusion matrix for the test-set predictions (true labels first, as passed below)
cm = confusion_matrix(y_test, y_test_pred)

# Annotated heatmap with species names on both axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_title('Confusion Matrix - Decision Tree Classifier', fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()
plt.show()

print("Confusion Matrix:")
print(cm)

## 8. Decision Tree Visualization

In [None]:
# Draw the fitted tree with nodes filled by class.
# The names are passed as plain lists, matching how the export_text call in
# this notebook converts feature names: iris.target_names is a NumPy array,
# and some scikit-learn releases only accept a list for these arguments.
plt.figure(figsize=(20, 12))
plot_tree(dt_classifier,
          feature_names=list(iris.feature_names),
          class_names=list(iris.target_names),
          filled=True,
          rounded=True,
          fontsize=10)
plt.title('Decision Tree Visualization', fontsize=20, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

## 9. Feature Importance Analysis

In [None]:
# Pair each feature name with the fitted tree's importance score, highest first
importance_table = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': dt_classifier.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("Feature Importance Ranking:")
print("=" * 50)
print(feature_importance.to_string(index=False))

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is on top
fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(feature_importance)))
ax.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title('Feature Importance in Decision Tree', fontsize=16, fontweight='bold')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 10. Text Representation of Decision Tree

In [None]:
# Render the fitted tree's decision rules as indented plain text
rule_feature_names = list(iris.feature_names)
tree_rules = export_text(dt_classifier, feature_names=rule_feature_names)

print("Decision Tree Rules:")
print("=" * 70)
print(tree_rules)

## 11. Model Comparison: Different Max Depths

In [None]:
# Fit one tree per candidate depth (1..10) with a fixed seed, then score each
# on both the training and the test partition
depths = range(1, 11)
fitted_trees = [
    DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    for depth in depths
]
train_scores = [tree.score(X_train, y_train) for tree in fitted_trees]
test_scores = [tree.score(X_test, y_test) for tree in fitted_trees]

# Accuracy curves for both partitions as a function of depth
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(depths, train_scores, marker='o', label='Training Accuracy', linewidth=2)
ax.plot(depths, test_scores, marker='s', label='Testing Accuracy', linewidth=2)
ax.set_xlabel('Tree Depth', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Model Performance vs Tree Depth', fontsize=16, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# First depth that reaches the highest test accuracy (argmax returns the first maximum)
best_position = int(np.argmax(test_scores))
optimal_depth = depths[best_position]
print(f"\nOptimal Tree Depth: {optimal_depth}")
print(f"Best Test Accuracy: {test_scores[best_position]:.4f}")

## 12. Prediction Examples

In [None]:
# Show predictions for the first five rows of the test set
sample_data = X_test.head(5)
predictions = dt_classifier.predict(sample_data)
probabilities = dt_classifier.predict_proba(sample_data)

print("Sample Predictions:")
print("=" * 80)
sample_rows = zip(sample_data.iterrows(), predictions, probabilities)
for sample_no, ((_, features), predicted_code, class_probs) in enumerate(sample_rows, start=1):
    # y_test has the same row order as X_test, so the position selects the matching label
    actual_code = y_test.iloc[sample_no - 1]
    print(f"\nSample {sample_no}:")
    print(f"Features: {features.to_dict()}")
    print(f"Predicted Class: {iris.target_names[predicted_code]}")
    print(f"Actual Class: {iris.target_names[actual_code]}")
    print("Prediction Probabilities:")
    for species, probability in zip(iris.target_names, class_probs):
        print(f"  - {species}: {probability:.4f}")
    print("-" * 80)

## 13. Summary and Conclusions

In [None]:
# Final report: dataset facts, model configuration, metrics and a conclusion.
# The generalization statement is derived from the measured train/test gap
# (threshold 0.05) rather than asserted unconditionally, so the printed
# conclusion cannot contradict the numbers above it.
generalization_gap = train_accuracy - test_accuracy

print("=" * 70)
print("DECISION TREE MODEL SUMMARY")
print("=" * 70)
print(f"\nDataset: Iris Dataset")
print(f"Total Samples: {len(df)}")
print(f"Number of Features: {len(iris.feature_names)}")
print(f"Number of Classes: {len(iris.target_names)}")
print(f"\nModel Configuration:")
print(f"  - Algorithm: Decision Tree Classifier")
print(f"  - Criterion: {dt_classifier.criterion}")
print(f"  - Max Depth: {dt_classifier.max_depth}")
print(f"  - Tree Depth Achieved: {dt_classifier.get_depth()}")
print(f"  - Number of Leaves: {dt_classifier.get_n_leaves()}")
print(f"\nPerformance Metrics:")
print(f"  - Training Accuracy: {train_accuracy:.4f}")
print(f"  - Testing Accuracy: {test_accuracy:.4f}")
print(f"  - Cross-Validation Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"\nMost Important Feature: {feature_importance.iloc[0]['Feature']}")
print(f"\nConclusion:")
print(f"  The Decision Tree model successfully classifies Iris species with")
if generalization_gap < 0.05:
    print(f"  {test_accuracy*100:.2f}% accuracy on the test set. The model shows good")
    print(f"  generalization with minimal overfitting.")
else:
    print(f"  {test_accuracy*100:.2f}% accuracy on the test set. Training accuracy is")
    print(f"  {generalization_gap*100:.2f} points higher, so the model may be overfitting.")
print("=" * 70)

## Key Insights:

1. **Model Performance**: The Decision Tree achieved high accuracy in classifying Iris species
2. **Feature Importance**: Petal measurements (length and width) are the most important features
3. **Generalization**: The model generalizes well with similar training and testing accuracy
4. **Tree Structure**: The tree is interpretable with clear decision rules at each node
5. **Optimal Depth**: Comparing training and testing accuracy across tree depths helps choose a depth that limits overfitting

## Advantages of Decision Trees:
- Easy to interpret and visualize
- Requires little data preprocessing
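- Can handle both numerical and categorical data in principle (scikit-learn's implementation requires categorical features to be encoded as numbers first)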
- Non-parametric (no assumptions about data distribution)

## Limitations:
- Prone to overfitting with deep trees
- Can be unstable (small data changes affect structure)
- May create biased trees if classes are imbalanced

---
**Task Completed Successfully! ✓**