# Visualization Techniques

This notebook provides visualizations to gain insights from classification and regression results.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
import statsmodels.api as sm

# Apply seaborn's white-grid theme to every figure drawn below.
# `set_theme` is the current name for this call; `sns.set` is only an alias of it.
sns.set_theme(style='whitegrid')


## 1. Confusion Matrix for Classification Results

Let's use the Iris dataset as an example to visualize the confusion matrix.

In [2]:
# Fetch the Iris dataset and pull out the feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the samples for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a random forest on the training portion (fit() returns the estimator itself)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class for every held-out sample
y_pred = clf.predict(X_test)

# Tabulate true-vs-predicted counts and render them as a blue heat map
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=data.target_names)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

## 2. ROC Curve for Model Performance

We will compute one-vs-rest ROC curves, one per class, for the multi-class classifier.

In [3]:
# One-vs-rest ROC analysis: each class in turn is treated as the positive
# label and scored with the classifier's predicted probability for that class.
y_score = clf.predict_proba(X_test)
fpr = dict()
tpr = dict()
roc_auc = dict()

# Column i of predict_proba corresponds to clf.classes_[i], so the positive
# label is taken from the classifier rather than assumed to equal the column index.
for i, class_label in enumerate(clf.classes_):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_score[:, i], pos_label=class_label)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot every per-class curve on one explicit Axes
fig, ax = plt.subplots()
for i, class_label in enumerate(clf.classes_):
    ax.plot(
        fpr[i],
        tpr[i],
        lw=2,
        label='ROC curve (area = {:.2f}) for class {}'.format(roc_auc[i], data.target_names[class_label]),
    )
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom so curves at TPR = 1 stay visible
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

## 3. Feature Importance Plots

This plot ranks the features by how much they contribute to the model, as reported by the random forest's `feature_importances_`.

In [4]:
# Importances reported by the fitted forest, with indices ordered high to low
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

# Bar positions and the feature names rearranged into ranked order
n_features = X.shape[1]
positions = range(n_features)
ranked_names = np.array(data.feature_names)[indices]

# One bar per feature, tallest first, with vertical tick labels
fig, ax = plt.subplots()
ax.set_title('Feature Importances')
ax.bar(positions, importances[indices], color='b', align='center')
ax.set_xticks(positions)
ax.set_xticklabels(ranked_names, rotation=90)
ax.set_xlim([-1, n_features])
plt.show()

## 4. Residual Plots for Regression Analysis

We'll fit a simple linear regression model to synthetic data and plot its residuals against the predicted values.

In [5]:
# Generate synthetic regression data: y = 2.5 * x + Gaussian noise scaled by 2,
# with x drawn uniformly from [0, 10).
# A dedicated seeded generator makes the data, and everything derived from it,
# identical on every Restart & Run All without touching NumPy's global state.
rng = np.random.default_rng(42)
X_reg = rng.random((100, 1)) * 10
y_reg = 2.5 * X_reg.flatten() + rng.standard_normal(100) * 2

# Fit a linear regression to the synthetic data (fit() returns the estimator itself)
model = LinearRegression().fit(X_reg, y_reg)

# In-sample predictions and their errors (observed minus predicted)
y_pred_reg = model.predict(X_reg)
residuals = y_reg - y_pred_reg

# Scatter the residuals against the predictions; the dashed red line marks zero error
fig, ax = plt.subplots()
ax.scatter(y_pred_reg, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals vs Predicted Values')
plt.show()

## 5. Histograms or Box Plots for Feature Distributions

The histograms below, each overlaid with a kernel density estimate, show the distribution of every feature.

In [6]:
# Histograms: one panel per feature. The grid is sized from the feature count
# so the cell does not rely on the dataset having exactly four features.
n_features = X.shape[1]
n_cols = 2
n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

# 3 inches of height per row keeps the two-row case at the 12x6 figure size
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows), squeeze=False)
for i, ax in enumerate(axes.flat):
    if i >= n_features:
        ax.set_visible(False)  # hide the spare panel when the feature count is odd
        continue
    sns.histplot(X[:, i], bins=20, kde=True, ax=ax)
    ax.set_title(data.feature_names[i])
fig.tight_layout()
plt.show()