# SciTeX AI - Machine Learning and AI Workflows

This notebook demonstrates the AI and machine learning capabilities of SciTeX's AI module.

The `scitex.ai` module provides:
- Classification workflows with comprehensive reporting
- Feature extraction and selection
- Model evaluation and validation
- Hyperparameter optimization
- Neural network utilities
- Generative AI integration

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scitex as stx
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Setup: fix NumPy's global RNG seed, then report which SciTeX build is in use
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
print(f"SciTeX version: {stx.__version__}")

## 1. Classification with SciTeX AI

Comprehensive classification workflow with automatic reporting.

In [None]:
# Synthetic 3-class problem: 1000 samples, 20 features
# (15 informative, 3 redundant), one cluster per class.
dataset_params = dict(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=3,
    n_classes=3,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=42,
)
X, y = make_classification(**dataset_params)

# Display labels: features are numbered from 1, classes are lettered
n_features_total = X.shape[1]
feature_names = [f'feature_{k}' for k in range(1, n_features_total + 1)]
class_names = ['Class_A', 'Class_B', 'Class_C']

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Quick overview of what was generated
summary_lines = [
    f"Dataset shape: {X.shape}",
    f"Number of classes: {len(np.unique(y))}",
    f"Class distribution: {np.bincount(y)}",
    f"Training set: {X_train.shape[0]} samples",
    f"Test set: {X_test.shape[0]} samples",
]
print("\n".join(summary_lines))

In [None]:
# Reporter reused for every evaluation in this notebook
reporter = stx.ai.ClassificationReporter(
    feature_names=feature_names,
    class_names=class_names
)

# Candidate classifiers, keyed by display name
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Standardize features; statistics are learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

results = {}

print("Training and Evaluating Models:")
print("=" * 40)

# Models that are trained and evaluated on the standardized matrices
needs_scaling = ('SVM', 'Logistic Regression')

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Pick the matching train/test matrices once, then run a single fit/predict path
    if name in needs_scaling:
        fit_X, eval_X = X_train_scaled, X_test_scaled
    else:
        fit_X, eval_X = X_train, X_test

    model.fit(fit_X, y_train)
    y_pred = model.predict(eval_X)
    y_proba = model.predict_proba(eval_X)

    # Full evaluation report for this model on the held-out split
    report = reporter.evaluate(y_test, y_pred, y_proba)
    results[name] = dict(
        model=model,
        predictions=y_pred,
        probabilities=y_proba,
        report=report,
    )

    print(f"  Accuracy: {report['accuracy']:.3f}")
    print(f"  F1-score (macro): {report['f1_macro']:.3f}")
    print(f"  AUC (macro): {report['auc_macro']:.3f}")

In [None]:
# Visualize classification results: one column per model,
# confusion matrices on the top row and per-class ROC curves on the bottom row.
fig, axes = stx.plt.subplots(2, 3, figsize=(15, 10))

# Top row: annotated confusion matrices
tick_positions = range(len(class_names))
for col_idx, (name, result) in enumerate(results.items()):
    cm_ax = axes[0, col_idx]
    cm = result['report']['confusion_matrix']

    im = cm_ax.imshow(cm, interpolation='nearest', cmap='Blues')
    cm_ax.set_xyt('Predicted Label', 'True Label', f'{name}\nConfusion Matrix')

    # Write each count into its cell; switch to white text on dark cells
    half_max = cm.max() / 2.
    for r, c in np.ndindex(*cm.shape):
        count = cm[r, c]
        text_color = "white" if count > half_max else "black"
        cm_ax.text(c, r, format(count, 'd'),
                   ha="center", va="center",
                   color=text_color)

    cm_ax.set_xticks(tick_positions)
    cm_ax.set_yticks(tick_positions)
    cm_ax.set_xticklabels(class_names)
    cm_ax.set_yticklabels(class_names)

# Bottom row: one ROC curve per class, plus the chance diagonal
colors = ['blue', 'red', 'green']
for col_idx, (name, result) in enumerate(results.items()):
    roc_ax = axes[1, col_idx]
    roc_curves = result['report']['roc_curves']

    for class_idx, (class_name, color) in enumerate(zip(class_names, colors)):
        curve = roc_curves[class_idx]
        auc_score = curve['auc']
        roc_ax.plot(curve['fpr'], curve['tpr'], color=color,
                    label=f'{class_name} (AUC = {auc_score:.3f})')

    roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    roc_ax.set_xyt('False Positive Rate', 'True Positive Rate',
                   f'{name}\nROC Curves')
    roc_ax.legend(fontsize=8)
    roc_ax.grid(True, alpha=0.3)

plt.tight_layout()
stx.io.save(fig, "./figures/classification_evaluation.png", symlink_from_cwd=True)
plt.show()

## 2. Feature Importance and Selection

Analyze feature importance and perform feature selection.

In [None]:
# Feature importances exposed by the fitted baseline Random Forest
rf_model = results['Random Forest']['model']
feature_importance = rf_model.feature_importances_

# Rank features from most to least important
feature_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
)

print("Top 10 Most Important Features:")
print("=" * 35)
print(feature_df.head(10))

# Keep the 10 highest-ranked features and map their names back to column positions
top_features = feature_df['feature'].head(10).tolist()
column_of = {feat: position for position, feat in enumerate(feature_names)}
top_feature_indices = [column_of[feat] for feat in top_features]

print(f"\nSelected {len(top_features)} most important features")

In [None]:
# Retrain models with selected features
X_train_selected = X_train[:, top_feature_indices]
X_test_selected = X_test[:, top_feature_indices]

# Fit a dedicated scaler for the reduced feature set. Refitting the shared
# `scaler` here would overwrite its fit on the full feature matrix, so any
# later `scaler.transform(X_test)` would fail on the feature-count mismatch.
selected_scaler = StandardScaler()
X_train_selected_scaled = selected_scaler.fit_transform(X_train_selected)
X_test_selected_scaled = selected_scaler.transform(X_test_selected)

# Column widths shared by the header and the rows so the table lines up
# ('Selected Features' is 17 characters, so that column needs more than 15).
table_width = 20 + 1 + 15 + 1 + 18 + 1 + 10

print("Performance Comparison: All Features vs Selected Features")
print("=" * table_width)
print(f"{'Model':<20} {'All Features':<15} {'Selected Features':<18} {'Difference':<10}")
print("-" * table_width)

selected_results = {}

for name, model_info in results.items():
    # Build a fresh, unfitted estimator with the same hyperparameters
    model = type(model_info['model'])(**model_info['model'].get_params())

    # Same scaled/unscaled convention as the baseline training cell
    if name in ['SVM', 'Logistic Regression']:
        fit_X, eval_X = X_train_selected_scaled, X_test_selected_scaled
    else:
        fit_X, eval_X = X_train_selected, X_test_selected

    model.fit(fit_X, y_train)
    y_pred_selected = model.predict(eval_X)
    y_proba_selected = model.predict_proba(eval_X)

    # Evaluate with selected features
    report_selected = reporter.evaluate(y_test, y_pred_selected, y_proba_selected)

    # Compare against the all-features accuracy recorded earlier
    original_acc = model_info['report']['accuracy']
    selected_acc = report_selected['accuracy']
    difference = selected_acc - original_acc

    print(f"{name:<20} {original_acc:<15.3f} {selected_acc:<18.3f} {difference:<+10.3f}")

    selected_results[name] = {
        'model': model,
        'report': report_selected
    }

In [None]:
# Visualize feature importance and selection results
fig, axes = stx.plt.subplots(2, 2, figsize=(12, 10))

# Panel 1: horizontal bars for the 15 highest-ranked features
ax1 = axes[0, 0]
top_15 = feature_df.head(15)
bars = ax1.barh(range(len(top_15)), top_15['importance'], color='skyblue')
ax1.set_yticks(range(len(top_15)))
ax1.set_yticklabels(top_15['feature'])
ax1.set_xyt('Feature Importance', '', 'Top 15 Feature Importance')
ax1.grid(True, alpha=0.3)

# Panel 2: accuracy with all features vs. the selected subset, per model
ax2 = axes[0, 1]
model_names = list(results.keys())
all_features_acc = [results[name]['report']['accuracy'] for name in model_names]
selected_features_acc = [selected_results[name]['report']['accuracy'] for name in model_names]

x = np.arange(len(model_names))
width = 0.35

bars1 = ax2.bar(x - width/2, all_features_acc, width, label='All Features', alpha=0.7)
bars2 = ax2.bar(x + width/2, selected_features_acc, width, label='Selected Features', alpha=0.7)

ax2.set_xyt('Models', 'Accuracy', 'Performance: All vs Selected Features')
ax2.set_xticks(x)
ax2.set_xticklabels([name.replace(' ', '\n') for name in model_names])
ax2.legend()
ax2.grid(True, alpha=0.3)

# Panel 3: correlation heatmap of the top-ranked features (at most 10 for readability)
ax3 = axes[1, 0]
heatmap_indices = top_feature_indices[:10]
top_feature_data = X[:, heatmap_indices]
correlation_matrix = np.corrcoef(top_feature_data.T)

im = ax3.imshow(correlation_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
ax3.set_xyt('Features', 'Features', 'Feature Correlation Matrix')

# Label each row/column with the feature it actually shows: the rows are the
# top-ranked features, so a plain F1..F10 sequence would name the wrong columns.
heatmap_ticks = range(len(heatmap_indices))
heatmap_labels = [f'F{idx + 1}' for idx in heatmap_indices]
ax3.set_xticks(heatmap_ticks)
ax3.set_yticks(heatmap_ticks)
ax3.set_xticklabels(heatmap_labels, rotation=45)
ax3.set_yticklabels(heatmap_labels)

# Panel 4: per-class histogram of the single most important feature
ax4 = axes[1, 1]
most_important_idx = top_feature_indices[0]
most_important_data = X[:, most_important_idx]

for class_idx in range(len(class_names)):
    class_data = most_important_data[y == class_idx]
    ax4.hist(class_data, alpha=0.6, label=class_names[class_idx], bins=20)

ax4.set_xyt(f'{feature_names[most_important_idx]}', 'Frequency',
            'Most Important Feature Distribution')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
stx.io.save(fig, "./figures/feature_analysis.png", symlink_from_cwd=True)
plt.show()

## 3. Hyperparameter Optimization

Optimize model hyperparameters with cross-validated grid search, then inspect validation curves for key parameters.

In [None]:
from sklearn.model_selection import GridSearchCV, validation_curve
from sklearn.metrics import accuracy_score

# Hyperparameter grids to search, keyed by model display name
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.1],
        'kernel': ['rbf', 'linear']
    }
}

# Per-model search setup: (unfitted estimator, training matrix, test matrix).
# The forest uses the raw selected features; the SVM uses the standardized ones.
search_setups = {
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        X_train_selected,
        X_test_selected,
    ),
    'SVM': (
        SVC(probability=True, random_state=42),
        X_train_selected_scaled,
        X_test_selected_scaled,
    ),
}

print("Hyperparameter Optimization:")
print("=" * 35)

optimized_models = {}

for model_name, param_grid in param_grids.items():
    print(f"\nOptimizing {model_name}...")

    base_model, X_train_opt, X_test_opt = search_setups[model_name]

    # 5-fold cross-validated grid search scored on accuracy, using all cores
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(X_train_opt, y_train)

    # Score the best estimator found by the search on the held-out test split
    best_model = grid_search.best_estimator_
    y_pred_opt = best_model.predict(X_test_opt)
    y_proba_opt = best_model.predict_proba(X_test_opt)
    report_opt = reporter.evaluate(y_test, y_pred_opt, y_proba_opt)

    test_accuracy = report_opt['accuracy']
    optimized_models[model_name] = {
        'model': best_model,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'test_accuracy': test_accuracy,
        'report': report_opt
    }

    print(f"  Best CV Score: {grid_search.best_score_:.3f}")
    print(f"  Test Accuracy: {test_accuracy:.3f}")
    print(f"  Best Parameters: {grid_search.best_params_}")

    # Change relative to the untuned model trained on the same selected features
    baseline_acc = selected_results[model_name]['report']['accuracy']
    improvement = test_accuracy - baseline_acc
    print(f"  Improvement: {improvement:+.3f}")

In [None]:
# Validation curves for key hyperparameters
def _plot_validation_curve(ax, estimator, X, y, param_name, param_range,
                           xlabel, title, log_x=False):
    """Draw training and cross-validation accuracy against one hyperparameter.

    Runs `validation_curve` with 5-fold CV and accuracy scoring, then plots the
    per-value mean as a line with a +/- one standard deviation band for both
    the training and the validation scores.

    Parameters
    ----------
    ax : axis object to draw on (must provide `plot`/`semilogx`, `fill_between`,
        `set_xyt`, `legend`, `grid`).
    estimator : unfitted scikit-learn estimator to evaluate.
    X, y : training features and labels passed to `validation_curve`.
    param_name : name of the estimator parameter to vary.
    param_range : sequence of values to try for `param_name`.
    xlabel, title : axis label and panel title.
    log_x : when True, draw the lines with `semilogx` instead of `plot`.
    """
    train_scores, val_scores = validation_curve(
        estimator, X, y,
        param_name=param_name,
        param_range=param_range,
        cv=5, scoring='accuracy', n_jobs=-1
    )

    draw_line = ax.semilogx if log_x else ax.plot
    series = (
        (train_scores, 'blue', 'Training score'),
        (val_scores, 'red', 'Cross-validation score'),
    )
    for scores, color, label in series:
        # Scores have one row per parameter value and one column per CV fold
        scores_mean = np.mean(scores, axis=1)
        scores_std = np.std(scores, axis=1)
        draw_line(param_range, scores_mean, 'o-', color=color, label=label)
        ax.fill_between(param_range,
                        scores_mean - scores_std,
                        scores_mean + scores_std,
                        alpha=0.1, color=color)

    ax.set_xyt(xlabel, 'Accuracy', title)
    ax.legend()
    ax.grid(True, alpha=0.3)


fig, axes = stx.plt.subplots(1, 2, figsize=(12, 5))

# Random Forest: n_estimators validation curve
ax1 = axes[0]
n_estimators_range = [10, 50, 100, 150, 200, 250, 300]
_plot_validation_curve(
    ax1,
    RandomForestClassifier(random_state=42, max_depth=20),
    X_train_selected, y_train,
    param_name='n_estimators',
    param_range=n_estimators_range,
    xlabel='Number of Estimators',
    title='Random Forest: Validation Curve',
)

# SVM: C parameter validation curve (log-scaled x axis, C spans several decades)
ax2 = axes[1]
C_range = [0.01, 0.1, 1, 10, 100]
_plot_validation_curve(
    ax2,
    SVC(random_state=42, gamma='scale'),
    X_train_selected_scaled, y_train,
    param_name='C',
    param_range=C_range,
    xlabel='C Parameter',
    title='SVM: Validation Curve',
    log_x=True,
)

plt.tight_layout()
stx.io.save(fig, "./figures/hyperparameter_optimization.png", symlink_from_cwd=True)
plt.show()

## 4. Neural Network with SciTeX

Simple feed-forward neural networks built with scikit-learn's `MLPClassifier` and evaluated with the SciTeX classification reporter.

In [None]:
# Simple neural network implementation
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import learning_curve

# Hidden-layer layouts to compare, keyed by display name
nn_architectures = {
    'Small NN': (50,),
    'Medium NN': (100, 50),
    'Large NN': (200, 100, 50)
}

print("Neural Network Training:")
print("=" * 30)

nn_results = {}

for name, hidden_layers in nn_architectures.items():
    print(f"\nTraining {name} {hidden_layers}...")

    # Early stopping holds out 10% of the training data for validation
    mlp_settings = dict(
        hidden_layer_sizes=hidden_layers,
        max_iter=1000,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1,
    )
    nn_model = MLPClassifier(**mlp_settings)
    nn_model.fit(X_train_selected_scaled, y_train)

    # Predict and evaluate on the held-out test split
    y_pred_nn = nn_model.predict(X_test_selected_scaled)
    y_proba_nn = nn_model.predict_proba(X_test_selected_scaled)
    report_nn = reporter.evaluate(y_test, y_pred_nn, y_proba_nn)

    # Trainable parameters = all weight-matrix entries plus all bias entries
    weight_count = sum(weights.size for weights in nn_model.coefs_)
    bias_count = sum(biases.size for biases in nn_model.intercepts_)

    summary = {
        'model': nn_model,
        'architecture': hidden_layers,
        'n_params': weight_count + bias_count,
        'n_iter': nn_model.n_iter_,
        'loss_curve': nn_model.loss_curve_,
        'report': report_nn
    }
    nn_results[name] = summary

    print(f"  Architecture: {hidden_layers}")
    print(f"  Parameters: {summary['n_params']}")
    print(f"  Iterations: {summary['n_iter']}")
    print(f"  Test Accuracy: {report_nn['accuracy']:.3f}")
    print(f"  F1-score: {report_nn['f1_macro']:.3f}")

In [None]:
# Visualize neural network training and performance
fig, axes = stx.plt.subplots(2, 2, figsize=(12, 10))

# Panel 1: training loss per iteration for each architecture
ax1 = axes[0, 0]
colors = ['blue', 'red', 'green']
for i, (name, result) in enumerate(nn_results.items()):
    ax1.plot(result['loss_curve'], color=colors[i], label=name)

ax1.set_xyt('Iteration', 'Loss', 'Neural Network Training Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2: accuracy and macro F1 side by side for each architecture
ax2 = axes[0, 1]
nn_names = list(nn_results.keys())
nn_accuracies = [nn_results[name]['report']['accuracy'] for name in nn_names]
nn_f1_scores = [nn_results[name]['report']['f1_macro'] for name in nn_names]

x = np.arange(len(nn_names))
width = 0.35

bars1 = ax2.bar(x - width/2, nn_accuracies, width, label='Accuracy', alpha=0.7)
bars2 = ax2.bar(x + width/2, nn_f1_scores, width, label='F1-score', alpha=0.7)

ax2.set_xyt('Neural Network', 'Score', 'Neural Network Performance')
ax2.set_xticks(x)
ax2.set_xticklabels(nn_names)
ax2.legend()
ax2.grid(True, alpha=0.3)

# Panel 3: parameter count vs. test accuracy
ax3 = axes[1, 0]
nn_params = [nn_results[name]['n_params'] for name in nn_names]

ax3.scatter(nn_params, nn_accuracies, s=100, alpha=0.7, color='purple')
for i, name in enumerate(nn_names):
    ax3.annotate(name, (nn_params[i], nn_accuracies[i]),
                xytext=(5, 5), textcoords='offset points')

ax3.set_xyt('Number of Parameters', 'Test Accuracy', 'Model Complexity vs Performance')
ax3.grid(True, alpha=0.3)

# Panel 4: test accuracy of the tuned classical models and all networks
ax4 = axes[1, 1]
all_model_names = list(optimized_models.keys()) + list(nn_results.keys())
all_accuracies = ([optimized_models[name]['test_accuracy'] for name in optimized_models.keys()] +
                 [nn_results[name]['report']['accuracy'] for name in nn_results.keys()])

colors = ['lightblue', 'lightcoral', 'lightgreen', 'lightyellow', 'lightpink']
bars = ax4.bar(range(len(all_model_names)), all_accuracies,
               color=colors[:len(all_model_names)], alpha=0.7)

# Add value labels on bars
for bar, acc in zip(bars, all_accuracies):
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

ax4.set_xyt('Models', 'Test Accuracy', 'Final Model Comparison')
ax4.set_xticks(range(len(all_model_names)))
ax4.set_xticklabels([name.replace(' ', '\n') for name in all_model_names], rotation=0)

# Zoom in on the 0.8+ range only when every bar reaches it; a fixed 0.8 floor
# would otherwise hide any model scoring at or below 0.8.
lowest_accuracy = min(all_accuracies)
y_floor = 0.8 if lowest_accuracy > 0.8 else max(0.0, lowest_accuracy - 0.05)
ax4.set_ylim(y_floor, max(all_accuracies) + 0.05)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
stx.io.save(fig, "./figures/neural_network_analysis.png", symlink_from_cwd=True)
plt.show()

## 5. Learning Curves and Model Diagnostics

Analyze learning behavior and diagnose potential issues.

In [None]:
# Generate learning curves for best models
def plot_learning_curve(model, X, y, title, ax):
    """Plot training and cross-validation accuracy against training-set size.

    Calls `learning_curve` with 5-fold CV at ten training-set fractions evenly
    spaced from 10% to 100%, then draws the mean score of each series with a
    +/- one standard deviation band on `ax`.

    Parameters
    ----------
    model : estimator passed to `learning_curve`.
    X, y : features and labels to draw the training subsets from.
    title : panel title.
    ax : axis object to draw on.

    Returns
    -------
    tuple
        `(train_sizes, train_scores_mean, val_scores_mean)`.
    """
    size_fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        cv=5,
        n_jobs=-1,
        train_sizes=size_fractions,
        scoring='accuracy',
    )

    # Average over the CV folds (axis 1) for each training-set size
    train_scores_mean = np.mean(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)

    series = (
        (train_scores_mean, np.std(train_scores, axis=1), 'blue', 'Training score'),
        (val_scores_mean, np.std(val_scores, axis=1), 'red', 'Cross-validation score'),
    )
    for mean, spread, color, label in series:
        ax.plot(train_sizes, mean, 'o-', color=color, label=label)
        ax.fill_between(train_sizes, mean - spread, mean + spread,
                        alpha=0.1, color=color)

    ax.set_xyt('Training Set Size', 'Accuracy', title)
    ax.legend()
    ax.grid(True, alpha=0.3)

    return train_sizes, train_scores_mean, val_scores_mean

# Plot learning curves for best models
fig, axes = stx.plt.subplots(2, 2, figsize=(12, 10))

# Random Forest learning curve (tuned model, raw selected features)
rf_model = optimized_models['Random Forest']['model']
plot_learning_curve(rf_model, X_train_selected, y_train,
                   'Random Forest Learning Curve', axes[0, 0])

# SVM learning curve (tuned model, standardized selected features)
svm_model = optimized_models['SVM']['model']
plot_learning_curve(svm_model, X_train_selected_scaled, y_train,
                   'SVM Learning Curve', axes[0, 1])

# Neural network learning curve: pick the architecture with the highest test
# accuracy instead of assuming a particular one is the best.
best_nn_name = max(nn_results, key=lambda nn_name: nn_results[nn_name]['report']['accuracy'])
best_nn = nn_results[best_nn_name]['model']
plot_learning_curve(best_nn, X_train_selected_scaled, y_train,
                   f'Neural Network Learning Curve ({best_nn_name})', axes[1, 0])

# Bias-variance panel. These numbers are hand-written to illustrate the
# textbook shape; they are NOT measured from the models trained above.
ax = axes[1, 1]
complexities = [1, 2, 5, 10, 20, 50, 100]
bias_errors = [0.15, 0.12, 0.08, 0.05, 0.03, 0.02, 0.015]  # Decreasing with complexity
variance_errors = [0.01, 0.015, 0.025, 0.04, 0.08, 0.15, 0.25]  # Increasing with complexity
total_errors = [b + v for b, v in zip(bias_errors, variance_errors)]

ax.plot(complexities, bias_errors, 'o-', label='Bias²', color='blue')
ax.plot(complexities, variance_errors, 's-', label='Variance', color='red')
ax.plot(complexities, total_errors, '^-', label='Total Error', color='green', linewidth=2)

ax.set_xyt('Model Complexity', 'Error', 'Bias-Variance Tradeoff (Illustrative)')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xscale('log')

plt.tight_layout()
stx.io.save(fig, "./figures/learning_curves_diagnostics.png", symlink_from_cwd=True)
plt.show()

## 6. Model Interpretability and Explainability

Understand what the models have learned.

In [None]:
# Compare what two different model families consider important
print("Model Interpretability Analysis:")
print("=" * 40)

# Tuned Random Forest: one importance value per selected feature
best_model = optimized_models['Random Forest']['model']
rf_importance = best_model.feature_importances_

# Logistic Regression fitted on the same standardized selected features;
# importance is the absolute coefficient averaged across classes.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_selected_scaled, y_train)
lr_coefs = np.abs(lr_model.coef_).mean(axis=0)
lr_importance = lr_coefs / lr_coefs.max()  # rescale so the largest value is 1

# One row per selected feature, in selection order
selected_feature_labels = [feature_names[i] for i in top_feature_indices]
importance_df = pd.DataFrame({
    'feature': selected_feature_labels,
    'rf_importance': rf_importance,
    'lr_importance': lr_importance
})

print("\nFeature Importance Comparison:")
print(importance_df.sort_values('rf_importance', ascending=False).head(10))

# Prediction confidence: the highest class probability for each test sample
y_proba_best = best_model.predict_proba(X_test_selected)
prediction_confidence = np.max(y_proba_best, axis=1)
predictions = np.argmax(y_proba_best, axis=1)
correct_predictions = (predictions == y_test)

confidence_when_correct = np.mean(prediction_confidence[correct_predictions])
confidence_when_wrong = np.mean(prediction_confidence[~correct_predictions])

print("\nPrediction Confidence Analysis:")
print(f"Mean confidence: {np.mean(prediction_confidence):.3f}")
print(f"Confidence for correct predictions: {confidence_when_correct:.3f}")
print(f"Confidence for incorrect predictions: {confidence_when_wrong:.3f}")

In [None]:
# Visualize model interpretability
from sklearn.decomposition import PCA

fig, axes = stx.plt.subplots(2, 2, figsize=(12, 10))

# Panel 1: Random Forest vs. Logistic Regression importance per selected feature
ax1 = axes[0, 0]
x = np.arange(len(importance_df))
width = 0.35

bars1 = ax1.bar(x - width/2, importance_df['rf_importance'], width,
                label='Random Forest', alpha=0.7)
bars2 = ax1.bar(x + width/2, importance_df['lr_importance'], width,
                label='Logistic Regression', alpha=0.7)

ax1.set_xyt('Features', 'Importance', 'Feature Importance Comparison')
ax1.set_xticks(x)
# Label each bar pair with the feature it belongs to (e.g. 'feature_7' -> 'F7');
# a positional F1..Fn sequence would point at the wrong features.
ax1.set_xticklabels(
    [feature.replace('feature_', 'F') for feature in importance_df['feature']],
    rotation=45
)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2: confidence distribution for correct vs. incorrect predictions
ax2 = axes[0, 1]
ax2.hist(prediction_confidence[correct_predictions], alpha=0.7,
         label='Correct Predictions', bins=20, color='green')
ax2.hist(prediction_confidence[~correct_predictions], alpha=0.7,
         label='Incorrect Predictions', bins=20, color='red')

ax2.set_xyt('Prediction Confidence', 'Frequency', 'Confidence Distribution')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Panel 3: confidence distribution split by true class
ax3 = axes[1, 0]
for class_idx in range(len(class_names)):
    class_mask = (y_test == class_idx)
    class_confidence = prediction_confidence[class_mask]
    ax3.hist(class_confidence, alpha=0.6, label=class_names[class_idx], bins=15)

ax3.set_xyt('Prediction Confidence', 'Frequency', 'Confidence by True Class')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Panel 4: test samples projected onto their first two principal components.
# This is a scatter of the data, not a decision boundary; the PCA is fitted on
# the test features purely for display.
ax4 = axes[1, 1]
pca = PCA(n_components=2)
X_test_2d = pca.fit_transform(X_test_selected)

# Plot test points colored by true class
colors = ['blue', 'red', 'green']
for class_idx in range(len(class_names)):
    mask = (y_test == class_idx)
    ax4.scatter(X_test_2d[mask, 0], X_test_2d[mask, 1],
               c=colors[class_idx], label=class_names[class_idx],
               alpha=0.6, s=30)

# Overlay a black cross on every sample the tuned Random Forest got wrong
misclassified = ~correct_predictions
ax4.scatter(X_test_2d[misclassified, 0], X_test_2d[misclassified, 1],
           marker='x', s=100, c='black', label='Misclassified')

ax4.set_xyt('First Principal Component', 'Second Principal Component',
            'Test Set Visualization (PCA)')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
stx.io.save(fig, "./figures/model_interpretability.png", symlink_from_cwd=True)
plt.show()

## Summary

This notebook demonstrated the comprehensive machine learning capabilities of SciTeX's AI module:

### Key Features Covered:

1. **Classification Workflows**: Complete pipeline with SciTeX ClassificationReporter
2. **Feature Analysis**: Importance ranking, selection, and correlation analysis
3. **Hyperparameter Optimization**: Grid search and validation curves
4. **Neural Networks**: Multiple architectures with training analysis
5. **Learning Diagnostics**: Learning curves and bias-variance analysis
6. **Model Interpretability**: Feature importance and prediction confidence

### SciTeX AI Advantages:

- **Automated Reporting**: Comprehensive evaluation with ROC curves, confusion matrices
- **Feature Management**: Built-in feature selection and importance analysis
- **Visualization Integration**: Seamless plotting with data export
- **Best Practices**: Proper validation, scaling, and diagnostic workflows
- **Extensibility**: Easy integration with scikit-learn and other ML libraries

### Best Practices Demonstrated:

- Always split data properly and use cross-validation
- Scale features for algorithms that need it (SVM, Logistic Regression, Neural Networks)
- Perform feature selection to reduce overfitting
- Optimize hyperparameters systematically
- Analyze learning curves to diagnose bias/variance
- Evaluate model confidence and interpretability

### Next Steps:
- Explore `scitex.nn` for advanced neural network architectures
- Use `scitex.dsp` for feature extraction from signal data
- Check `scitex.stats` for statistical validation of ML results
- Try `scitex.ai.genai` for LLM integration in ML workflows

In [None]:
# Assemble each section of the results separately, then combine and save them

# Accuracy per model for each stage of the notebook
baseline_accuracy = {name: result['report']['accuracy'] for name, result in results.items()}
optimized_accuracy = {name: result['test_accuracy'] for name, result in optimized_models.items()}
nn_accuracy = {name: result['report']['accuracy'] for name, result in nn_results.items()}

# Best hyperparameters and test accuracy of the two tuned models
tuned_rf = optimized_models['Random Forest']
tuned_svm = optimized_models['SVM']

# Mean confidence overall, and split by whether the prediction was right
confidence_summary = {
    'mean_confidence': float(np.mean(prediction_confidence)),
    'correct_confidence': float(np.mean(prediction_confidence[correct_predictions])),
    'incorrect_confidence': float(np.mean(prediction_confidence[~correct_predictions]))
}

ml_results = {
    'dataset_info': {
        'n_samples': len(X),
        'n_features': X.shape[1],
        'n_classes': len(class_names),
        'class_names': class_names,
        'feature_names': feature_names
    },
    'feature_selection': {
        'selected_features': top_features,
        'feature_importance': feature_df.to_dict('records')
    },
    'model_performance': {
        'baseline': baseline_accuracy,
        'optimized': optimized_accuracy,
        'neural_networks': nn_accuracy
    },
    'best_models': {
        'random_forest': {
            'params': tuned_rf['best_params'],
            'accuracy': tuned_rf['test_accuracy']
        },
        'svm': {
            'params': tuned_svm['best_params'],
            'accuracy': tuned_svm['test_accuracy']
        }
    },
    'interpretability': {
        'feature_importance_comparison': importance_df.to_dict('records'),
        'prediction_confidence': confidence_summary
    }
}

# Persist the summary plus the test split used for evaluation
stx.io.save(ml_results, "./data/ml_analysis_results.json", symlink_from_cwd=True)
stx.io.save(X_test_selected, "./data/test_features.npy", symlink_from_cwd=True)
stx.io.save(y_test, "./data/test_labels.npy", symlink_from_cwd=True)

for status_line in (
    "\n✅ Machine Learning analysis complete!",
    "🤖 Results saved to ./data/ml_analysis_results.json",
    "📈 Figures saved to ./figures/",
    "🔬 Test data saved for further analysis",
):
    print(status_line)

# Final summary: highest test accuracy among the tuned models and the networks
best_accuracy = max(list(optimized_accuracy.values()) + list(nn_accuracy.values()))
print(f"\n🏆 Best model accuracy: {best_accuracy:.3f}")