<a href="https://colab.research.google.com/github/rhodes-byu/cs-stat-180/blob/main/notebooks/15a-cross-validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import randint

# Apply seaborn's dark-grid theme to every plot in the notebook.
# set_theme() is the preferred spelling; sns.set() is only a legacy alias for it.
sns.set_theme(style="darkgrid")

### **Loading a Dataset**

In [None]:
# Load one of scikit-learn's bundled datasets.
# Swap the loader to try another one: load_iris, load_wine, load_breast_cancer.
data = datasets.load_breast_cancer()

# List the fields available on the loaded dataset object.
dataset_fields = data.keys()
print(dataset_fields)


In [None]:
# Show the feature (column) names followed by the class-label names, one per line.
print(data.feature_names, data.target_names, sep="\n")

In [None]:
# Feature matrix and label vector used by every model below.
X, y = data.data, data.target

# Tabular view of the same data: one column per feature plus a 'target' column.
df = pd.DataFrame(X, columns=data.feature_names).assign(target=y)

df.head()

### **Data Summary**

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df.info()

# Check if the target variable is balanced
target_counts = df['target'].value_counts()
print(target_counts)

# Plot the distribution of the target variable
sns.countplot(x='target', data=df)
plt.title('Distribution of Target Variable')
plt.show()



### **Splitting Data into Training and Testing Sets**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows/columns landed in each partition.
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

### **Training and Evaluating a K-Nearest Neighbors (KNN) Classifier**

In [None]:
# Build a 5-neighbour KNN classifier and fit it on the training split
# (fit() returns the estimator itself, so the two steps can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred_knn = knn.predict(X_test)

# Score the predictions against the true test labels.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)

print("KNN Accuracy:", knn_accuracy)
print("\nClassification Report:\n", knn_report)

### **Cross-Validation**

In [None]:
# Perform 5-fold cross-validation of the KNN model on the full dataset,
# scoring each fold with the weighted F1 metric.
scores = cross_val_score(knn, X, y, cv=5, scoring='f1_weighted')

# Summarize as mean ± standard deviation across the folds.
# The separator is a real "±" character (it was previously mis-encoded as "Â±").
print(f"{knn} Cross-Validation Score: {scores.mean():.2f} ± {scores.std():.2f}")

### **Hyperparameter Tuning**: GridSearchCV

In [None]:
# Hyperparameter tuning for KNN: every combination of the values below is tried.
param_grid_knn = dict(
    n_neighbors=[1, 3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'cosine'],
)

# Exhaustive search with 5-fold cross-validation on the training split only,
# ranked by accuracy; n_jobs=-1 is passed to parallelize the search.
# fit() returns the search object itself, so construction and fitting are chained.
grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best parameters for KNN:", grid_knn.best_params_)
print("Best cross-validation score for KNN:", grid_knn.best_score_)

In [None]:
# Evaluate the winning configuration from the grid search on the held-out test split.
best_knn = grid_knn.best_estimator_
y_pred_knn_tuned = best_knn.predict(X_test)

tuned_test_accuracy = accuracy_score(y_test, y_pred_knn_tuned)
print("Test accuracy for best KNN model:", tuned_test_accuracy)

In [None]:
# Histogram of the mean cross-validated score of every grid-search candidate.
mean_cv_scores = grid_knn.cv_results_['mean_test_score']
ax = sns.histplot(mean_cv_scores)

# Label the axes object returned by seaborn directly.
ax.set_xlabel('Mean Test Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Mean Test Scores for KNN')
plt.show()

### **Visualizing Hyperparameter Search Results**

In [None]:
# Visualize GridSearchCV results for KNN
results_knn = pd.DataFrame(grid_knn.cv_results_)

# Rows are n_neighbors, columns are the weighting scheme. The grid also varies the
# distance metric, so each (n_neighbors, weights) pair has several candidates;
# pivot_table collapses them with its default aggregation (the mean), so every
# cell is the mean CV score averaged over the metrics tried.
results_knn_pivot = results_knn.pivot_table(index='param_n_neighbors',
                                            columns='param_weights',
                                            values='mean_test_score')

plt.figure(figsize=(10, 6))
sns.heatmap(
    results_knn_pivot,
    annot=True, fmt=".3f", cmap="viridis"
)
plt.title("KNN Hyperparameter Search Results")
plt.ylabel("Number of Neighbors")
# The columns come from 'param_weights', so the x-axis shows the weighting scheme
# (it was previously mislabeled "Metric").
plt.xlabel("Weights")
plt.show()