<a href="https://colab.research.google.com/github/rhodes-byu/cs-stat-180/blob/main/notebooks/11-intro-ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import randint

# Apply seaborn's dark-grid theme to every figure drawn in this notebook.
# `set_theme` is the preferred entry point; `sns.set` is only a legacy alias for it.
sns.set_theme(style="darkgrid")

### **Loading a Dataset**

In [None]:
# Load one of scikit-learn's bundled datasets into `data`.
data = datasets.load_breast_cancer() # Pick a dataset: iris, wine, breast_cancer
# List the fields available on the returned dataset object.
print(data.keys())


In [None]:
# Show the feature names first and the class labels on the following line.
print(data.feature_names, data.target_names, sep="\n")

In [None]:
# Feature matrix and label vector used by every model below.
X, y = data.data, data.target

# Tabular view of the same data: one column per feature, labels appended as 'target'.
df = pd.DataFrame(X, columns=data.feature_names).assign(target=y)

df.head()

### **Data Summary**

In [None]:
# Column dtypes and non-null counts. `DataFrame.info()` prints its report itself
# and returns None, so it is called bare; wrapping it in print() would emit a
# stray "None" line after the report.
df.info()

# Check whether the target classes are balanced
target_counts = df['target'].value_counts()
print(target_counts)

# Plot the class distribution of the target variable
sns.countplot(x = 'target', data = df)
plt.title('Distribution of Target Variable')
plt.show()



### **Splitting Data into Training and Testing Sets**

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many samples and features ended up in each split.
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

### **Training and Evaluating a K-Nearest Neighbors (KNN) Classifier**

In [None]:
# Build a 5-nearest-neighbours classifier and fit it on the training split in
# one step (`fit` returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred_knn = knn.predict(X_test)

# Fraction of test samples classified correctly
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))

### **Training and Evaluating a Logistic Regression Model**

In [None]:
# Initialize the model.
# The features are still unscaled at this point, and on unscaled data the solver
# typically needs far more than 200 iterations to converge. A generous cap avoids
# a ConvergenceWarning and keeps this baseline from being an under-fitted model.
log_reg = LogisticRegression(max_iter = 10000)

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))

### **Training and Evaluating a Decision Tree Classifier**

In [None]:
# Create the tree with a fixed seed (reproducible splits) and fit it on the
# training data; `fit` hands back the fitted estimator.
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred_tree = tree.predict(X_test)

# Fraction of test samples classified correctly
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_tree))

In [None]:
### Visualizing the Decision Tree
plt.figure(figsize=(20, 10))
# Pass plain lists for the names: some scikit-learn releases validate these two
# parameters strictly and reject NumPy arrays. `list(...)` also works unchanged
# when the dataset already stores the names as a list.
plot_tree(
    tree,
    filled=True,
    feature_names=list(data.feature_names),
    class_names=list(data.target_names),
);

### **Visualizing the Confusion Matrix**

A confusion matrix compares the actual labels with the predicted labels to show how many instances were correctly or incorrectly classified for each class.

In [None]:
# Count matrix of true labels versus the KNN predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap, labelling both axes with
# the class names, then title the axes through the returned Axes object.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=data.target_names,
    yticklabels=data.target_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix for KNN")
plt.show()

### **Feature Scaling: StandardScaler**

In [None]:
# Standardize features (zero mean, unit variance)
scaler = StandardScaler()

# Learn the per-feature statistics from the training split only and scale it in
# the same call; the test split is then scaled with those training statistics.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: per-feature mean and standard deviation of the scaled training data
print("Mean of scaled features (train):", X_train_scaled.mean(axis=0))
print("Standard deviation of scaled features (train):", X_train_scaled.std(axis=0))

### **Feature Scaling: MinMaxScaler**

In [None]:
# Scale features to the [0, 1] range
minmax_scaler = MinMaxScaler()

# Learn each feature's range from the training split only...
minmax_scaler.fit(X_train)

# ...then apply that same range to both splits.
X_train_minmax = minmax_scaler.transform(X_train)
X_test_minmax = minmax_scaler.transform(X_test)

# Per-feature extremes of the scaled training data
print("Minimum of scaled features (train):", X_train_minmax.min(axis=0))
print("Maximum of scaled features (train):", X_train_minmax.max(axis=0))

# Per-feature extremes of the scaled test data (scaled with the training range)
print("Minimum of scaled features (test):", X_test_minmax.min(axis=0))
print("Maximum of scaled features (test):", X_test_minmax.max(axis=0))

### **Training a K-Nearest Neighbors (KNN) Classifier with Scaled Data**

In [None]:
# Initialize the model.
# Use the same number of neighbours (5) as the unscaled KNN baseline so that the
# accuracy comparison printed below isolates the effect of feature scaling
# instead of mixing it with a different k.
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on scaled data
knn.fit(X_train_scaled, y_train)

# Make predictions
y_pred_knn_scaled = knn.predict(X_test_scaled)

# Evaluate the model: scaled features versus the earlier unscaled predictions
print("KNN Accuracy (scaled):", accuracy_score(y_test, y_pred_knn_scaled))
print("KNN Accuracy (not scaled):", accuracy_score(y_test, y_pred_knn))

### **Training a Decision Tree Classifier with Scaled Data**

In [None]:
# Refit the existing tree on the standardized features and predict in one chain
# (`fit` returns the estimator, so `predict` can follow it directly).
y_pred_tree_scaled = tree.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Compare against the predictions made earlier from the unscaled features
print("Decision Tree Accuracy (scaled):", accuracy_score(y_test, y_pred_tree_scaled))
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_tree))


### **Training a Logistic Regression Model with Scaled Data**

In [None]:
# Create a fresh logistic regression (iteration cap of 200) and fit it on the
# standardized training features; `fit` returns the fitted estimator.
log_reg = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)

# Predict labels for the standardized test features
y_pred_log_reg_scaled = log_reg.predict(X_test_scaled)

# Compare against the predictions made earlier from the unscaled features
print("Logistic Regression Accuracy (scaled):", accuracy_score(y_test, y_pred_log_reg_scaled))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))

### **Visualizing the Confusion Matrix (Scaled Data)**

In [None]:
# Confusion matrix for a single model: logistic regression trained on scaled data.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_log_reg_scaled)

# Render it as an annotated heatmap with class names on both axes, then label
# the plot through the returned Axes object.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=data.target_names,
    yticklabels=data.target_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix for Logistic Regression (Scaled)")
plt.show()
