---
title: "Supervised Learning"
format:
    html: 
        code-fold: false
---

<!-- After digesting the instructions, you can delete this cell. These are assignment instructions and do not need to be included in your final submission. -->

{{< include instructions.qmd >}} 

# Code 



In [None]:
#possible packages
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import f1_score

# Load the data set that was saved to disk earlier.
d1 = pd.read_csv('out.csv')
# Drop two columns that are not used in this analysis.
# NOTE(review): "Unnamed: 0" is presumably the index written by an earlier
# to_csv call -- confirm against the code that produced out.csv.
d1 = d1.drop(columns=["Unnamed: 0", "D_cluster"])

# Features are every column except the target ('outs') and the 'cluster' label.
so_features = d1.drop(columns=['outs', 'cluster'])
so_target = d1['outs']

# features except categorical variable
pitch_index = ['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'release_extension']

# 80:20 train/test split. The seed is fixed (same value as the later
# probability-model split) so that the split -- and therefore the tuning
# curves used to pick max_depth -- is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    so_features, so_target, test_size=0.2, random_state=42
)


#max_depth tuning process 
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def confusion_plot(y_data, y_pred):
    """Print summary metrics for a binary classifier and draw its confusion matrix.

    Parameters
    ----------
    y_data : array-like
        True labels. Recall and precision are computed with ``pos_label`` 0
        and 1, so the labels are treated as binary 0/1.
    y_pred : array-like
        Predicted labels, in the same order as ``y_data``.

    Returns
    -------
    None
        Everything is reported through ``print`` and a matplotlib figure.
    """
    cm = confusion_matrix(y_data, y_pred)
    # Correct predictions sit on the diagonal of the confusion matrix.
    pct_correct = round((cm.diagonal().sum() / cm.sum()) * 100, 2)

    # (recall, precision) per class, each class in turn acting as "positive".
    per_class = {}
    for label in (0, 1):
        per_class[label] = (
            recall_score(y_data, y_pred, pos_label=label),
            precision_score(y_data, y_pred, pos_label=label),
        )
    neg_recall, neg_precision = per_class[0]
    pos_recall, pos_precision = per_class[1]

    report_lines = [
        f"ACCURACY: {pct_correct}%",
        f"NEGATIVE RECALL(Y=0):{neg_recall}",
        f"NEGATIVE PRECISION(Y=0): {neg_precision}",
        f"POSITIVE RECALL(Y=1): {pos_recall}",
        f"POSITIVE PRECISION (Y=1): {pos_precision}",
    ]
    for line in report_lines:
        print(line)
    print(np.array(cm))

    ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    plt.title("Confusion Matrix")
    plt.show()

# Sweep max_depth from 1 to 19 and record, for the training and the test set,
# one row per depth: [depth, accuracy, recall for class 0, recall for class 1].
test_results = []
train_results = []

for num_layer in range(1, 20):
    model = tree.DecisionTreeClassifier(max_depth=num_layer)
    model = model.fit(x_train, y_train)

    yp_train = model.predict(x_train)
    yp_test = model.predict(x_test)

    test_results.append([
        num_layer,
        accuracy_score(y_test, yp_test),
        recall_score(y_test, yp_test, pos_label=0),
        recall_score(y_test, yp_test, pos_label=1),
    ])
    train_results.append([
        num_layer,
        accuracy_score(y_train, yp_train),
        recall_score(y_train, yp_train, pos_label=0),
        recall_score(y_train, yp_train, pos_label=1),
    ])

result_columns = ["Depth", "Accuracy", "Negative_recall", "Positive_recall"]
test_results = pd.DataFrame(test_results, columns=result_columns)
train_results = pd.DataFrame(train_results, columns=result_columns)

# One figure per metric (plot layout recycled from the lecture notes).
# The line colours are set explicitly so they match the "blue"/"red" wording
# in the axis labels; matplotlib's default second colour is orange, not red.
# Accuracy is an overall score, so its label carries no "(Y = ...)" class tag.
metric_labels = [
    ("Accuracy", "Accuracy: Training (blue) and Test(red)"),
    ("Negative_recall", "Recall(Y = 0): Training (blue) and Test(red)"),
    ("Positive_recall", "Recall(Y = 1): Training (blue) and Test(red)"),
]
for metric, y_label in metric_labels:
    plt.plot(train_results["Depth"], train_results[metric], marker='o', color='blue')
    plt.plot(test_results["Depth"], test_results[metric], marker='o', color='red')
    plt.xlabel("Number of layers in decision tree(max_depth)")
    plt.ylabel(y_label)
    plt.show()
# Fit a single decision tree with max_depth = 4.
# NOTE: despite the "rf_" prefix on the variable names, this model is a
# DecisionTreeClassifier, not a random forest; the plot titles say so.
from sklearn import tree
rf_model = tree.DecisionTreeClassifier(max_depth=4)
rf_model = rf_model.fit(x_train, y_train)
yp_train = rf_model.predict(x_train)
yp_test = rf_model.predict(x_test)

# Feature-importance plot: argsort gives ascending importance, so the most
# important feature ends up as the top bar of the horizontal bar chart.
rf_feature_importance = rf_model.feature_importances_
rf_features = so_features.columns
rf_idx = rf_feature_importance.argsort()

plt.figure(figsize=(10, 8))
plt.barh(rf_features[rf_idx], rf_feature_importance[rf_idx], color='orange')
plt.xlabel('Importance', size=10)
plt.title('Feature Importance from Decision Tree Model', size=15)
plt.show()

# Confusion matrix and metrics on both splits.
print("------TRAINING------")
confusion_plot(y_train, yp_train)
print("------TEST------")
confusion_plot(y_test, yp_test)


# Model evaluation with the ROC curve on the test split.
from sklearn.metrics import roc_curve, roc_auc_score
# Column 1 of predict_proba is the predicted probability of the second class.
rf_prob = rf_model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, rf_prob)
auc_score = roc_auc_score(y_test, rf_prob)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', label=f'ROC curve (area = {auc_score:.2f})')
# Diagonal reference line for comparison with the model's curve.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Decision Tree')
plt.legend(loc="lower right")
plt.show()


# Draw the fitted tree itself.
from sklearn.tree import plot_tree
plt.figure(figsize=(80, 50))
plot_tree(
    rf_model,
    feature_names=x_train.columns,
    # NOTE(review): assumes target value 0 means "in-play" and 1 means "out" -- confirm.
    class_names=["in-play", "out"],
    filled=True,
    rounded=True
)
plt.title("decision tree with max_depth=4")
plt.show()


# Second objective: repeat the classification separately inside each cluster
# (the 'cluster' column holds the k-means labels).
so_features_clusters = d1.drop(columns=['outs'])
cluster_list = []     # one metrics dict per cluster
roc_auc_list = {}     # cluster id -> area under its ROC curve

for cluster_id in so_features_clusters['cluster'].unique():
    # Rows belonging to the current cluster, and their targets.
    in_cluster = so_features_clusters['cluster'] == cluster_id
    cluster_data = so_features_clusters[in_cluster]
    cluster_target = so_target[in_cluster]

    # Stratified 80:20 split within the cluster.
    x_train, x_test, y_train, y_test = train_test_split(
        cluster_data,
        cluster_target,
        test_size=0.2,
        stratify=cluster_target,
    )

    cluster_model = RandomForestClassifier(max_depth=6)
    cluster_model.fit(x_train, y_train)
    y_pred = cluster_model.predict(x_test)

    # Test-set scores for this cluster.
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cluster_scores = {
        'cluster': cluster_id,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'f1 score': f1
    }
    cluster_list.append(cluster_scores)

    # ROC curve from the predicted probability of the second class.
    y_pred_prob = cluster_model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    roc_auc_list[cluster_id] = roc_auc

    # All clusters share one set of axes; the figure is shown after the loop.
    plt.plot(fpr, tpr, label=f'Cluster {cluster_id} (AUC = {roc_auc:.2f})')

# Diagonal reference line, then finish and show the combined figure.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves by Cluster')
plt.legend(loc='lower right')
plt.grid()
plt.show()

# Collect the per-cluster metrics in a table (displayed as the cell output).
metrics_df = pd.DataFrame(cluster_list)
metrics_df
    


# Depth tuning as before, this time on the data of a single cluster.
# NOTE(review): `cluster_id` is not assigned in this block; it holds whatever
# value the preceding code left in it, so only that one cluster is tuned here.
test_results = []
train_results = []
cluster_data = so_features_clusters[so_features_clusters['cluster'] == cluster_id]
cluster_target = so_target[so_features_clusters['cluster'] == cluster_id]
# Train-test split
x_train_cluster, x_test_cluster, y_train_cluster, y_test_cluster = train_test_split(
        cluster_data, cluster_target, test_size=0.2
    )

for num_layer in range(1, 15):
    model = tree.DecisionTreeClassifier(max_depth=num_layer)
    model = model.fit(x_train_cluster, y_train_cluster)

    yp_train = model.predict(x_train_cluster)
    yp_test = model.predict(x_test_cluster)

    # Every score is computed against this block's own split. The positive
    # recall used to be scored against the unrelated globals y_test / y_train
    # left over from earlier code, which made those two columns meaningless.
    test_results.append([
        num_layer,
        accuracy_score(y_test_cluster, yp_test),
        recall_score(y_test_cluster, yp_test, pos_label=0),
        recall_score(y_test_cluster, yp_test, pos_label=1),
    ])
    train_results.append([
        num_layer,
        accuracy_score(y_train_cluster, yp_train),
        recall_score(y_train_cluster, yp_train, pos_label=0),
        recall_score(y_train_cluster, yp_train, pos_label=1),
    ])

result_columns = ["Depth", "Accuracy", "Negative_recall", "Positive_recall"]
test_results = pd.DataFrame(test_results, columns=result_columns)
train_results = pd.DataFrame(train_results, columns=result_columns)

# One figure per metric. Colours are set explicitly so they match the
# "blue"/"red" wording in the axis labels (matplotlib's default second colour
# is orange). Accuracy is an overall score, so its label has no class tag.
metric_labels = [
    ("Accuracy", "Accuracy: Training (blue) and Test(red)"),
    ("Negative_recall", "Recall(Y = 0): Training (blue) and Test(red)"),
    ("Positive_recall", "Recall(Y = 1): Training (blue) and Test(red)"),
]
for metric, y_label in metric_labels:
    plt.plot(train_results["Depth"], train_results[metric], marker='o', color='blue')
    plt.plot(test_results["Depth"], test_results[metric], marker='o', color='red')
    plt.xlabel("Number of layers in decision tree(max_depth)")
    plt.ylabel(y_label)
    plt.show()
# max_depth is set to 4 for every cluster model below.
# Storage for the per-cluster evaluation metrics (one dict per cluster).
cluster_evaluation = []

for cluster_id in so_features_clusters['cluster'].unique():
    # Rows and targets that belong to the current cluster.
    in_cluster = so_features_clusters['cluster'] == cluster_id
    cluster_data = so_features_clusters[in_cluster]
    cluster_target = so_target[in_cluster]

    # Stratified 80:20 train-test split within the cluster.
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        cluster_data, cluster_target, test_size=0.2, stratify=cluster_target
    )

    # Train a random forest on this cluster only.
    cluster_model = RandomForestClassifier(max_depth=4)
    cluster_model.fit(X_train_c, y_train_c)

    # Predict on the cluster's own test split and record the metrics.
    y_pred_c = cluster_model.predict(X_test_c)
    metrics = {
        "Cluster": cluster_id,
        "Accuracy": accuracy_score(y_test_c, y_pred_c),
        "Precision": precision_score(y_test_c, y_pred_c, average='binary'),
        "Recall": recall_score(y_test_c, y_pred_c, average='binary'),
        "F1 Score": f1_score(y_test_c, y_pred_c, average='binary')
    }
    cluster_evaluation.append(metrics)

    # ROC curve for this cluster, built from this iteration's test split.
    # (It used to read the globals x_test / y_test, which belong to earlier
    # code, so every curve was scored on the wrong data.)
    y_pred_prob = cluster_model.predict_proba(X_test_c)[:, 1]
    fpr, tpr, _ = roc_curve(y_test_c, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    # Replaces any value stored for this cluster by earlier code.
    roc_auc_list[cluster_id] = roc_auc

    # All clusters share one set of axes; the figure is shown after the loop.
    plt.plot(fpr, tpr, label=f'Cluster {cluster_id} (AUC = {roc_auc:.2f})')


# Diagonal reference line, then finish and show the combined figure.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves by Cluster(k_means)')
plt.legend(loc='lower right')
plt.grid()
plt.show()


cluster_evaluations = pd.DataFrame(cluster_evaluation)
# Probability model using Random Forest

# Pitch measurements that are averaged over the high-probability outs below.
pitch_index = ['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'release_extension']

# Seeded 80:20 train/test split over the data of all clusters.
x_train_prob, x_test_prob, y_train_prob, y_test_prob = train_test_split(
    so_features_clusters,
    so_target,
    test_size=0.2,
    random_state=42,
)

# Overall random forest with the same max_depth (4) as before.
prob_rf = RandomForestClassifier(max_depth=4, random_state=42)
prob_rf.fit(x_train_prob, y_train_prob)
y_pred_prob = prob_rf.predict(x_test_prob)

# Scores of the overall model on its test split.
accuracy = accuracy_score(y_test_prob, y_pred_prob)
precision = precision_score(y_test_prob, y_pred_prob)
recall = recall_score(y_test_prob, y_pred_prob)
f1 = f1_score(y_test_prob, y_pred_prob)

# Containers for the per-cluster results.
cluster_high_success_summaries = {}
cluster_high_success_rates = {}
cluster_results_accuracy = []

# Repeat the modelling inside each cluster found in the overall test split.
for cluster_id in x_test_prob['cluster'].unique():

    # Test-split rows (and targets) that belong to this cluster.
    in_cluster = x_test_prob['cluster'] == cluster_id
    cluster_data = x_test_prob[in_cluster]
    cluster_target = y_test_prob[in_cluster]

    # Seeded, stratified 80:20 split of the cluster's rows.
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        cluster_data,
        cluster_target,
        test_size=0.2,
        stratify=cluster_target,
        random_state=42,
    )

    # Cluster-specific forest, same depth and seed as the overall model.
    cluster_model = RandomForestClassifier(max_depth=4, random_state=42)
    cluster_model.fit(X_train_c, y_train_c)
    y_pred_c = cluster_model.predict(X_test_c)

    # Scores of the cluster model on the cluster's test rows.
    cluster_accuracy = accuracy_score(y_test_c, y_pred_c)
    cluster_precision = precision_score(y_test_c, y_pred_c)
    cluster_recall = recall_score(y_test_c, y_pred_c)
    cluster_f1 = f1_score(y_test_c, y_pred_c)

    # Attach the true target and the predicted probability of the second
    # class to a copy of the cluster's test rows.
    y_pred_proba_c = cluster_model.predict_proba(X_test_c)[:, 1]
    cluster_test_results = X_test_c.assign(
        outs=y_test_c,
        out_probability=y_pred_proba_c,
    )

    # Keep only the rows whose predicted probability exceeds 0.75. The cut-off
    # can be changed; it was set high because the average OBP (on-base
    # percentage) in 2024 was around 0.312.
    # https://www.mlb.com/glossary/standard-stats/on-base-percentage
    is_high_success = cluster_test_results['out_probability'] > 0.75
    high_success_conditions = cluster_test_results[is_high_success]

    # Mean of the pitch measurements over those high-probability rows.
    high_success_summary = high_success_conditions[pitch_index].mean()

    # Same means, stored per cluster with the columns in this order.
    summary_columns = ['release_spin_rate', 'pfx_x', 'pfx_z', 'release_speed', 'release_extension']
    cluster_high_success_summaries[cluster_id] = high_success_conditions[summary_columns].mean()

    # Record this cluster's scores.
    cluster_scores = {
        'Cluster': cluster_id,
        'Accuracy': cluster_accuracy,
        'Precision': cluster_precision,
        'Recall': cluster_recall,
        'F1 Score': cluster_f1
    }
    cluster_results_accuracy.append(cluster_scores)

# Per-cluster scores as a table.
cluster_accuracy_results = pd.DataFrame(cluster_results_accuracy)


cluster_results = {
    'high_success_summaries': cluster_high_success_summaries
}


