<a href="https://colab.research.google.com/github/unmtransinfo/ISBDSCourse/blob/main/python/ISBDS_ML_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center>Independent Study in Biomedical Data Science (ISBDS), UNM BIOMED 505</center>

# Tutorial: Supervised machine learning with Python, Pandas, Matplotlib and Scikit-Learn
  * Datasource: [UC Irvine ML Archive](https://archive.ics.uci.edu/)
  * Dataset: [Oxford Parkinson's Disease Detection Dataset](https://archive.ics.uci.edu/ml/datasets/Parkinsons)
  * Algorithms: Naïve Bayes and Neural Networks

In [None]:
import sys,os,re
import urllib.request
import pandas as pd
import numpy as np
import sklearn
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.neural_network
from sklearn.cluster import AgglomerativeClustering # hierarchical, Ward's
from scipy.cluster.hierarchy import dendrogram
import matplotlib, matplotlib.pyplot as plt

In [None]:
# Record the library versions this run used, so results can be tied to an environment.
print(
  f"Pandas {pd.__version__}; "
  f"Scikit-learn {sklearn.__version__}; "
  f"Matplotlib {matplotlib.__version__}"
)

## Function for model performance evaluation:

In [None]:
def print_score(Ytrue, Ypred):
  """Print precision, recall and F1 for a set of predictions on one line.

  Args:
    Ytrue: true class labels.
    Ypred: predicted class labels, aligned with ``Ytrue``.

  Both arguments are passed unchanged to scikit-learn's ``precision_score``,
  ``recall_score`` and ``f1_score`` (called with their default settings).
  Nothing is returned; the three values are printed with two decimals.
  """
  metrics = sklearn.metrics
  # Evaluate the scorers in a fixed order: precision, recall, F1.
  precision, recall, f1 = (
    scorer(Ytrue, Ypred)
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
  )
  print(f"precision: {precision:.2f} ; recall: {recall:.2f} ; F1: {f1:.2f}")

### Plot dendrogram function from [Scikit-learn docs](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py)

In [None]:
def plot_dendrogram(model, **kwargs):
  """Draw the dendrogram of a fitted hierarchical clustering model.

  Builds a SciPy-style linkage matrix with one row per merge and the columns
  (child a, child b, merge distance, number of samples under the new node),
  then hands it to ``scipy.cluster.hierarchy.dendrogram``.

  Args:
    model: fitted clustering object exposing ``children_``, ``distances_``
      and ``labels_``.
    **kwargs: forwarded unchanged to ``dendrogram``.
  """
  merges = model.children_
  n_leaves = len(model.labels_)
  subtree_sizes = np.zeros(merges.shape[0])
  for row, pair in enumerate(merges):
    # A child id below n_leaves is an original sample (size 1); any other id
    # refers to the node created by merge number (id - n_leaves).
    subtree_sizes[row] = sum(
      1 if node < n_leaves else subtree_sizes[node - n_leaves]
      for node in pair
    )
  linkage_matrix = np.column_stack([merges, model.distances_, subtree_sizes]).astype(float)
  dendrogram(linkage_matrix, **kwargs)

## Read dataset: Oxford Parkinson's Disease Detection Dataset

In [None]:
# Load the comma-separated Parkinson's table straight from the UCI archive into a DataFrame.
pd_df = pd.read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data",
  sep=",",
)

In [None]:
# Report the table dimensions, then preview the first rows as the cell output.
nrows, ncols = pd_df.shape[0], pd_df.shape[1]
print(f"dataset ncols: {ncols} ; nrows: {nrows}:")
pd_df.head()

## Read metadata


In [None]:
# Download the dataset description file. The `with` block closes the HTTP
# response as soon as the body has been read, instead of leaving the
# connection open for the lifetime of the kernel.
with urllib.request.urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.names") as response:
  metadata = response.read()  # raw bytes, as before
print(metadata.decode('utf-8').strip())

## Histogram of Jitter(Abs)

In [None]:
# Histogram of the MDVP:Jitter(Abs) column using 50 bins.
tag = "MDVP:Jitter(Abs)"
n, bins, patches = plt.hist(
  pd_df[tag],
  bins=50,
  facecolor='#8888FF',
  alpha=0.75,
)
plt.title(f'Histogram of {tag}')
plt.xlabel(tag)
plt.grid(True)
plt.show()

## Separate input variables and class labels into X & Y arrays.

In [None]:
# Class labels: the "status" column.
Y = pd_df["status"]
# Input variables: every column except the record name and the label.
# Filtering the column index (instead of a set difference) keeps the original
# column order, so X is identical on every run; set ordering of strings can
# change between kernel sessions.
datacols = [col for col in pd_df.columns if col not in ("name", "status")]
X = pd_df[datacols]

## Cluster using hierarchical Ward's algorithm

Clustering is a form of unsupervised learning: it groups samples by similarity without using the class labels.

In [None]:
# Fit the clustering on the input variables only (no labels are used).
# n_clusters=None with distance_threshold=0 and compute_full_tree=True asks for
# the whole merge tree; the dendrogram helper reads clus.children_ and
# clus.distances_ from the fitted model.
clus = AgglomerativeClustering(
  n_clusters=None,
  distance_threshold=0,
  compute_full_tree=True,
).fit(X)
print(f"N: {X.shape[0]}; n_clusters: {clus.n_clusters_}; n_leaves: {clus.n_leaves_}; n_connected_components: {clus.n_connected_components_}")

### Plot the top three levels of the dendrogram

In [None]:
# Draw the clustering tree, truncated by level (truncate_mode='level', p=3).
plt.title('Hierarchical Clustering Dendrogram')
plot_dendrogram(clus, p=3, truncate_mode='level')
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

## Create train/test split for supervised ML.

In [None]:
# Hold out 25% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All always produces the same split (and therefore comparable
# metrics); change the value to draw a different split.
Xtrain,Xtest,Ytrain,Ytest = sklearn.model_selection.train_test_split(X, Y, test_size=.25, random_state=42)
print(f"training set: {Xtrain.shape[0]} ; test set: {Xtest.shape[0]}")

## Naïve Bayes

In [None]:
# Fit Gaussian Naïve Bayes on the training split and predict the held-out test split.
nb = sklearn.naive_bayes.GaussianNB()
nb.fit(Xtrain, Ytrain)
Y_pred = nb.predict(Xtest)
# Confusion matrix: rows are true classes, columns are predicted classes;
# for two classes ravel() yields TN, FP, FN, TP in that order.
conmat = sklearn.metrics.confusion_matrix(Ytest, Y_pred)
tn, fp, fn, tp = conmat.ravel()
print(f"TP: {tp} ; TN: {tn} ; FP: {fp} ; FN: {fn}")
print_score(Ytest, Y_pred)
# Labelled view of the same matrix, shown as the cell output.
pd.DataFrame(conmat, columns=["Predicted_Negative", "Predicted_Positive"], index=["Negative", "Positive"])

## Neural Network

In [None]:
# Fit a multi-layer perceptron on the training split and predict the held-out test split.
# random_state pins the network's random initialisation so repeated runs on the
# same split give the same model.
nn = sklearn.neural_network.MLPClassifier(random_state=42)
nn.fit(Xtrain, Ytrain)
y_pred = nn.predict(Xtest)
# Evaluate the network's own predictions (y_pred). Y_pred holds the Naïve Bayes
# predictions from the previous section and must not be used here.
conmat = sklearn.metrics.confusion_matrix(Ytest, y_pred)
tn, fp, fn, tp = conmat.ravel()
print(f"TP: {tp} ; TN: {tn} ; FP: {fp} ; FN: {fn}")
print_score(Ytest, y_pred)
# Labelled view of the same matrix, shown as the cell output.
pd.DataFrame(conmat, columns=["Predicted_Negative", "Predicted_Positive"], index=["Negative", "Positive"])