## Question 2

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from utils import styled_print, download_data, read_and_clean_data, \
     plot_scatter_plot

### Section B - Unsupervised Machine Learning

In [3]:
# Location of the processed Cleveland heart-disease file in the UCI repository
# (plain-HTTP URL). It is handed to download_data in the download cell, so that
# cell needs network access to run.
cleveland_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

In [4]:
# Positional index -> column name. The values of this mapping are what gets
# passed as the `header` argument when the data file is read.
headers = dict(enumerate([
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "target",
]))

In [5]:
# Announce the step, download the file behind `cleveland_url` into ./data, then
# read it with the column names taken from `headers`.
styled_print("Heart Disease Data Analysis", header=True)
styled_print(f"Extracting Data From {cleveland_url}")
cleveland_file = download_data(cleveland_url, path_to_download="./data")
column_names = headers.values()
cleveland_df = read_and_clean_data(cleveland_file, header=column_names)

› Heart Disease Data Analysis
    Extracting Data From http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data


ConnectionError: HTTPConnectionPool(host='archive.ics.uci.edu', port=80): Max retries exceeded with url: /ml/machine-learning-databases/heart-disease/processed.cleveland.data (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f9b60f460d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [None]:
# Print a section header, then the frame's column/dtype/non-null summary.
styled_print("Cleveland Dataframe Info", header=True)
cleveland_df.info()

In [None]:
# Column-name groups used to refer to subsets of the data frame.
categorical_columns = [
    "cp",
    "restecg",
    "slope",
    "thal",
    "ca",
]
binary_columns = [
    "sex",
    "fbs",
    "exang",
]

continuous_columns = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]
# Discrete = categorical followed by binary, as a new list.
discrete_columns = [*categorical_columns, *binary_columns]
# Kept as a one-element list; callers index it with [0] to get the name.
target_column = ["target"]

In [None]:
# Work on an independent (deep) copy so `cleveland_df` itself stays untouched,
# then preview the first five rows.
og_data_df = cleveland_df.copy(deep=True)
og_data_df.head(5)

In [None]:
# Remove every row that has at least one missing value. The index is NOT reset
# here, so surviving rows keep their original index labels.
og_data_df = og_data_df.dropna(axis=0, how="any")
# Per-column count of remaining missing values.
og_data_df.isnull().sum()

In [None]:
# Feature frame for clustering: everything except the target column.
data_df = og_data_df.drop(columns=target_column[0])

In [None]:
# Function to create Cluster Map for Better Visualization and Filtering
def create_cluster_map(data, model):
    """Return a copy of ``data`` with a trailing ``ClusterID`` column.

    Labels are attached by position: row ``i`` of ``data`` receives
    ``model.labels_[i]``. The index of ``data`` is preserved, so the result is
    correct even when rows were dropped earlier (e.g. by ``dropna``) and the
    index is no longer ``0..n-1``. Joining the labels with ``pd.concat`` on
    ``axis=1`` would instead align on index labels, which in that situation
    yields NaN-padded rows and labels attached to the wrong observations.

    Parameters
    ----------
    data : pandas.DataFrame
        The rows the model was fitted on (extra columns are allowed).
    model : object
        Fitted clustering model exposing ``labels_`` with one entry per row.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of ``data`` plus ``ClusterID``; ``data``
        itself is not modified.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows in ``data``.
    """
    labels = model.labels_
    if len(labels) != len(data):
        raise ValueError(
            f"model has {len(labels)} labels but data has {len(data)} rows"
        )
    # assign() returns a new frame and places array-like values positionally,
    # keeping data's own index.
    return data.assign(ClusterID=labels)

def get_centroid(data, kmeans):
    """Tabulate the fitted cluster centres, one row per cluster.

    The columns of the returned DataFrame are labelled with ``data.columns``;
    the values come from ``kmeans.cluster_centers_``.
    """
    return pd.DataFrame(data=kmeans.cluster_centers_, columns=data.columns)

In [None]:
# Fit k-means with 5 clusters (random centroid initialisation, fixed seed) on
# the feature frame, then attach the labels to the full data frame.
kmeans = KMeans(n_clusters=5, init='random', n_init='auto', random_state=0)
kmeans.fit(data_df)
cluster_map = create_cluster_map(og_data_df, kmeans)

styled_print("Cluster Map from Kmeans with Random Cluster Initialization", header=True)
print(cluster_map.head(10))

# Centroids are labelled with the feature column names (target excluded).
styled_print("Cluster Centroid from Kmeans with Random Cluster Initialization", header=True)
print(get_centroid(og_data_df.drop(columns=target_column[0]), kmeans).head(10))

In [None]:
# Scatter of the `age` (x) and `chol` (y) columns of `cluster_map`, coloured by
# the ClusterID column. plot_scatter_plot is the helper imported from utils.
plot_scatter_plot(
    df=cluster_map, 
    x="age", 
    y="chol", 
    hue='ClusterID',
    title="Scatter Plot with Random Centroid Initialization", 
    figsize=(4, 4), 
    dpi=300)

In [None]:
# Sweep k = 1..14, refitting k-means each time and recording the fitted
# model's inertia_ (plotted below as "SSE").
sse = {}
for k in range(1, 15):
    kmeans = KMeans(n_clusters=k, init='random', n_init='auto', random_state=0)
    kmeans.fit(data_df)
    sse[k] = kmeans.inertia_

# Elbow plot: SSE against the number of clusters
plt.figure()
plt.plot(list(sse), list(sse.values()), 'bx-')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
# Sweep k = 2..14 (the SSE sweep starts at 1; this one starts at 2), refitting
# k-means each time and scoring the predicted labels with silhouette_score.
sc = {}
for k in range(2, 15):
    kmeans = KMeans(n_clusters=k, init='random', n_init='auto', random_state=0)
    kmeans.fit(data_df)
    labels = kmeans.predict(data_df)
    sc[k] = silhouette_score(data_df, labels)

# Silhouette plot: score against the number of clusters
plt.figure()
plt.plot(list(sc), list(sc.values()), 'bx-')
plt.xlabel("Number of cluster")
plt.ylabel("Silhouette Score")
plt.show()

In [None]:
# Refit k-means with 2 clusters (same initialisation settings and seed) and
# rebuild `cluster_map` from the new labels.
# NOTE(review): presumably k=2 is the value suggested by the plots above - confirm.
kmeans = KMeans(n_clusters=2, init='random', n_init='auto', random_state=0)
kmeans.fit(data_df)
cluster_map = create_cluster_map(og_data_df, kmeans)

styled_print("Cluster Map from Kmeans with Random Cluster Initialization", header=True)
print(cluster_map.head(10))

# Centroids are labelled with the feature column names (target excluded).
styled_print("Cluster Centroid from Kmeans with Random Cluster Initialization", header=True)
print(get_centroid(og_data_df.drop(columns=target_column[0]), kmeans).head(10))

In [None]:
# Scatter of the `chol` (x) and `age` (y) columns of `cluster_map`, coloured by
# the ClusterID column.
# NOTE(review): the axes are swapped relative to the earlier scatter cell
# (x="age", y="chol" there) - confirm this is intentional.
plot_scatter_plot(
    df=cluster_map, 
    x="chol", 
    y="age", 
    hue='ClusterID',
    title="Scatter Plot with Random Centroid Initialization", 
    figsize=(4, 4), 
    dpi=300)