# Exploration des données

In [None]:

import numpy as np #  manip des arrays (tableaux)
import pandas as pd #  manip des dataframes
import matplotlib.pyplot as plt #  construct des graphes : barplot, cammenberts
import seaborn as sns

# Column labels applied to the file, in file order; passing `names` without a
# `header` argument makes pandas treat the first line of the file as data.
COLUMN_NAMES = [
    'age',
    'sex',
    'chest pain',
    'blood pressure(r)',
    'cholestoral',
    'fasting blood sugar',
    'eleccardio results(r)',
    'max heart rate achvd',
    'exercise angina',
    'ST depression (ex to r)',
    'slope exercise ST',
    'number of vessels',
    'thalassemia',
    'diagnosis',
]

df = pd.read_csv('processed.cleveland.csv', names=COLUMN_NAMES)

# Summary statistics; only the last expression of a cell is displayed, so a
# single call is enough.
df.describe()

In [None]:
# Box plots of the columns on one shared axis; the tick labels are rotated so
# the long column names do not overlap.
df.boxplot(rot=70, figsize=(8, 6))

In [None]:
# Pairwise scatter plots of the columns of `df`.
axes = pd.plotting.scatter_matrix(df, figsize=(16, 12), alpha=0.2)

# Turn the axis titles so they stay legible in the dense grid: column names
# vertical along the bottom, row names horizontal and right-aligned on the left.
for subplot in axes.flat:
    bottom_label = subplot.xaxis.label
    left_label = subplot.yaxis.label
    bottom_label.set_rotation(90)
    left_label.set_rotation(0)
    left_label.set_ha('right')

plt.tight_layout()
# Remove the gaps between the panels once the layout has been computed.
plt.subplots_adjust(wspace=0, hspace=0)
plt.show()

In [None]:
# Pairwise correlation of the numeric columns.
# At this point `df` still holds string-typed columns (the ones containing
# "?"), and recent pandas versions raise on them instead of silently skipping
# them, so the numeric columns are selected explicitly.
df_corr = df.select_dtypes(include='number').corr()
df_corr

In [None]:
# Diverging colour map running between hue 5 and hue 250, used to shade the
# correlation table.
cmap = sns.diverging_palette(h_neg=5, h_pos=250, as_cmap=True)

def magnify():
    """Return the CSS rules that shrink a styled table and enlarge cells on hover.

    Returns:
        A list of ``{"selector": ..., "props": [...]}`` dictionaries, in the
        format accepted by ``Styler.set_table_styles``.
    """
    def rule(selector, *props):
        # One table-style entry: a CSS selector and its (property, value) pairs.
        return {"selector": selector, "props": list(props)}

    return [
        rule("th", ("font-size", "7pt")),
        rule("td", ('padding', "0em 0em")),
        rule("th:hover", ("font-size", "12pt")),
        rule("tr:hover td:hover", ('max-width', '200px'), ('font-size', '12pt')),
    ]
        
# Render the correlation matrix as a colour-shaded table whose cells grow on
# hover.
# - 'max-width' replaces a property name that is not valid CSS.
# - .format("{:.2f}") shows two decimals; Styler.set_precision no longer
#   exists in recent pandas versions.
(
    df_corr.style
    .background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})
    .set_caption("Hover to magnify")
    .format("{:.2f}")
    .set_table_styles(magnify())
)



In [None]:
# Age plotted against the maximum heart rate achieved, with a fitted linear
# regression line.
sns.regplot(data=df, x="max heart rate achvd", y="age", fit_reg=True)
plt.show()

In [None]:
# Chest pain plotted against exercise angina, with a fitted linear regression
# line.
sns.regplot(data=df, x="exercise angina", y="chest pain", fit_reg=True)
plt.show()

In [None]:
# Diagnosis plotted against cholesterol, with a fitted linear regression line.
sns.regplot(data=df, x="cholestoral", y="diagnosis", fit_reg=True)
plt.show()

In [None]:
# Cholesterol plotted against resting blood pressure, with a fitted linear
# regression line.
sns.regplot(data=df, x="blood pressure(r)", y="cholestoral", fit_reg=True)
plt.show()


In [None]:
# Number of rows per age value, drawn as a bar chart.
graph = df.groupby('age')['age'].count()
graph.plot.bar(figsize=(16, 10))

In [None]:
# Number of rows per value of the `sex` column, drawn as a bar chart.
graph = df.groupby('sex')['sex'].count()
graph.plot.bar(figsize=(16, 10))

In [None]:
# Drop the rows whose value is missing in either column. Missing values are
# recorded as "?" in the file, so the marker is matched as a literal string
# (regex=False) rather than through an escaped regular expression.
for column in ("number of vessels", "thalassemia"):
    # na=True: an entry that is not a string at all is also treated as missing.
    is_missing = df[column].str.contains("?", regex=False, na=True)
    df = df[~is_missing]

df.describe()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column with a min-max scaler.
# NOTE(review): the scaler is fitted on the whole data set, target column
# ('diagnosis') included, before the train/test split below; confirm this is
# intended.
min_max = MinMaxScaler()

# fit_transform returns a bare array: rebuild the frame with the original
# column labels and row index so the plot and later selections stay readable.
df_minmax = pd.DataFrame(
    min_max.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
df_minmax.boxplot(rot=45, figsize=(16, 10))

In [None]:
from sklearn.preprocessing import scale

# Standardise every column with sklearn's `scale`, then draw the box plots of
# the result for comparison with the min-max version.
df_scale = scale(df)
pd.DataFrame(df_scale).boxplot(figsize=(16, 10), rot=45)

# Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Features are the first 13 columns, the target is the last one.
X = df_minmax.iloc[:, :13]
y = df_minmax.iloc[:, -1]

# Hold out a quarter of the rows, keeping the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/4, random_state=42, stratify=y
)

# The target was rescaled together with the features, so it is mapped back to
# integer class codes here.
# FIXME : Restore initial classes
# The encoder is fitted once, on all labels, so that the training and test
# targets share one mapping even if a class is absent from one of the splits.
le = LabelEncoder().fit(y)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Entraîner le modèle

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline model: each sample is classified from its single nearest neighbour.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Display the distance metric the fitted model actually uses.
knn.effective_metric_

# Prédire

In [None]:
# Predict a class for every held-out test sample with the fitted model.
y_pred = knn.predict(X_test)
 

# Evaluer

In [None]:
from sklearn import metrics

def evaluate_this(y_test,y_pred):
    """Print the accuracy, then the weighted F1 score, of a set of predictions.

    Args:
        y_test: the true labels.
        y_pred: the predicted labels, in the same order as ``y_test``.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(accuracy)

    weighted_f1 = metrics.f1_score(y_test, y_pred, average='weighted')
    print(weighted_f1)

# Score the predictions against the true test labels.
evaluate_this(y_test,y_pred)

# Try to guess better K

## k-fold cross validation

In [None]:
# https://towardsdatascience.com/how-to-find-the-optimal-value-of-k-in-knn-35d936e554eb pour l'approche basique
# https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/ pour le cross validation

from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes: the odd values from 1 to 49.
neighbors = list(range(1, 50, 2))

cv_scores = []
error_rate = []

for k in neighbors:
    # A loop-local estimator, so the `knn` / `y_pred` objects from the earlier
    # cells are not overwritten by the last candidate.
    candidate = KNeighborsClassifier(n_neighbors=k)

    # Mean accuracy over a 10-fold cross-validation on the training set.
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(fold_scores.mean())

    # Misclassification rate on the held-out test set.
    # NOTE(review): choosing K from this curve would tune the model on the
    # test data; the cross-validation curve is the safer criterion.
    candidate.fit(X_train, y_train)
    error_rate.append(np.mean(candidate.predict(X_test) != y_test))

# changing CV accuracy => to CV error
cv_errors = [1 - score for score in cv_scores]

# Best K for both methods (the first minimum wins on ties).
cv_optimal_k = neighbors[int(np.argmin(cv_errors))]
error_optimal_k = neighbors[int(np.argmin(error_rate))]

plt.figure(figsize=(10, 6))
plt.plot(neighbors, cv_errors, color='green', linestyle='dashed',
         marker='o', markersize=6, label='CV error')
plt.plot(neighbors, error_rate, color='grey', linestyle='dashed',
         marker='o', markersize=6, label='Test error')

plt.title('CV errors AND Errors VS K')
plt.xlabel("K")
plt.ylabel("Error rate")
plt.legend()  # the two curves are otherwise only distinguishable by colour
plt.show()

# The error rates are fractions in [0, 1]; convert them before printing "%".
print(f"Minimum CV error: {min(cv_errors) * 100:.2f} % at K = {cv_optimal_k}")
print(f"Minimum error: {min(error_rate) * 100:.2f} % at K = {error_optimal_k}")


### Best K is 15 !

> 5 seems good, but it is probably overfitting

In [None]:
# One row per candidate K with both error estimates. The table gets its own
# name so the data set held in `df` is not overwritten and earlier cells can
# still be re-run.
k_errors = pd.DataFrame(
    {"neighbors": neighbors, "CV errors": cv_errors, "Errors": error_rate},
    index=neighbors,
)

# Lowest cross-validation error first; on ties the larger K comes first.
k_errors = k_errors.sort_values(['CV errors', 'neighbors'], ascending=[True, False])
k_errors.head(15)

# Rerun model with new K

In [None]:
# Number of neighbours used for the final model.
K = 15

# Refit on the training set with that K, then score the test-set predictions.
knn = KNeighborsClassifier(n_neighbors=K).fit(X_train, y_train)
y_pred = knn.predict(X_test)

evaluate_this(y_test, y_pred)