# Exploration des données

In [None]:

import numpy as np #  manip des arrays (tableaux)
import pandas as pd #  manip des dataframes
import matplotlib.pyplot as plt #  construct des graphes : barplot, cammenberts
import seaborn as sns

# Column labels supplied to read_csv through `names=`; the last one,
# 'diagnosis', is the column used as the target later in the notebook.
COLUMN_NAMES = [
    'age', 'sex', 'chest pain', 'blood pressure(r)', 'cholestoral',
    'fasting blood sugar', 'eleccardio results(r)', 'max heart rate achvd',
    'exercise angina', 'ST depression (ex to r)', 'slope exercise ST',
    'number of vessels', 'thalassemia', 'diagnosis',
]

df = pd.read_csv('processed.cleveland.csv', names=COLUMN_NAMES)

# Independent copy of the raw frame, reused by the one-hot-encoding section.
# A real copy (rather than an alias) keeps it safe from any in-place change
# made to `df`.
df1 = df.copy()

# Keep the summary as the last expression of the cell so it is displayed.
df.describe()

In [None]:
# Box plot of every numeric column on a shared axis, with tilted tick labels.
df.boxplot(rot=70, figsize=(8, 6))

In [None]:
# Pairwise scatter plots of the columns of df.
axes = pd.plotting.scatter_matrix(df, figsize=(16, 12), alpha=0.2)

# Turn the axis labels so the long column names stay readable:
# vertical on the x axis, horizontal and right-aligned on the y axis.
for ax in axes.ravel():
    ax.xaxis.label.set(rotation=90)
    ax.yaxis.label.set(rotation=0, horizontalalignment='right')

plt.tight_layout()
# Remove the gaps between the individual panels.
plt.subplots_adjust(wspace=0, hspace=0)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only.
# "number of vessels" and "thalassemia" are handled as strings further down
# (they are filtered with .str.contains), and recent pandas versions raise
# when corr() meets non-numeric columns, so they are excluded explicitly.
df_corr = df.select_dtypes(include='number').corr()
df_corr

In [None]:
# Diverging colour map used to shade the correlation table.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    """Return the table styles that enlarge a cell while it is hovered.

    Returns:
        list[dict]: one ``{"selector": ..., "props": [...]}`` entry per CSS
        rule, in the form accepted by ``Styler.set_table_styles``.
    """
    # (CSS selector, list of (property, value) pairs) for each rule.
    css_rules = (
        ("th", [("font-size", "7pt")]),
        ("td", [('padding', "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [('max-width', '200px'), ('font-size', '12pt')]),
    )
    return [{"selector": selector, "props": props} for selector, props in css_rules]
        
# Render the correlation matrix as a colour-shaded table whose cells grow
# when hovered (see magnify()).
# - 'max-width' replaces the placeholder 'hello', which is not a CSS property.
# - format("{:.2f}") replaces Styler.set_precision(2), which no longer exists
#   in recent pandas versions; it shows the same two decimals.
(
    df_corr.style
    .background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})
    .set_caption("Hover to magnify")
    .format("{:.2f}")
    .set_table_styles(magnify())
)



In [None]:
# Scatter plot of age against maximum heart rate achieved, with a fitted
# linear regression line.
sns.regplot(data=df, x="max heart rate achvd", y="age", fit_reg=True)
plt.show()

In [None]:
# Scatter plot of chest pain against exercise angina, with a fitted linear
# regression line.
sns.regplot(data=df, x="exercise angina", y="chest pain", fit_reg=True)
plt.show()

In [None]:
# Scatter plot of diagnosis against cholesterol, with a fitted linear
# regression line.
sns.regplot(data=df, x="cholestoral", y="diagnosis", fit_reg=True)
plt.show()

In [None]:
# Scatter plot of cholesterol against blood pressure, with a fitted linear
# regression line.
sns.regplot(data=df, x="blood pressure(r)", y="cholestoral", fit_reg=True)
plt.show()


In [None]:
# Number of rows per distinct age, drawn as a bar chart.
graph = df.groupby('age')['age'].count()
graph.plot.bar(figsize=(16, 10))

In [None]:
# Number of rows per value of the 'sex' column, drawn as a bar chart.
graph = df.groupby('sex')['sex'].count()
graph.plot.bar(figsize=(16, 10))

In [None]:
# Remove the rows with missing values: in "number of vessels" and
# "thalassemia" a missing value is written as the literal "?".
# - regex=False matches the "?" literally (the previous "\?" pattern relied on
#   an invalid string escape).
# - na=True also drops rows where the cell is NaN, as the previous
#   `== False` comparison did.
has_missing = (
    df["number of vessels"].str.contains("?", regex=False, na=True)
    | df["thalassemia"].str.contains("?", regex=False, na=True)
)

# Once the "?" rows are gone, store both columns as floats so that
# describe() and the scalers treat them like the other numeric columns.
# NOTE(review): assumes every remaining value is a numeric string — confirm.
df = df[~has_missing].astype({"number of vessels": float, "thalassemia": float})

df.describe()


In [None]:
from sklearn.preprocessing import MinMaxScaler

min_max = MinMaxScaler()

# fit_transform returns a bare array, so the frame is rebuilt with the
# original column labels; otherwise the boxplot is labelled 0..13.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split and including the 'diagnosis' target column.
df_minmax = pd.DataFrame(min_max.fit_transform(df), columns=df.columns)
df_minmax.boxplot(rot = 45,figsize=(16, 10))

In [None]:
from sklearn.preprocessing import scale

# Standardise every column and keep the original column labels so the
# boxplot axis is readable. The frame is built once and then plotted.
# NOTE(review): like the min-max version, this is computed on the whole frame
# before the train/test split.
df_scale = pd.DataFrame(scale(df), columns=df.columns)
df_scale.boxplot(rot = 45,figsize=(16, 10))

# Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# First 13 columns are the features, the last column is the target.
X = df_minmax.iloc[:,0:13]
y = df_minmax.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=42, stratify=y)

# The target column went through the min-max scaler together with the
# features, so it holds floats; map it back to integer class indices.
# The encoder is fitted ONCE on the full target so that the train and test
# labels share the same mapping. Fitting it separately on each split gives
# inconsistent labels as soon as one split misses a class.
# NOTE(review): the indices equal the initial classes only if those classes
# were 0..n-1 — confirm against the raw 'diagnosis' values.
le = LabelEncoder().fit(y)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Features come from the standardised frame; the target is taken from the
# unscaled frame so the class labels keep their original values.
X_scale = df_scale.iloc[:,0:13]
y_scale = df.iloc[:,-1]

# Stratify on the target that is actually being split (y_scale), not on the
# `y` variable left over from another cell.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_scale, y_scale, test_size=1/4, random_state=42, stratify=y_scale)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 15-nearest-neighbours classifier trained on the split built from df_scale.
# fit() returns the estimator itself, so construction and fitting are chained.
knn1 = KNeighborsClassifier(n_neighbors=15).fit(X_train1, y_train1)

# Show which distance metric the fitted model ended up using.
knn1.effective_metric_

## Evaluer avec scale

In [None]:
from sklearn import metrics


def evaluate_this(y_test1,y_pred1):
    """Print the accuracy, then the weighted F1 score, of the predictions.

    Args:
        y_test1: true labels.
        y_pred1: predicted labels, aligned with ``y_test1``.
    """
    accuracy = metrics.accuracy_score(y_test1, y_pred1)
    print(accuracy)
    weighted_f1 = metrics.f1_score(y_test1, y_pred1, average='weighted')
    print(weighted_f1)


# y_pred1, X_test1 and y_test1 all belong to the split built from the
# scale()-standardised frame.
y_pred1 = knn1.predict(X_test1)
evaluate_this(y_test1,y_pred1)

# Entraîner le modèle

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 1-nearest-neighbour classifier trained on the min-max split.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Show which distance metric the fitted model ended up using.
knn.effective_metric_

# Prédire

In [None]:
# Predict the test-set labels with the classifier fitted in the training cell.
y_pred = knn.predict(X_test)

# Evaluer

In [None]:
from sklearn import metrics


# NOTE: a function with this name is also defined in the "Evaluer avec scale"
# cell; this definition replaces it from here on.
def evaluate_this(y_test,y_pred):
    """Print two quality measures of ``y_pred`` against ``y_test``.

    The accuracy is printed first, the weighted-average F1 score second.
    """
    print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
    print(metrics.f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))


evaluate_this(y_test,y_pred)

# Try to guess better K

## k-fold cross validation

In [None]:
# Basic approach: https://towardsdatascience.com/how-to-find-the-optimal-value-of-k-in-knn-35d936e554eb
# Cross-validation approach: https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/

from sklearn.model_selection import cross_val_score

# Candidate values of K: the odd numbers from 1 to 49.
neighbors = list(range(1, 50, 2))

cv_scores = []    # mean 10-fold CV accuracy on the training set, per K
error_rate = []   # misclassification rate on the held-out test set, per K

for k in neighbors:
    # Dedicated names are used inside the loop so that the `knn` / `y_pred`
    # variables of the other cells are not overwritten by the sweep.
    candidate = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(candidate, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

    candidate.fit(X_train, y_train)
    error_rate.append(np.mean(candidate.predict(X_test) != y_test))

# Turn the CV accuracies into CV errors.
cv_errors = [1 - accuracy for accuracy in cv_scores]

# Best K for each method (the first K reaching the minimum error).
cv_optimal_k = neighbors[int(np.argmin(cv_errors))]
error_optimal_k = neighbors[int(np.argmin(error_rate))]

plt.figure(figsize=(10,6))
plt.plot(neighbors, cv_errors, color='green', linestyle='dashed',
         marker='o', markersize=6, label='CV error (training set)')
plt.plot(neighbors, error_rate, color='grey', linestyle='dashed',
         marker='o', markersize=6, label='Error (test set)')

plt.title('CV errors AND Errors VS K')
plt.xlabel("K")
plt.ylabel("Error rate")
# Without a legend the two curves cannot be told apart.
plt.legend()

# The errors are fractions in [0, 1]; format them as real percentages
# instead of printing the raw fraction followed by a "%" sign.
print("Minimum CV error : {:.2%} at K = {}".format(min(cv_errors), cv_optimal_k))
print("Minimum error: {:.2%} at K = {}".format(min(error_rate), error_optimal_k))


### Best K is 15 !

> K = 5 seems good, but it is probably overfitting

In [None]:
# Error table for every candidate K, best cross-validation error first
# (ties are broken in favour of the larger K).
# It is stored under its own name: assigning it to `df` would overwrite the
# dataset that the earlier cells read from.
k_errors = (
    pd.DataFrame(
        {"neighbors" : neighbors, "CV errors" : cv_errors, "Errors" : error_rate},
        index=neighbors,
    )
    .sort_values(['CV errors', 'neighbors'], ascending=[True, False])
)
k_errors.head(15)

# Rerun model with new K

In [None]:
# Number of neighbours retained after the search above.
K = 15

# Refit on the training split with the chosen K, then score the test split.
knn = KNeighborsClassifier(n_neighbors=K).fit(X_train, y_train)
y_pred = knn.predict(X_test)

evaluate_this(y_test, y_pred)

In [None]:
# The classifier class is imported directly: `from sklearn import neighbors`
# would rebind the name `neighbors`, which the K-search cells use for their
# list of candidate K values.
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter values to try.
param_grid = {'n_neighbors':[3, 5, 7, 9, 11, 13, 15]}

# Score to optimise: accuracy (proportion of correct predictions).
score = 'accuracy'

# kNN classifier whose hyper-parameter is chosen by cross-validation.
clf = GridSearchCV(
    KNeighborsClassifier(),  # the kNN classifier
    param_grid,              # hyper-parameters to try
    cv=5,                    # number of cross-validation folds
    scoring=score            # score to optimise
)

# Run the search on the training set.
clf.fit(X_train, y_train)

# Show the best hyper-parameter(s) found.
print("Meilleur(s) hyperparamètre(s) sur le jeu d'entraînement:")
print(clf.best_params_)

In [None]:
# Print the cross-validated score of every candidate of the grid search:
# mean score, twice its standard deviation, and the parameter value.
print("Résultats de la validation croisée :")

cv_results = clf.cv_results_
line_template = "{} = {:.3f} (+/-{:.03f}) for {}"

for mean, std, params in zip(cv_results['mean_test_score'],
                             cv_results['std_test_score'],
                             cv_results['params']):
    print(line_template.format(score, mean, std*2, params))

# La méthode OneHotEncoder

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned on the training split only and apply
# the same transformation to the test split. Calling scale() on each split
# separately would standardise the test set with its own mean and variance,
# putting the two splits on different scales.
scaler = StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense (non-sparse) output. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn; try the new name first and fall back to
# the old one on versions that do not know it.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Drop the rows where "number of vessels" or "thalassemia" holds the
# missing-value marker "?". regex=False matches the "?" literally (the
# previous "\?" pattern relied on an invalid string escape); na=True also
# drops NaN cells, as the previous `== False` comparison did.
rows_with_missing = (
    df1["number of vessels"].str.contains("?", regex=False, na=True)
    | df1["thalassemia"].str.contains("?", regex=False, na=True)
)
df1 = df1[~rows_with_missing]

# First 13 columns are the features, the last column is the target.
X_a = df1.iloc[:,0:13]
y_a = df1.iloc[:,-1]

# Stratify on the target that is actually being split (y_a), not on the `y`
# variable left over from another cell.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X_a, y_a, test_size=1/4, random_state=42, stratify=y_a)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature columns to one-hot encode.
columns=['age','sex','chest pain','blood pressure(r)','cholestoral',
         'fasting blood sugar','eleccardio results(r)','max heart rate achvd','exercise angina',
         'ST depression (ex to r)','slope exercise ST','number of vessels','thalassemia']

# The encoded frames of every column are collected and concatenated once at
# the end. Rebuilding X_train_1 / X_test_1 from X_train_a / X_test_a inside
# the loop kept only the dummies of the LAST column.
train_parts = [X_train_a]
test_parts = [X_test_a]

for col in columns:
    # Fit on the train and test values together so that every category seen
    # in either split gets its own dummy column.
    data = pd.concat([X_train_a[[col]], X_test_a[[col]]])
    enc.fit(data)

    # The encoder emits its columns in the order of enc.categories_, so the
    # names are derived from it (value_counts() orders by frequency, which
    # mislabels the columns).
    dummy_names = [col + "_" + str(category) for category in enc.categories_[0]]

    # Reuse the index of the source frame so the side-by-side concatenation
    # below aligns row by row.
    train_parts.append(
        pd.DataFrame(enc.transform(X_train_a[[col]]), columns=dummy_names, index=X_train_a.index)
    )
    test_parts.append(
        pd.DataFrame(enc.transform(X_test_a[[col]]), columns=dummy_names, index=X_test_a.index)
    )

# Original features followed by all the dummy columns, as floats.
# NOTE(review): the cast assumes the remaining values of "number of vessels"
# and "thalassemia" are numeric strings — confirm.
X_train_1 = pd.concat(train_parts, axis=1).astype(float)
X_test_1 = pd.concat(test_parts, axis=1).astype(float)

# Standardise with statistics learned on the training split only, then apply
# the same transformation to the test split.
scaler = StandardScaler().fit(X_train_1)
X_train_scale = scaler.transform(X_train_1)
X_test_scale = scaler.transform(X_test_1)

# Fit an L2-regularised logistic regression for C = 0.01, 0.02, ..., 0.99 and
# print the test accuracy of each model.
# - The loop uses its own variable name; naming it `y` would overwrite the
#   label vector used by the earlier cells.
# - The labels are y_train_a / y_test_a, i.e. the ones produced by the same
#   split as X_train_a / X_test_a.
for step in range(1,100):
     c_value = step * 0.01
     log=LogisticRegression(penalty='l2',C=c_value)
     log.fit(X_train_scale,y_train_a)
     print(accuracy_score(y_test_a,log.predict(X_test_scale)))