In [None]:
import numpy as np 
import pandas as pd
import os
# Walk the Kaggle input directory tree and print the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, RocCurveDisplay
# Load the heart-disease CSV into a DataFrame.
df = pd.read_csv("../input/heart-disease-dataset/heart.csv")
# Bare expressions such as the two below are notebook display calls; they have
# no visible effect when this file is executed as a plain script.
df.head()
df.shape
# Bar chart of how many rows carry each `target` value.
df['target'].value_counts().plot(kind="bar", color=["green", "blue"])
# Column dtypes / non-null counts, followed by per-column missing-value totals.
df.info()
df.isna().sum()
# Row counts per `sex` value, then the target-by-sex contingency table.
df['sex'].value_counts()
pd.crosstab(df.target, df.sex)
# Same contingency table as a grouped bar chart: one group per `target` value,
# one bar per `sex` value.
pd.crosstab(df.target, df.sex).plot(kind="bar")
plt.title("Heart disease frequency for sex")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
# NOTE(review): the legend labels assume sex is encoded 0 = female, 1 = male —
# confirm against the dataset description.
plt.legend(["Female", "Male"])
plt.xticks(rotation = 0)
# Scatter plot of age against thalach (labelled as heart rate below), drawn
# separately for target == 1 (red) and target == 0 (green).
plt.figure(figsize=(10,6))
plt.scatter(df.age[df.target==1], df.thalach[df.target == 1], c="red")
plt.scatter(df.age[df.target==0], df.thalach[df.target == 0], c="green")

plt.title("Heart Disease in function of Age and Max Heart Rate")
plt.xlabel("Age")
plt.ylabel("Heart Rate")
# Legend entries follow the order of the scatter calls above:
# target == 1 first, then target == 0.
plt.legend(["Disease", "No Disease"])
# Ages of the target == 1 rows and of the target == 0 rows; the results are
# only displayed (in a notebook), not stored.
df["age"][df["target"] == 1]
df["age"][df["target"] == 0]
# Histogram of the age column.
# NOTE(review): Series.plot may reuse the current axes when a figure is already
# open, so outside a one-figure-per-cell notebook this histogram could be drawn
# onto the scatter figure above — confirm.
df.age.plot.hist();
# Grouped bar chart of target counts for each chest-pain type (`cp`).
pd.crosstab(df.cp, df.target).plot(kind="bar")

plt.title("Heart Disease frequency per chest pain type")
plt.xlabel("Chest Pain Type")
plt.ylabel("Amount")
# NOTE(review): the legend labels assume the crosstab columns are ordered
# target 0 then target 1 — confirm.
plt.legend(["No Disease", "Disease"])
plt.xticks(rotation=0)
# Pairwise correlation between the DataFrame's columns, rendered as an
# annotated heatmap on a 15x10 figure.
corr_matrix = df.corr()
plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, linewidths=0.5, fmt=".2f", cmap="YlGnBu");
# Feature matrix: every column of df except the label column.
X = df.drop(columns="target")
# Label vector: the `target` column on its own.
y = df.target
X
y
# Baseline estimators keyed by display name, all built with library defaults.
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
}
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit every model on the training data and score it on the test data.
    models: a dict mapping a display name to a scikit-learn style estimator
    X_train: training data (no labels)
    X_test: testing data (no labels)
    y_train: training labels
    y_test: testing labels
    Returns a dict mapping each name to the value of the model's score().
    """
    # Seed NumPy's global RNG once, before any model is fitted.
    np.random.seed(42)

    def _evaluate(estimator):
        # Fit first, then score; the order matters because score() needs a fitted model.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _evaluate(estimator) for label, estimator in models.items()}
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every score computed from it) reproducible; previously the
# split ran before any seed was set, so it changed on every execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit each baseline model on the training split and collect its test-set score.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores
# One-row frame (index "accuracy"), transposed so each model becomes one bar.
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.T.plot.bar()
# Train/test scores of KNN for every n_neighbors value from 1 to 20.
train_scores = []
test_scores = []

neighbors = range(1, 21)
knn = KNeighborsClassifier()

for i in neighbors:
    # Reuse one estimator and only change the neighbour count between fits.
    knn.set_params(n_neighbors=i)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

# Plot only after the loop has finished: both score lists must have one entry
# per neighbour count, otherwise x and y lengths differ and plt.plot raises.
plt.plot(neighbors, train_scores, label='Train Scores')
plt.plot(neighbors, test_scores, label="Test Scores")
plt.xlabel("number of neighbors")
plt.ylabel("model score")
# The line labels above are only shown once a legend is requested.
plt.legend()

print(f"Maximum KNN Score on test data: {max(test_scores)*100:.2f}%")
# Candidate hyperparameters for LogisticRegression: 20 log-spaced values of C
# between 1e-4 and 1e4, always with the liblinear solver.
log_reg_grid = {"C": np.logspace(-4, 4, 20), "solver": ["liblinear"]}

# Seed NumPy's global RNG right before the search is built and fitted.
np.random.seed(42)
rs_log_reg = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=log_reg_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)
rs_log_reg.fit(X_train, y_train)
rs_log_reg.score(X_test, y_test)

# Candidate hyperparameters for RandomForestClassifier.
rf_grid = {
    "n_estimators": np.arange(10, 1000, 50),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2),
}
rf_grid

# Same seed again so the forest search starts from the same RNG state.
np.random.seed(42)
rs_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)
rs_rf.fit(X_train, y_train)
rs_rf.score(X_test, y_test)
rs_rf.best_params_
# Finer grid for the exhaustive search: 30 log-spaced values of C between
# 1e-4 and 1e4, liblinear solver only.
log_reg_grid = {
    "C": np.logspace(-4, 4, 30),
    "solver": ["liblinear"]
}

gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)

# Tune on the training split only. Fitting on X_test / y_test would leak the
# held-out data into model selection and make every test metric computed
# below overly optimistic.
gs_log_reg.fit(X_train, y_train)
gs_log_reg.best_params_
# Predictions of the refit best estimator on the untouched test split.
y_preds = gs_log_reg.predict(X_test)
y_preds
# ROC curve and raw confusion matrix for the test split.
RocCurveDisplay.from_estimator(gs_log_reg, X_test, y_test)
print(confusion_matrix(y_test, y_preds))
# Enlarge seaborn's font scale for the plots that follow.
sns.set(font_scale=1.5)

def plot_conf_matrix(y_test, y_preds):
    """
    Plot the confusion matrix of y_test vs y_preds as an annotated heatmap.
    y_test: true labels
    y_preds: predicted labels
    Draws on a new 3x3 inch figure and returns nothing.
    """
    _, ax = plt.subplots(figsize=(3, 3))
    # Draw explicitly on the axes created above instead of relying on pyplot's
    # implicit "current axes".
    sns.heatmap(confusion_matrix(y_test, y_preds),
                annot=True,
                cbar=False,
                ax=ax)
    # confusion_matrix puts true labels on rows and predictions on columns;
    # the heatmap draws rows along the y-axis and columns along the x-axis.
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")

# Draw the confusion-matrix heatmap, then print the text classification report
# for the same test-set predictions.
plot_conf_matrix(y_test, y_preds)
print(classification_report(y_test, y_preds))
gs_log_reg.best_params_

# Logistic regression with fixed hyperparameters.
# NOTE(review): the C value is presumably copied from an earlier
# gs_log_reg.best_params_ output — confirm it still matches the current search.
clf = LogisticRegression(solver="liblinear", C=0.7278953843983146)

# Mean accuracy over 5 cross-validation folds on the full X / y.
cv_acc = cross_val_score(clf, X, y, scoring="accuracy", cv=5).mean()
cv_acc
# Mean precision over 5 cross-validation folds on the full X / y.
cv_precision = cross_val_score(clf, X, y, scoring="precision", cv=5).mean()
cv_precision

# Mean recall over 5 cross-validation folds on the full X / y.
cv_recall = cross_val_score(clf, X, y, scoring="recall", cv=5).mean()
cv_recall
# Mean F1 over 5 cross-validation folds. The scorer must be "f1"; using
# "precision" here would silently report precision a second time under the
# F1 label.
cv_f1 = cross_val_score(
    clf,
    X,
    y,
    cv=5,
    scoring="f1"
)

cv_f1 = np.mean(cv_f1)
cv_f1
# Collect the four cross-validated means in a one-row frame and plot them,
# transposed so each metric becomes one bar.
cv_metrics = pd.DataFrame({
    "Accuracy": cv_acc,
    "Precision": cv_precision,
    "Recall": cv_recall,
    "F1": cv_f1
}, index=[0])
cv_metrics.T.plot.bar(title="Cross Validated classification metrics", legend=False);
gs_log_reg.best_params_
# Refit a logistic regression with fixed hyperparameters on the training split
# so its coefficients can be inspected.
clf = LogisticRegression(C=0.7278953843983146, solver="liblinear")
clf.fit(X_train, y_train)
clf.coef_
# Pair each coefficient with the column it was fitted on. Use the columns of
# the training features rather than df.columns: df still contains the
# `target` label, so zipping against it only lines up while `target` happens
# to be the last column.
feature_dict = dict(zip(X_train.columns, clf.coef_[0]))
feature_dict
# One-row frame of coefficients, transposed so each feature becomes one bar.
feature_df = pd.DataFrame(feature_dict, index=[0])
feature_df.T.plot.bar(title="Feature Importance", legend=False);
