In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import cv2 as cv
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Table of Contents

[Regression](#regression)

[Logistic Regression](#logistic_regression)

[Confusion Matrix](#confusion_matrix)

[ROC_AUC Curve](#roc_auc)

[Decision Tree](#decision_trees)


<a id="regression"></a>
# What is Regression?

Regression is a method for estimating the relationship between a dependent variable and one or more independent variables. It also quantifies the strength of that relationship.

### We fit the regression line using the formula y = mx + b, where m is the slope and b is the intercept.

But for classification this regression line might not be appropriate. Let's see why.

In [None]:
# Read the heart-disease CSV from the attached Kaggle dataset into a DataFrame.

heart_csv_path = '/kaggle/input/heart-disease-uci/heart.csv'
heart = pd.read_csv(heart_csv_path)

In [None]:
# Keep only the first occurrence of each fully duplicated row, then show (rows, columns).

is_repeat = heart.duplicated()
heart = heart[~is_repeat]
heart.shape

In [None]:
# Summary statistics for the numerical columns of the data.

numeric_summary = heart.describe()
numeric_summary

In [None]:
# Fit a degree-1 polynomial (a straight line) of resting blood pressure against age with NumPy.

line_coefficients = np.polyfit(heart['age'], heart['trestbps'], deg=1)
m, b = line_coefficients  # polyfit returns the highest power first: slope, then intercept
print(m, b)

In [None]:
# Calculate slope and intercept manually.

def best_fit_slope_and_intercept(xs,ys):
    """Return the least-squares slope and intercept of the line ``y = m*x + b``.

    Parameters
    ----------
    xs, ys : array-like
        Equal-length sequences of x and y observations. Plain Python lists
        are accepted as well as NumPy arrays / pandas ``.values``.

    Returns
    -------
    (m, b) : tuple of float
        ``m`` is the slope and ``b`` the intercept. If every value in ``xs``
        is identical the denominator is zero and the result is not finite.
    """
    # Coerce to float arrays so that element-wise products work for lists
    # (list * list raises TypeError) and cannot overflow an integer dtype.
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)

    x_mean = np.mean(xs)
    y_mean = np.mean(ys)

    m = (((x_mean * y_mean) - np.mean(xs * ys)) /
         ((x_mean * x_mean) - np.mean(xs * xs)))

    b = y_mean - m * x_mean

    return m, b

In [None]:
# Compute the same line with the hand-written formula to compare against np.polyfit.
ages = heart['age'].values
resting_bp = heart['trestbps'].values
m1, b1 = best_fit_slope_and_intercept(ages, resting_bp)
print(m1, b1)

In [None]:
# Evaluate the fitted line y = m*x + b at every patient's age to get the predicted y values.

ages = heart['age'].values
regression_line = list(m * ages + b)

In [None]:
# Scatter the raw (age, resting blood pressure) points and overlay the fitted line.

plt.figure(figsize=(15, 5))
axes = plt.gca()
axes.scatter(heart['age'], heart['trestbps'], color='#003F72')
axes.plot(heart['age'], regression_line)

In [None]:
# Plot resting blood pressure against the binary target for every patient and
# overlay the least-squares line fitted to those points.

style.use('ggplot')
heart_temp = heart.copy()
bp_values = heart_temp['trestbps']
m, b = np.polyfit(bp_values, heart_temp['target'], deg=1)
regression_line = list(m * bp_values.values + b)
plt.figure(figsize=(15, 5))
axes = plt.gca()
axes.scatter(bp_values, heart_temp['target'], color='#003F72')
axes.plot(bp_values, regression_line)

In [None]:
# Move every blood-pressure reading of 200 out to 300 to create artificial outliers, then
# refit the line. The fit shifts in response to just those few points, which shows that
# linear regression is very sensitive to outliers; with more such points the fitted line
# can misclassify many more data points. This is one of the reasons why linear regression
# does not suit binary outputs (0's and 1's), and that is where logistic regression comes in.

heart_temp = heart.copy()
# Assign the replaced column back instead of calling replace(..., inplace=True) on the
# column selection: the in-place call acts on an intermediate object and is not guaranteed
# to update heart_temp (it is a no-op under pandas Copy-on-Write).
heart_temp['trestbps'] = heart_temp['trestbps'].replace({200: 300})
m,b = np.polyfit(heart_temp['trestbps'], heart_temp['target'], 1)
# m is a scalar, so plain broadcasting gives the predicted value for every row.
regression_line = m * heart_temp['trestbps'] + b
plt.figure(figsize=(15,5))
plt.scatter(heart_temp['trestbps'], heart_temp['target'], color='#003F72')
plt.plot(heart_temp['trestbps'], regression_line)

In [None]:
# Class balance of the target column: 1 = has heart disease, 0 = does not.
sns.countplot(data=heart, x="target", palette="bwr")
plt.show()

<a id="logistic_regression"></a>

# Logistic Regression

To solve this issue we need a formula that produces values between 0 and 1, which can be read as the probability of the outcome being true. Such a formula is the sigmoid (logistic) function, and the algorithm that uses it is Logistic Regression.

$$ h_\theta(x) = \frac{1}{1 + e^{-\theta^T x}} $$

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise to scalars or arrays."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

In [None]:
# Push the raw 'restecg' values through the sigmoid and print the transformed series.
z = heart['restecg']
a = z.pipe(sigmoid)
print(a)

In [None]:
# Plot each input value against its sigmoid output.
plt.scatter(x=z, y=a)

In [None]:
# Show the column labels available in the dataset.
column_labels = heart.columns
column_labels

In [None]:
# Separate the feature columns (everything except 'target') from the label array.
x = heart.drop(columns=['target'])
y = heart['target'].values

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(x, y, random_state=0, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Print the shape of each piece of the split, in the order train/test features, train/test labels.
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

In [None]:
# Fit a logistic-regression classifier on the training rows and record its
# accuracy (as a percentage) on the held-out rows.
lr = LogisticRegression().fit(x_train, y_train)
acc = 100 * lr.score(x_test, y_test)

accuracies = {'Logistic Regression': acc}
print("Test Accuracy {:.2f}%".format(acc))


<a id="confusion_matrix"></a>

# Confusion matrix 
Confusion matrix provides data which describes performance of the classification model. Matrix data is calculated on test data for which true values are known. It gives number for correct and incorrect predictions made by the model.

The four cells of the matrix represent:

True Positive - Actually positive and predicted also positive.

False Positive - Actually negative but predicted as positive.

False Negative - Actually positive but predicted as negative.

True Negative - Actually negative and predicted also negative.

Accuracy = (TP + TN) / (TP + TN + FP + FN)

False positive also known as Type 1 error

False Negative also known as Type 2 error

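In a medical screening problem like this one, Type 2 errors (a sick patient predicted as healthy) are usually more dangerous than Type 1 errors.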


In [None]:
# Put the model's predictions next to the true labels for the held-out rows.
predicted_labels = lr.predict(x_test)
score = pd.DataFrame({"Predicted": predicted_labels, "Actual": y_test})
pd.set_option("display.max_rows", 100)  # raise the row limit used when displaying frames
score

In [None]:
# Confusion matrix of true labels (first argument) against predicted labels (second argument).
confusion_matrix(score["Actual"], score["Predicted"])

In [None]:
# Precision = TP / (TP + FP): of everything predicted positive, the share that really is positive.
# The counts are read from the confusion matrix rather than hard-coded from one particular
# run, so the value stays correct if the data, the split or the model changes.
tn, fp, fn, tp = confusion_matrix(score.Actual, score.Predicted).ravel()
precision = tp / (tp + fp) * 100
precision

In [None]:
# Accuracy = (TP + TN) / (TP + TN + FP + FN), i.e. the share of rows predicted correctly.
# Computed from the predictions instead of hard-coded counts so it always matches the
# current model and split.
accuracy = (score.Actual == score.Predicted).mean() * 100
accuracy

<a id="roc_auc"></a>

# ROC-AUC curve

ROC curve helps in determining what should be the right threshold value.

On this curve on y axis we plot sensitivity i.e out of total positive or true cases how much were predicted correctly i.e what proportion of data was correctly classified out of total positive or true values.

Sensitivity/TPR/Recall = TP / (TP + FN)

Specificity = TN / (TN + FP)

On the x axis we plot the False Positive Rate, i.e. the ratio of negative cases predicted as positive.
FPR = 1 - Specificity = FP / (FP + TN)

Precision = TP / (TP + FP)

Sometimes the data is imbalanced, e.g. people without heart disease far outnumber people with it. In that case we can replace FPR with Precision and plot a precision-recall curve instead.


AUC is the area under the ROC curve and helps in comparing models. A higher AUC is better than a lower AUC.

In the curve below the classes are separated perfectly, which is the ideal situation: positive cases are classified as positive and negative cases as negative. AUC = 1

![ROC Class Separability](https://miro.medium.com/max/528/1*Uu-t4pOotRQFoyrfqEvIEg.png)

![ROC AUC Curve](https://miro.medium.com/max/323/1*HmVIhSKznoW8tFsCLeQjRw.png)

In critical domains such as serious diseases we cannot afford Type 2 errors (missed positives), so we lower the threshold. Suppose we lower the threshold to 0.3: any case with a predicted probability above 0.3 is then considered True (that is, the person has the disease).

Second situation where there is some overlap of positive and negative classes 
i.e. a few positives are classified as negative and a few negatives are classified as positive. The overlap introduces Type 1 and Type 2 errors, and the choice of threshold trades one off against the other. An AUC of 0.7 means there is a 70% chance that the model ranks a randomly chosen positive case above a randomly chosen negative case.

![Overlapping classes](https://miro.medium.com/max/507/1*yF8hvKR9eNfqqej2JnVKzg.png)

ROC AUC curve will look like

![ROC_AUC_Overlapping](https://miro.medium.com/max/340/1*-tPXUvvNIZDbqXP0qqYNuQ.png)


Worst case: the model cannot distinguish between the classes at all, whatever the threshold. AUC = 0.5
TPR = 1 that means all positive cases are correctly classified as positive.
FPR = 1 that means all negative cases are incorrectly classified as positive.

The point on ROC(1,1) means even we classified all positives correctly but all negatives are misclassified as positive.


AUC looks like 

![AUC_0.5](https://miro.medium.com/max/430/1*iLW_BrJZRI0UZSflfMrmZQ.png)

![ROC_AUC_0.5](https://miro.medium.com/max/363/1*k_MPO2Q9bLNH9k4Wlk6v_g.png)

AUC = 0 
All positives considered as negatives and all negatives considered as positives.

![AUC_0](https://miro.medium.com/max/556/1*aUZ7H-Lw74KSucoLlj1pgw.png)

![ROC_AUC_0](https://miro.medium.com/max/300/1*H7JGQbaa06BUab6tvGNZKg.png)

Source of all images - https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it must be given
# a continuous score. Use the predicted probability of the positive class (column 1);
# thresholded 0/1 labels collapse the curve to a single operating point.
# x_test is still the logistic-regression test split at this point in the notebook.
lr_positive_proba = lr.predict_proba(x_test)[:, 1]
auc = roc_auc_score(score.Actual, lr_positive_proba)
print(auc)

In [None]:
# Build the ROC curve from the predicted probability of the positive class so that it is
# traced over many thresholds; hard 0/1 predictions yield only one interior point.
# x_test is still the logistic-regression test split at this point in the notebook.
fpr, tpr, thresholds = roc_curve(score.Actual, lr.predict_proba(x_test)[:, 1])

In [None]:
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve with a dashed diagonal reference line and show the figure.

    Parameters
    ----------
    fpr : array-like
        False positive rates, plotted on the x axis.
    tpr : array-like
        True positive rates, plotted on the y axis.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Diagonal from (0, 0) to (1, 1) for visual reference.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()

In [None]:
# Draw the ROC curve of the logistic-regression model.
plot_roc_curve(fpr=fpr, tpr=tpr)

<a id="decision_trees"></a>
       
# Decision Trees

Decision trees are flowchart-like trees in which every internal node is a test on a feature, every branch is an outcome of that test, and every leaf node holds a class label.
Decision trees can work on high-dimensional data and do not require domain knowledge.

In [None]:
# Columns fed to the decision tree as input features.
feature_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]

In [None]:
# Build the decision-tree inputs and labels, then hold out 20% of the rows for testing.
Y = heart['target']
X = heart[feature_names]
split_parts = train_test_split(X, Y, random_state=23, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Entropy-based decision tree limited to depth 9.
# random_state is fixed because scikit-learn permutes the candidate features at every split,
# so without a seed two runs on the same data can produce different trees and scores.
dtree = DecisionTreeClassifier(criterion='entropy', max_depth=9, random_state=0)
dtree.fit(x_train, y_train)


In [None]:
# Decision-tree accuracy on the held-out rows, as a percentage.
100 * dtree.score(x_test, y_test)

In [None]:
# Pair the decision tree's predictions with the true labels and show the confusion matrix.
dtree_predictions = dtree.predict(x_test)
score_dtree = pd.DataFrame({"Predicted": dtree_predictions, "Actual": y_test})
pd.set_option("display.max_rows", 100)
confusion_matrix(score_dtree["Actual"], score_dtree["Predicted"])

In [None]:
# Draw the fitted decision tree.
fig = plt.figure(figsize=(25,20))
# class_names must follow the ascending order of the fitted classes (dtree.classes_);
# deriving them from the model avoids labelling class 0 as '1' and vice versa.
_ = tree.plot_tree(dtree,
                   feature_names=list(feature_names),
                   class_names=[str(label) for label in dtree.classes_],
                   filled=True)

In [None]:
# Train a 10-tree, entropy-based random forest on all feature columns and report its
# accuracy on a 20% hold-out split, as a percentage.
Y = heart['target']
X = heart.drop(columns='target')
split_parts = train_test_split(X, Y, random_state=0, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts
rfc = RandomForestClassifier(n_estimators=10, criterion='entropy', max_depth=10, random_state=0)
rfc.fit(x_train, y_train)
100 * rfc.score(x_test, y_test)

In [None]:
# Pair the random forest's predictions with the true labels and show the confusion matrix.
rfc_predictions = rfc.predict(x_test)
score_rfc = pd.DataFrame({"Predicted": rfc_predictions, "Actual": y_test})
pd.set_option("display.max_rows", 100)
confusion_matrix(score_rfc["Actual"], score_rfc["Predicted"])

In [None]:
# Pick the first fitted tree of the forest so it can be visualised on its own.
first_tree_index = 0
estimator_limited = rfc.estimators_[first_tree_index]
estimator_limited


In [None]:
from sklearn.tree import export_graphviz

# Write the selected forest tree to a Graphviz .dot file.
# class_names must be one label per class in ascending class order. The previous
# list(set(str(heart['target']))) produced the individual characters of the Series'
# printed representation instead of class labels, so take them from the fitted forest.
export_graphviz(estimator_limited, out_file='tree_limited.dot', feature_names = list(feature_names),
                class_names = [str(label) for label in rfc.classes_],
                rounded = True, proportion = False, precision = 2, filled = True)

In [None]:
!dot -Tpng tree_limited.dot -o tree_limited.png -Gdpi=600

In [None]:
from IPython.display import Image

# Display the rendered tree image inline.
tree_png = Image(filename='tree_limited.png')
tree_png

In [None]:
# NOTE(review): this redefines plot_roc_curve from the ROC-AUC section earlier in the
# notebook with the same behaviour; this cell can be dropped once that is confirmed.
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve with a dashed diagonal reference line and show the figure.

    Parameters
    ----------
    fpr : array-like
        False positive rates, plotted on the x axis.
    tpr : array-like
        True positive rates, plotted on the y axis.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Diagonal from (0, 0) to (1, 1) for visual reference.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()

In [None]:
# ROC/AUC need a continuous score, so use the tree's predicted probability of the positive
# class instead of its hard 0/1 predictions.
# x_test now holds the random-forest split, so the decision tree's own test rows are
# recovered from `heart` through score_dtree's index (inherited from that split's y_test).
dtree_test_features = heart.loc[score_dtree.index, feature_names]
dtree_positive_proba = dtree.predict_proba(dtree_test_features)[:, 1]
auc_dtree = roc_auc_score(score_dtree.Actual, dtree_positive_proba)
print(auc_dtree)
fpr_dtree, tpr_dtree, thresholds_dtree = roc_curve(score_dtree.Actual, dtree_positive_proba)
plot_roc_curve(fpr_dtree, tpr_dtree)

In [None]:
# ROC/AUC need a continuous score, so use the forest's predicted probability of the positive
# class instead of its hard 0/1 predictions.
# x_test is still the random-forest test split here (no later cell re-splits the data).
rfc_positive_proba = rfc.predict_proba(x_test)[:, 1]
auc_rfc = roc_auc_score(score_rfc.Actual, rfc_positive_proba)
print(auc_rfc)
fpr_rfc, tpr_rfc, thresholds_rfc = roc_curve(score_rfc.Actual, rfc_positive_proba)
print(thresholds_rfc)
plot_roc_curve(fpr_rfc, tpr_rfc)

In [None]:
# Overlay the ROC curves of the three models on one figure for comparison.
plt.figure(0).clf()
# The legend must show the logistic-regression AUC (`auc`), not the accuracy percentage.
plt.plot(fpr, tpr, color='orange', label="Logistic Regression, auc="+str(auc))
plt.plot(fpr_dtree, tpr_dtree, color='blue', label="Decision Tree, auc="+str(auc_dtree))
plt.plot(fpr_rfc, tpr_rfc, color='red', label="Random Forest, auc="+str(auc_rfc))
# Dashed diagonal from (0, 0) to (1, 1) for visual reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()