<a href="https://colab.research.google.com/github/velaga-9/Health-Risk-Alert-System/blob/main/MINOR_cv%3D3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Data Exploration**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
data=pd.read_csv('/content/drive/MyDrive/data.csv',encoding='utf-8-sig')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22269 entries, 0 to 22268
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Age        22269 non-null  int64
 1   HeartBeat  22269 non-null  int64
 2   Systolic   22269 non-null  int64
 3   Diastolic  22269 non-null  int64
 4   Sugar      22269 non-null  int64
 5   Alert      22269 non-null  int64
dtypes: int64(6)
memory usage: 1.0 MB


In [None]:
#To check if any null values r presnt
data.isnull().sum()

Age          0
HeartBeat    0
Systolic     0
Diastolic    0
Sugar        0
Alert        0
dtype: int64

In [None]:
#Correlation Matrixx
data.corr()

Unnamed: 0,Age,HeartBeat,Systolic,Diastolic,Sugar,Alert
Age,1.0,0.026617,0.000228,0.000196,0.000264,0.000966
HeartBeat,0.026617,1.0,0.053563,0.025512,0.006078,0.109718
Systolic,0.000228,0.053563,1.0,0.918053,0.015303,0.302769
Diastolic,0.000196,0.025512,0.918053,1.0,0.0123,0.291115
Sugar,0.000264,0.006078,0.015303,0.0123,1.0,0.302975
Alert,0.000966,0.109718,0.302769,0.291115,0.302975,1.0


In [None]:
data.columns.values.tolist()

['Age', 'HeartBeat', 'Systolic', 'Diastolic', 'Sugar', 'Alert']

In [None]:
data.head()

Unnamed: 0,Age,HeartBeat,Systolic,Diastolic,Sugar,Alert
0,18,49,90,60,80,0
1,18,49,90,60,100,0
2,18,49,90,60,120,0
3,18,49,90,60,140,0
4,18,49,90,60,160,0


In [None]:
#Spliting into features and result
X=data[["Age","HeartBeat","Systolic","Diastolic","Sugar"]]
y=data['Alert']

In [None]:
#Spliting the dataset into testing and training data set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state = 30)

### **KNN**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [None]:
grid_params = { 'n_neighbors' : np.arange(1,100),
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
#Grid Search will evaluate the model 99(1-100) x 2(weights) x 3 (metric) x 3(folds) = 1782 times with different hyperparameters.
knn=KNeighborsClassifier()
gs = GridSearchCV(knn, grid_params, verbose = 1, cv=3, n_jobs = -1)
# fit the model on our train set
g_res = gs.fit(X_train, y_train)

Fitting 3 folds for each of 594 candidates, totalling 1782 fits


We will use three hyperparamters- n-neighbors, weights and metric.

n_neighbors: Decide the best k based on the values we have computed earlier.

weights: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.

metric: The distance metric to be used will calculating the similarity.

In [None]:
# find the best score.......Mean cross-validated score of the best_estimator
g_res.best_score_

0.9949480396228464

In [None]:
# get the hyperparameters with the best score
g_res.best_params_

{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}

In [None]:
# After hyperparameter tuning and model training

# Predict on the test data
y_pred = g_res.best_estimator_.predict(X_test)

# Evaluate the model performance on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)  #how close a measurement is to the true or accepted value
precision = precision_score(y_test, y_pred)  #how close measurements of the same item are to each other
recall = recall_score(y_test, y_pred)  #completeness of positive predictions
f1 = f1_score(y_test, y_pred)  #Precision + recall

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.9950606196677144
Precision: 0.9968966340415374
Recall: 0.9978494623655914
F1 score: 0.9973728206352998


### **LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
grid_params = { 'penalty' : ['l1'],
                'solver': ['liblinear','saga'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

log=LogisticRegression()
g_l1=GridSearchCV(log,grid_params,verbose=1,cv=3,n_jobs=-1)
g_l1_res=g_l1.fit(X_train, y_train)


Fitting 3 folds for each of 14 candidates, totalling 42 fits


In [None]:
g_l1_res.best_score_

0.9630086750947421

In [None]:
g_l1_res.best_params_

{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}

In [None]:
# After hyperparameter tuning and model training

# Predict on the test data
y_pred = g_l1_res.best_estimator_.predict(X_test)

# Evaluate the model performance on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.958688819039066
Precision: 0.9708166627441751
Recall: 0.985663082437276
F1 score: 0.9781835428029405


In [None]:
from sklearn.linear_model import LogisticRegression
grid_params = { 'penalty' : ['l2'],
                'solver': ['liblinear','saga','lbfgs','newton-cg','newton-cholesky','sag'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

log=LogisticRegression()
g_l2=GridSearchCV(log,grid_params,verbose=1,cv=3,n_jobs=-1)
g_l2_res=g_l2.fit(X_train, y_train)


Fitting 3 folds for each of 42 candidates, totalling 126 fits


In [None]:
g_l2_res.best_score_

0.9632331986853432

In [None]:
g_l2_res.best_params_

{'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}

In [None]:
# After hyperparameter tuning and model training

# Predict on the test data
y_pred = g_l2_res.best_estimator_.predict(X_test)

# Evaluate the model performance on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.9591378536147284
Precision: 0.9712738403578997
Recall: 0.985663082437276
F1 score: 0.9784155597722959


In [None]:
#l1_ratiofloat, default=None
#The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'.
#Setting l1_ratio=0 is equivalent to using penalty='l2', while setting l1_ratio=1 is equivalent to using penalty='l1'.
# For 0 < l1_ratio <1, the penalty is a combination of L1 and L2.

from sklearn.linear_model import LogisticRegression
grid_params = { 'penalty' : ['elasticnet'],
                'solver': ['saga'],
               'l1_ratio':np.arange(0.1,1,.1),
               'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

log=LogisticRegression()
g_el=GridSearchCV(log,grid_params,verbose=1, cv=3 ,n_jobs=-1,error_score='raise')
g_el_res=g_el.fit(X_train, y_train)

Fitting 3 folds for each of 63 candidates, totalling 189 fits




In [None]:
g_el_res.best_score_

0.9361773404032272

In [None]:
g_el_res.best_params_

{'C': 0.001, 'l1_ratio': 0.9, 'penalty': 'elasticnet', 'solver': 'saga'}

In [None]:
# After hyperparameter tuning and model training

# Predict on the test data
y_pred = g_el_res.best_estimator_.predict(X_test)

# Evaluate the model performance on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.9391558149977548
Precision: 0.9407657657657658
Recall: 0.9980884109916368
F1 score: 0.9685797101449276


### **DECISION TREE**

In [None]:
# from pandas.core.arrays.datetimes import DT64NS_DTYPE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# from sklearn.tree import plot_tree
# import matplotlib.pyplot as plt


dt= DecisionTreeClassifier(random_state=30)

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_leaf_nodes': [None, 5, 10, 15, 20],
}

g_dt = GridSearchCV(dt,param_grid, verbose=1, cv=3, n_jobs=-1)
g_dt_res = g_dt.fit(X_train, y_train)
# plt.figure(figsize=(20,10))
# plot_tree(gd)
# plt.show()

Fitting 3 folds for each of 2500 candidates, totalling 7500 fits


In [None]:
g_dt_res.best_score_

0.9986528395523268

In [None]:
g_dt_res.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'random'}

In [None]:
# After hyperparameter tuning and model training

# Predict on the test data
y_pred = g_dt_res.best_estimator_.predict(X_test)

# Evaluate the model performance on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)


Accuracy: 0.9975303098338572
Precision: 0.9985666507405638
Recall: 0.998805256869773
F1 score: 0.9986859395532194


In [None]:
from sklearn.metrics import confusion_matrix

# After predicting on the test data
cm = confusion_matrix(y_test, y_pred)

print("Confusion matrix:")
print(cm)


Confusion matrix:
[[ 263    6]
 [   5 4180]]


### **RANDOM FOREST**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators':[50,100,200],
    'max_depth': [6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    #'max_leaf_nodes': [None, 5, 10, 15, 20],
}

rf=RandomForestClassifier()
gr = GridSearchCV(rf,param_grid, verbose=1, cv=3, n_jobs=-1)
gr_res = gr.fit(X_train, y_train)

Fitting 3 folds for each of 450 candidates, totalling 1350 fits


In [None]:
gr_res.best_score_

0.998596694476627

In [None]:
gr_res.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 100}

In [None]:
# After hyperparameter tuning and model training

# Predict on the test data
y_pred = gr_res.best_estimator_.predict(X_test)

# Evaluate the model performance on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)


Accuracy: 0.9975303098338572
Precision: 0.9983285577841452
Recall: 0.9990442054958184
F1 score: 0.9986862534336558
