In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import joblib

# Diabetes Prediction

In [40]:
diabetes_prediction_dataset = pd.read_csv('diabetes_prediction_dataset.csv')
diabetes_prediction_dataset

Unnamed: 0,sex,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80,0,1,0,25.19,6.6,140,1
1,0,54,0,0,0,27.32,6.6,80,1
2,1,28,0,0,0,27.32,5.7,158,1
3,0,36,0,0,2,23.45,5.0,155,1
4,1,76,1,1,2,20.14,4.8,155,1
...,...,...,...,...,...,...,...,...,...
99995,0,80,0,0,4,27.32,6.2,90,0
99996,0,2,0,0,4,17.37,6.5,100,0
99997,1,66,0,0,1,27.83,5.7,155,0
99998,0,24,0,0,0,35.42,4.0,100,0


In [79]:
df = diabetes_prediction_dataset.copy()

def map_age_to_value(age):
    if age < 19:
        return 1
    elif 19 <= age <= 35:
        return 2
    elif 36 <= age <= 65:
        return 3
    else:
        return 4

def map_glucose_to_value(glucose):
    if glucose < 70:
        return 1
    elif 70 <= glucose <= 140:
        return 2
    else:
        return 3

def map_bmi_to_value(bmi):
    if bmi < 18.5:
        return 1
    elif 18.5 <= bmi <= 24.9:
        return 2
    elif 25 <= bmi <= 29.9:
        return 3
    else:
        return 4

def map_hba1c_to_value(hba1c):
    if hba1c < 5.7:
        return 1
    elif 5.7 <= hba1c <= 6.4:
        return 2
    else:
        return 3

def map_sex_to_value(sex):
    if sex == '0':
        return 1
    else:
        return 0

def map_plus_1(value):
    return value + 1

df['sex'] = df['sex'].apply(map_sex_to_value)
df['age'] = df['age'].apply(map_age_to_value)
df['blood_glucose_level'] = df['blood_glucose_level'].apply(map_glucose_to_value)
df['bmi'] = df['bmi'].apply(map_bmi_to_value)
df['HbA1c_level'] = df['HbA1c_level'].apply(map_hba1c_to_value)
df['hypertension'] = df['hypertension'].apply(map_plus_1)
df['heart_disease'] = df['heart_disease'].apply(map_plus_1)
df['smoking_history'] = df['smoking_history'].apply(map_plus_1)

df.tail(5)

Unnamed: 0,sex,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,0,4,1,1,5,3,2,2,0
99996,0,1,1,1,5,1,3,2,0
99997,0,4,1,1,2,3,2,3,0
99998,0,2,1,1,1,4,1,2,0
99999,0,3,1,1,3,2,3,2,0


In [80]:
df.head()

Unnamed: 0,sex,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,4,1,2,1,3,3,2,1
1,0,3,1,1,1,3,3,2,1
2,0,2,1,1,1,3,2,3,1
3,0,3,1,1,3,2,1,3,1
4,0,4,2,2,3,2,1,3,1


In [82]:
X = df.drop(columns=['diabetes'])
y = df['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

pickle.dump(model, open('diabetes_prediction_model.pkl', 'wb'))

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))

Accuracy: 76.95%
              precision    recall  f1-score   support

           0       0.79      0.93      0.85     14636
           1       0.63      0.34      0.44      5364

    accuracy                           0.77     20000
   macro avg       0.71      0.63      0.65     20000
weighted avg       0.75      0.77      0.74     20000



# Hypertension Prediction

In [83]:
hypertension_data = pd.read_csv('hypertension_data.csv')
hypertension_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,64,0,2,130,250,0,1,187,0,3.5,0,0,2,1
2,52,1,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,0,1,120,236,0,1,178,0,0.8,2,0,2,1
4,66,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26078,72,0,0,138,294,1,1,106,0,1.9,1,3,2,0
26079,60,1,0,144,200,0,0,126,1,0.9,1,0,3,0
26080,68,1,0,100,234,0,1,156,0,0.1,2,1,3,0
26081,67,1,1,154,232,0,0,164,0,0.0,2,1,2,0


In [86]:
df = hypertension_data.copy()

def map_trestbps_to_value(trestbps):
    if trestbps < 120:
        return 0
    elif 120 <= trestbps <= 129:
        return 1

    elif 130 <= trestbps <= 139:
        return 2

    else:
        return 3

def map_chol_to_value(chol):
    if chol < 200:
        return 0
    elif 200 <= chol <= 239:
        return 1
    else:
        return 2

def map_thalech_to_value(thalach):
    if thalach < 100:
        return 0
    elif 100 <= thalach <= 149:
        return 1
    else:
        return 2

def map_oldpeak_to_value(oldpeak):
    if oldpeak < 0.6:
        return 0
    elif 0.6 <= oldpeak <= 1.5:
        return 1
    else:
        return 2

df['age'] = df['age'].apply(map_age_to_value)
df['sex'] = df['sex'].apply(map_sex_to_value)
df['trestbps'] = df['trestbps'].apply(map_trestbps_to_value)
df['chol'] = df['chol'].apply(map_chol_to_value)
df['thalach'] = df['thalach'].apply(map_thalech_to_value)
df['oldpeak'] = df['oldpeak'].apply(map_oldpeak_to_value)
df['cp'] = df['cp'].apply(map_plus_1)
df['fbs'] = df['fbs'].apply(map_plus_1)
df['restecg'] = df['restecg'].apply(map_plus_1)
df['exang'] = df['exang'].apply(map_plus_1)
df['oldpeak'] = df['oldpeak'].apply(map_plus_1)
df['slope'] = df['slope'].apply(map_plus_1)
df['ca'] = df['ca'].apply(map_plus_1)

df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,3,0,4,3,1,2,1,2,1,3,1,1,1,1
1,3,0,3,2,2,1,2,2,1,3,1,1,2,1
2,3,0,2,2,1,1,1,2,1,2,3,1,2,1
3,3,0,2,1,1,1,2,2,1,2,3,1,2,1
4,4,0,1,1,2,1,2,2,2,2,3,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26078,4,0,1,2,2,2,2,1,1,3,2,4,2,0
26079,3,0,1,3,1,1,1,1,2,2,2,1,3,0
26080,4,0,1,0,1,1,2,2,1,1,3,2,3,0
26081,4,0,2,3,1,1,1,2,1,1,3,2,2,0


In [88]:
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

pickle.dump(model, open('hypertension_prediction_model.pkl', 'wb'))

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))

Accuracy: 99.33%
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2362
           1       1.00      0.99      0.99      2855

    accuracy                           0.99      5217
   macro avg       0.99      0.99      0.99      5217
weighted avg       0.99      0.99      0.99      5217



# Stroke Prediction

In [74]:
stroke_data = pd.read_csv('stroke_data.csv')
stroke_data

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,63,0,1,1,4,1,228.69,36.6,1,1
1,1,42,0,1,1,4,0,105.92,32.5,0,1
2,0,61,0,0,1,4,1,171.23,34.4,1,1
3,1,41,1,0,1,3,0,174.12,24.0,0,1
4,1,85,0,0,1,4,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
40905,1,38,0,0,0,4,1,120.94,29.7,1,0
40906,0,53,0,0,1,4,0,77.66,40.8,0,0
40907,1,32,0,0,1,2,0,231.95,33.2,0,0
40908,1,42,0,0,1,3,0,216.38,34.5,0,0


In [89]:
df = stroke_data.copy()

df['age'] = df['age'].apply(map_age_to_value)
df['sex'] = df['sex'].apply(map_sex_to_value)
df['avg_glucose_level'] = df['avg_glucose_level'].apply(map_glucose_to_value)
df['bmi'] = df['bmi'].apply(map_bmi_to_value)
df['hypertension'] = df['hypertension'].apply(map_plus_1)
df['heart_disease'] = df['heart_disease'].apply(map_plus_1)
df['ever_married'] = df['ever_married'].apply(map_plus_1)
df['work_type'] = df['work_type'].apply(map_plus_1)
df['Residence_type'] = df['Residence_type'].apply(map_plus_1)
df['smoking_status'] = df['smoking_status'].apply(map_plus_1)

df

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,3,1,2,2,5,2,3,4,2,1
1,0,3,1,2,2,5,1,2,4,1,1
2,0,3,1,1,2,5,2,3,4,2,1
3,0,3,2,1,2,4,1,3,2,1,1
4,0,4,1,1,2,5,2,3,3,2,1
...,...,...,...,...,...,...,...,...,...,...,...
40905,0,3,1,1,1,5,2,2,3,2,0
40906,0,3,1,1,2,5,1,2,4,1,0
40907,0,2,1,1,2,3,1,3,4,1,0
40908,0,3,1,1,2,4,1,3,4,1,0


In [90]:
X = df.drop(columns=['stroke'])
y = df['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

pickle.dump(model, open('stroke_prediction_model.pkl', 'wb'))

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))

Accuracy: 78.76%
              precision    recall  f1-score   support

           0       0.80      0.76      0.78      4090
           1       0.77      0.82      0.79      4092

    accuracy                           0.79      8182
   macro avg       0.79      0.79      0.79      8182
weighted avg       0.79      0.79      0.79      8182

