In [1]:
#tryme

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from tkinter import *
import joblib

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Impute missing values in certain columns
data_copy = data.copy(deep=True)
data_copy[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = data_copy[
    ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
].replace(0, np.nan)

imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data_copy), columns=data_copy.columns)

# Create synthetic 'DiabetesType' column based on 'Outcome'
def assign_diabetes_type(row):
    if row['Outcome'] == 0:
        return 'None'
    else:
        r = np.random.rand()
        if r < 0.7:
            return 'Type 2'
        elif r < 0.9:
            return 'Type 1'
        else:
            return 'Gestational'

data_imputed['DiabetesType'] = data_imputed.apply(assign_diabetes_type, axis=1)

# Split the data into features (X) and target variable (y)
X = data_imputed.drop(['Outcome', 'DiabetesType'], axis=1)
y = data_imputed['DiabetesType']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Define pipelines for different classifiers
pipelines = {
    'LR': Pipeline([('scalar', StandardScaler()), ('lr_classifier', LogisticRegression())]),
    'KNN': Pipeline([('scalar', StandardScaler()), ('knn_classifier', KNeighborsClassifier())]),
    'SVC': Pipeline([('scalar', StandardScaler()), ('svc_classifier', SVC())]),
    'DT': Pipeline([('dt_classifier', DecisionTreeClassifier())]),
    'RF': Pipeline([('imputer', SimpleImputer(strategy='mean')), ('rf_classifier', RandomForestClassifier(max_depth=3))]),
    'GBC': Pipeline([('gbc_classifier', GradientBoostingClassifier())])
}

# Train and evaluate each classifier in the pipeline
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {accuracy * 100:.2f}%")

# Train RandomForestClassifier using the pipeline on the entire dataset
rf_pipeline = pipelines['RF']
rf_pipeline.fit(X, y)

# Save the RandomForestClassifier model
joblib.dump(rf_pipeline, 'model_joblib_diabetes')

# Define precautions for each diabetes type
precautions = {
    'Type 1': [
        "1. Take insulin as prescribed by your doctor.",
        "2. Monitor blood sugar levels regularly.",
        "3. Follow a healthy diet and exercise regularly.",
        "4. Attend regular check-ups with your healthcare provider.",
        "5. Stay informed about new diabetes management techniques."
    ],
    'Type 2': [
        "1. Maintain a healthy weight through diet and exercise.",
        "2. Take medications as prescribed by your doctor.",
        "3. Monitor blood sugar levels regularly.",
        "4. Follow a balanced diet rich in fiber and low in processed sugars.",
        "5. Attend regular check-ups with your healthcare provider."
    ],
    'Gestational': [
        "1. Monitor your blood sugar levels regularly.",
        "2. Follow a balanced diet as recommended by your healthcare provider.",
        "3. Engage in regular physical activity.",
        "4. Attend all prenatal appointments and follow your doctor's advice.",
        "5. Be aware of the potential for type 2 diabetes after pregnancy."
    ],
    'None': [
        "1. Maintain a healthy lifestyle to reduce the risk of developing diabetes.",
        "2. Eat a balanced diet rich in fiber and low in processed sugars.",
        "3. Engage in regular physical activity.",
        "4. Monitor your health with regular check-ups.",
        "5. Stay informed about diabetes prevention."
    ]
}

# GUI for prediction and displaying precautions
def show_entry_fields():
    inputs = [float(entry.get()) for entry in entries]
    model = joblib.load('model_joblib_diabetes')
    result = model.predict([inputs])[0]

    Label(master, text=f"Diabetes Type: {result}").grid(row=31)
    for i, precaution in enumerate(precautions[result]):
        Label(master, text=f"Precaution {i + 1}: {precaution}").grid(row=32 + i)

# GUI setup
master = Tk()
master.title("Diabetes Prediction Using Machine Learning")

label = Label(master, text="Diabetes Prediction Using Machine Learning", bg="black", fg="white").grid(row=0, columnspan=2)

# Labels and entry fields
labels = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
entries = [Entry(master) for _ in range(len(labels))]

for i, label in enumerate(labels):
    Label(master, text=label).grid(row=i + 1)
    entries[i].grid(row=i + 1, column=1)

# Button for prediction
Button(master, text='Predict', command=show_entry_fields).grid()

# Start the main loop
mainloop()


LR Test Accuracy: 73.38%
KNN Test Accuracy: 65.58%
SVC Test Accuracy: 71.43%
DT Test Accuracy: 62.99%
RF Test Accuracy: 73.38%
GBC Test Accuracy: 66.88%




In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from tkinter import *
import joblib

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Impute missing values in certain columns
data_copy = data.copy(deep=True)
data_copy[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = data_copy[
    ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
].replace(0, np.nan)

imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data_copy), columns=data_copy.columns)

# Create synthetic 'DiabetesType' column based on 'Outcome' with a deterministic approach for higher glucose levels
def assign_diabetes_type(row):
    if row['Outcome'] == 0:
        return 'Non-Diabetic'
    else:
        if row['Glucose'] > 180:
            return 'Type 1'
        elif row['Glucose'] > 140:
            return 'Type 2'
        else:
            return 'Gestational'

data_imputed['DiabetesType'] = data_imputed.apply(assign_diabetes_type, axis=1)

# Split the data into features (X) and target variable (y)
X = data_imputed.drop(['Outcome', 'DiabetesType'], axis=1)
y = data_imputed['DiabetesType']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Define pipelines for different classifiers
pipelines = {
    'LR': Pipeline([('scalar', StandardScaler()), ('lr_classifier', LogisticRegression())]),
    'KNN': Pipeline([('scalar', StandardScaler()), ('knn_classifier', KNeighborsClassifier())]),
    'SVC': Pipeline([('scalar', StandardScaler()), ('svc_classifier', SVC())]),
    'DT': Pipeline([('dt_classifier', DecisionTreeClassifier())]),
    'RF': Pipeline([('imputer', SimpleImputer(strategy='mean')), ('rf_classifier', RandomForestClassifier(max_depth=3))]),
    'GBC': Pipeline([('gbc_classifier', GradientBoostingClassifier())])
}

# Train and evaluate each classifier in the pipeline
best_accuracy = 0
best_pipeline_name = None

for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {accuracy * 100:.2f}%")
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline_name = name

# Save the best performing model
best_pipeline = pipelines[best_pipeline_name]
joblib.dump(best_pipeline, 'model_joblib_diabetes')

# Define precautions for each diabetes type
precautions = {
    'Type 1': [
        "1. Take insulin as prescribed by your doctor.",
        "2. Monitor blood sugar levels regularly.",
        "3. Follow a healthy diet and exercise regularly.",
        "4. Attend regular check-ups with your healthcare provider.",
        "5. Stay informed about new diabetes management techniques."
    ],
    'Type 2': [
        "1. Maintain a healthy weight through diet and exercise.",
        "2. Take medications as prescribed by your doctor.",
        "3. Monitor blood sugar levels regularly.",
        "4. Follow a balanced diet rich in fiber and low in processed sugars.",
        "5. Attend regular check-ups with your healthcare provider."
    ],
    'Gestational': [
        "1. Monitor your blood sugar levels regularly.",
        "2. Follow a balanced diet as recommended by your healthcare provider.",
        "3. Engage in regular physical activity.",
        "4. Attend all prenatal appointments and follow your doctor's advice.",
        "5. Be aware of the potential for type 2 diabetes after pregnancy."
    ],
    'Non-Diabetic': [
        "1. Maintain a healthy lifestyle to reduce the risk of developing diabetes.",
        "2. Eat a balanced diet rich in fiber and low in processed sugars.",
        "3. Engage in regular physical activity.",
        "4. Monitor your health with regular check-ups.",
        "5. Stay informed about diabetes prevention."
    ]
}

# GUI for prediction and displaying precautions
def show_entry_fields():
    inputs = [float(entry.get()) for entry in entries]
    model = joblib.load('model_joblib_diabetes')
    result = model.predict([inputs])[0]

    Label(master, text=f"Diabetes Type: {result}").grid(row=31)
    for i, precaution in enumerate(precautions[result]):
        Label(master, text=f"Precaution {i + 1}: {precaution}").grid(row=32 + i)

# GUI setup
master = Tk()
master.title("Diabetes Prediction Using Machine Learning")

label = Label(master, text="Diabetes Prediction Using Machine Learning", bg="black", fg="white").grid(row=0, columnspan=2)

# Labels and entry fields
labels = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
entries = [Entry(master) for _ in range(len(labels))]

for i, label in enumerate(labels):
    Label(master, text=label).grid(row=i + 1)
    entries[i].grid(row=i + 1, column=1)

# Button for prediction
Button(master, text='Predict', command=show_entry_fields).grid(row=len(labels) + 1, columnspan=2)

# Start the main loop
mainloop()


LR Test Accuracy: 71.43%
KNN Test Accuracy: 62.34%
SVC Test Accuracy: 72.08%
DT Test Accuracy: 66.88%
RF Test Accuracy: 70.78%
GBC Test Accuracy: 74.68%


