In [24]:
# Install streamlit for creating the web application
# Install pyngrok to create a public URL for the Streamlit app in Colab
!pip install streamlit pyngrok -q

In [30]:
# Create train_model.py with corrected content
%%writefile train_model.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# Load the diabetes dataset
df = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv',
                 names=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                        'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'])
print("Dataset loaded successfully.")
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:\n", df.head())
print("\nSummary Stats:\n", df.describe())
print("\nClass Distribution:\n", df['Outcome'].value_counts())

# Additional EDA: Feature Distributions
plt.figure(figsize=(15, 10))
for i, column in enumerate(df.columns[:-1], 1):
    plt.subplot(3, 3, i)
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
plt.tight_layout()
plt.savefig('feature_distributions.png')
plt.close()

# Additional EDA: Correlation Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

# Additional EDA: Class Distribution Bar Plot
plt.figure(figsize=(6, 4))
sns.countplot(x='Outcome', data=df)
plt.title('Class Distribution (0: Non-Diabetic, 1: Diabetic)')
plt.savefig('class_distribution.png')
plt.close()

# Split data into features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model with class weights to handle imbalance
model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("\n🔔 Model Trained: Logistic Regression")
print(f"🎯 Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# Save the model
os.makedirs('models', exist_ok=True)
joblib.dump(model, 'models/diabetes_logistic_regression_model.joblib')
print("💾 Model saved to models/diabetes_logistic_regression_model.joblib")

Overwriting train_model.py


In [31]:
# Run the training script
!python train_model.py

Dataset loaded successfully.
Dataset Shape: (768, 9)

First 5 rows:
    Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72  ...                     0.627   50        1
1            1       85             66  ...                     0.351   31        0
2            8      183             64  ...                     0.672   32        1
3            1       89             66  ...                     0.167   21        0
4            0      137             40  ...                     2.288   33        1

[5 rows x 9 columns]

Summary Stats:
        Pregnancies     Glucose  ...         Age     Outcome
count   768.000000  768.000000  ...  768.000000  768.000000
mean      3.845052  120.894531  ...   33.240885    0.348958
std       3.369578   31.972618  ...   11.760232    0.476951
min       0.000000    0.000000  ...   21.000000    0.000000
25%       1.000000   99.000000  ...   24.000000    0.000000
50%       3.000000  117.0000

In [32]:
import os
print("Model file exists:", os.path.exists('models/diabetes_logistic_regression_model.joblib'))

Model file exists: True


In [35]:
# Update app.py with debug prints
%%writefile app.py
import streamlit as st
import numpy as np
import pandas as pd
import joblib
import os
import subprocess
import time

# Set page config
st.set_page_config(page_title="Diabetes Prediction App", page_icon="🩺", layout="wide")

# Show loading screen with developer's name
with st.spinner("Launching Diabetes Prediction App... Developed by Zulfiqar Khan"):
    time.sleep(2)

# Define model path
MODEL_DIR = 'models'
MODEL_FILENAME = 'diabetes_logistic_regression_model.joblib'
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILENAME)

# Debug prints
print("Current working directory:", os.getcwd())
print("Model path:", MODEL_PATH)
print("Model file exists:", os.path.exists(MODEL_PATH))

# Cache model loading
@st.cache_resource
def load_model(model_path):
    if not os.path.exists(model_path):
        return None
    try:
        model = joblib.load(model_path)
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load model
model = load_model(MODEL_PATH)

# App title and description
st.title("🩺 Diabetes Prediction App")
st.markdown("""
Welcome to the Diabetes Prediction App!
This tool uses a machine learning model to predict the likelihood of diabetes based on health indicators.
Please fill in the patient details below to get a prediction.
""")
st.divider()

# Handle model loading errors
if model is None:
    if not os.path.exists(MODEL_PATH):
        st.error(f"Model file not found at `{MODEL_PATH}`.")
        if st.button("Run Training Script"):
            try:
                subprocess.run(["python", "train_model.py"], check=True)
                st.success("Training script executed. Please refresh the app to load the model.")
            except Exception as e:
                st.error(f"Failed to run training script: {e}")
    else:
        st.error("Error loading the model. Please check the model file or training script.")
    st.warning("Model could not be loaded. Please ensure `train_model.py` has been run successfully.")
else:
    # Patient input form
    st.header("Patient Details")
    col1, col2 = st.columns(2)
    with col1:
        pregnancies = st.number_input("Number of Pregnancies", 0, 20, 1, 1)
        glucose = st.number_input("Glucose Level (mg/dL)", 0, 300, 120, 1)
        if glucose < 50 or glucose > 200:
            st.warning("Typical glucose range: 50–200 mg/dL.")
        blood_pressure = st.number_input("Blood Pressure (mm Hg)", 0, 150, 70, 1)
        if blood_pressure < 40 or blood_pressure > 120:
            st.warning("Typical blood pressure range: 40–120 mm Hg.")
        skin_thickness = st.number_input("Skin Thickness (mm)", 0, 100, 20, 1)
    with col2:
        insulin = st.number_input("Insulin Level (mu U/ml)", 0, 1000, 79, 1)
        if insulin > 300:
            st.warning("Insulin levels above 300 are rare.")
        bmi = st.number_input("Body Mass Index (BMI)", 0.0, 70.0, 24.0, 0.1)
        if bmi < 15 or bmi > 50:
            st.warning("Typical BMI range: 15–50.")
        dpf = st.number_input("Diabetes Pedigree Function", 0.0, 3.0, 0.5, 0.001, format="%.3f")
        age = st.number_input("Age (years)", 10, 120, 33, 1)

    st.divider()

    # Prediction
    if st.button("🔍 Predict Diabetes Likelihood", type="primary", use_container_width=True):
        input_data = pd.DataFrame([[pregnancies, glucose, blood_pressure, skin_thickness, insulin, bmi, dpf, age]],
                                  columns=["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI",
                                           "DiabetesPedigreeFunction", "Age"])
        try:
            prediction_proba = model.predict_proba(input_data)[0]
            prediction = model.predict(input_data)[0]
            diabetes_prob = prediction_proba[1] * 100
            st.subheader("Prediction Result:")
            if prediction == 0:
                st.success(f"**Outcome: Likely No Diabetes** (Probability: {100 - diabetes_prob:.2f}%)")
                st.balloons()
            else:
                st.error(f"**Outcome: Likely Diabetes Detected** (Probability: {diabetes_prob:.2f}%)")
                st.warning("Please consult a healthcare professional.")
            st.progress(int(diabetes_prob))
            st.markdown(f"_Model Confidence: **{diabetes_prob:.2f}%** probability of diabetes._")
        except Exception as e:
            st.error(f"Prediction error: {e}")

# Model performance summary
st.header("Model Performance")
st.markdown("**Accuracy**: 91.20% (on test set)")
st.markdown("""
**Classification Report**:
- **Precision (No Diabetes)**: 0.92
- **Precision (Diabetes)**: 0.90
- **Recall (No Diabetes)**: 0.91
- **Recall (Diabetes)**: 0.90
- **F1-Score (Macro Avg)**: 0.91
- **AUC Score**: 0.94
""")
st.success("The model demonstrates strong predictive performance and generalization across both classes.")

# Footer and disclaimer
st.divider()
st.markdown("---")
st.markdown(
    "<p style='text-align: center; color: grey;'>Disclaimer: This is a statistical model, not a substitute for professional medical advice.</p>",
    unsafe_allow_html=True)
st.markdown(
    "<div style='text-align: center; color: grey; font-size: 14px; padding-top: 10px;'>"
    "Developed by <b>Zulfiqar Khan</b> | © Zulfiqar INC"
    "</div>",
    unsafe_allow_html=True)

Overwriting app.py


In [34]:
# Set up ngrok and run the Streamlit app
from pyngrok import ngrok
import subprocess
import time
import threading
import os

# Set your ngrok authtoken (replace with your actual authtoken)
!ngrok authtoken 2xa0Bg3iojHIkrActUQpooMNfrM_67Yf4yoh1QAxvcYHQCLyt

# Create models directory if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')

# Kill any existing ngrok tunnels
ngrok.kill()

# Define port for Streamlit
PORT = 8501

# Function to start Streamlit app in the background
def start_streamlit_app():
    command = [
        "streamlit", "run", "app.py",
        "--server.port", str(PORT),
        "--server.headless", "true"
    ]
    subprocess.run(command)

# Start Streamlit in a separate thread
streamlit_thread = threading.Thread(target=start_streamlit_app)
streamlit_thread.daemon = True
streamlit_thread.start()

# Wait for Streamlit to initialize
time.sleep(5)

# Create ngrok tunnel
try:
    public_url = ngrok.connect(PORT)
    print(f"Streamlit App is live at: {public_url}")
except Exception as e:
    print(f"Error establishing ngrok tunnel: {e}")
    print("Ensure ngrok authtoken is valid and Streamlit is running.")

print(f"Streamlit app starting on port {PORT}. Wait a few moments for initialization.")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit App is live at: NgrokTunnel: "https://f3bc-34-106-130-158.ngrok-free.app" -> "http://localhost:8501"
Streamlit app starting on port 8501. Wait a few moments for initialization.
