In [None]:
pip install pandas numpy matplotlib seaborn scikit-learn xgboost




In [None]:
import os

# Report the directory the kernel is running in; relative paths such as the
# CSV file name used when loading the data are resolved against it.
working_dir = os.getcwd()
print(working_dir)


/content


In [None]:
# Upload the dataset through Colab's file picker when running on Google Colab.
# Outside Colab the google.colab package cannot be imported, so instead of
# failing the cell, fall back to expecting the CSV in the working folder.
try:
    from google.colab import files
except ImportError:
    # Empty placeholder so `uploaded` is always defined after this cell.
    uploaded = {}
    print("google.colab not available - place 'diabetes dataset.csv' in the working folder.")
else:
    uploaded = files.upload()


In [1]:
import pandas as pd
data = pd.read_csv('diabetes dataset.csv')   # put the downloaded file in your working folder

# A notebook cell only renders its LAST expression automatically, so the
# preview has to be displayed explicitly or its result is silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(data.head())

# info() prints column dtypes and non-null counts itself.
data.info()

# Summary statistics; rendered because it is the cell's last expression.
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Exploratory data analysis
import seaborn as sns
import matplotlib.pyplot as plt

# Proportion of rows falling into each Outcome value.
outcome_shares = data['Outcome'].value_counts(normalize=True)
print(outcome_shares)

# One histogram per column of the frame.
data.hist(figsize=(12, 10))
plt.tight_layout()

# Annotated heatmap of the pairwise column correlations.
correlations = data.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()


In [None]:
#Data preprocessing — mark invalid zeros as missing values
import numpy as np

# Columns in which a value of 0 is treated as "not recorded".
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Swap every 0 in those columns for NaN, handling each column separately.
data[cols_with_zero] = data[cols_with_zero].apply(
    lambda column: column.replace(0, np.nan)
)

# Per-column count of missing entries after the replacement.
missing_per_column = data.isna().sum()
print(missing_per_column)


In [None]:
#Imputation: fill the NaNs with each column's median (simple and robust to
# outliers; mean or KNN imputation are the alternatives).
# NOTE(review): the medians are fitted on every row of `data`, i.e. before the
# train/test split done later in the notebook, so held-out rows influence the
# fill values. Moving the imputer inside the model pipelines would avoid that.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
filled_values = imputer.fit_transform(data[cols_with_zero])
data[cols_with_zero] = filled_values


In [None]:
#Train/test split
from sklearn.model_selection import train_test_split

# Target is the Outcome column; the features are every other column.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
#Baseline model — Logistic Regression (with pipeline)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit a logistic-regression classifier.
# The step names matter: later cells address the classifier as 'clf'.
baseline_steps = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42)),
]
pipe_lr = Pipeline(steps=baseline_steps)
pipe_lr.fit(X_train, y_train)

# Hard class predictions for the held-out rows, plus the second column of
# predict_proba (the probability assigned to the second class label).
y_pred = pipe_lr.predict(X_test)
test_probabilities = pipe_lr.predict_proba(X_test)
y_prob = test_probabilities[:, 1]


In [None]:
#Evaluation metrics (accuracy, precision, recall, f1, ROC-AUC)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve

# ROC-AUC is computed once from the probabilities and reused for the plot label.
auc_value = roc_auc_score(y_test, y_prob)

# Scores computed from the hard predictions, printed in a fixed order.
label_metrics = [
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
]
for label, metric in label_metrics:
    print(label, metric(y_test, y_pred))
print("ROC-AUC:", auc_value)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Plot ROC: model curve plus the dashed diagonal as a reference line.
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {auc_value:.3f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


In [None]:
#Try SVM (alternative model)
from sklearn.svm import SVC

# Same scaling step as the baseline, followed by a support-vector classifier
# created with probability=True so predict_proba is available.
svc_steps = [
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42)),
]
pipe_svc = Pipeline(steps=svc_steps)
pipe_svc.fit(X_train, y_train)

y_pred_svc = pipe_svc.predict(X_test)
y_prob_svc = pipe_svc.predict_proba(X_test)[:, 1]

svc_auc = roc_auc_score(y_test, y_prob_svc)
print("SVM ROC-AUC:", svc_auc)


In [None]:
#Cross-validation & model comparison
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Five shuffled, stratified folds with a fixed seed; reused by later cells.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score each pipeline by ROC-AUC over the folds of the full X / y.
candidates = {'Logistic Regression': pipe_lr, 'SVM': pipe_svc}
for name, model in candidates.items():
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv)
    mean_auc, std_auc = scores.mean(), scores.std()
    print(name, "ROC-AUC mean ± std:", mean_auc, std_auc)


In [None]:
#Hyperparameter tuning (GridSearchCV)
from sklearn.model_selection import GridSearchCV

# Grid over the C value of the pipeline's 'clf' step; penalty and solver are
# each pinned to a single option.
param_grid = dict(
    clf__C=[0.01, 0.1, 1, 10, 100],
    clf__penalty=['l2'],
    clf__solver=['lbfgs'],
)

# Search on the training split only, scoring by ROC-AUC over the `cv` folds.
gs = GridSearchCV(estimator=pipe_lr, param_grid=param_grid,
                  scoring='roc_auc', cv=cv, n_jobs=-1)
gs.fit(X_train, y_train)

best_params, best_cv_auc = gs.best_params_, gs.best_score_
print("Best params:", best_params)
print("Best CV ROC-AUC:", best_cv_auc)


In [None]:
#Feature selection / importance
#Logistic regression coefficients:
# Take the first coefficient row from the 'clf' step of the tuned pipeline.
tuned_clf = gs.best_estimator_.named_steps['clf']
coef = tuned_clf.coef_[0]

# Rank features by absolute coefficient, largest first.
signed_coefficients = pd.Series(coef, index=X.columns)
feature_importance = signed_coefficients.abs().sort_values(ascending=False)
print(feature_importance)

feature_importance.plot.bar()
plt.title("Feature importance (Logistic Regression coefficients)")
plt.show()


In [None]:
#Random Forest importances:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest with default settings on the training split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Bar chart of the forest's feature_importances_, largest first.
rf_importance = pd.Series(rf.feature_importances_, index=X.columns)
rf_importance.sort_values(ascending=False).plot.bar()
plt.title("Feature importance (Random Forest)")
plt.show()


In [None]:
#RFE example:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

# RFE drops the features with the smallest coefficient magnitudes, and those
# magnitudes are only comparable when the features share a scale. Standardise
# the training features first so the elimination is not driven by each
# column's units.
X_train_scaled = StandardScaler().fit_transform(X_train)

selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=5)
selector.fit(X_train_scaled, y_train)

# support_ is a boolean mask over the input columns, in X_train's column order.
selected_features = X_train.columns[selector.support_]
print("Top features by RFE:", list(selected_features))


In [None]:
#Dealing with class imbalance (if present)
# Constructing LogisticRegression(class_weight='balanced') on its own has no
# effect: the estimator must be fitted and used. Build the same scaling +
# logistic-regression pipeline as the baseline, with balanced class weights,
# and report the scores that class weighting is expected to move.
from sklearn.metrics import f1_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe_lr_balanced = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])
pipe_lr_balanced.fit(X_train, y_train)
y_pred_balanced = pipe_lr_balanced.predict(X_test)

print("Balanced LR recall:", recall_score(y_test, y_pred_balanced))
print("Balanced LR F1:", f1_score(y_test, y_pred_balanced))


In [None]:
#Final evaluation & save model
import joblib
from sklearn.metrics import classification_report, roc_auc_score

best_model = gs.best_estimator_  # or any final pipeline
# Fit on the training split so the cell also works if an unfitted pipeline is
# substituted above.
best_model.fit(X_train, y_train)

# Final evaluation: score the chosen model once on the held-out test split
# before persisting it.
final_pred = best_model.predict(X_test)
final_prob = best_model.predict_proba(X_test)[:, 1]
print("Final test ROC-AUC:", roc_auc_score(y_test, final_prob))
print("\nFinal classification report:\n", classification_report(y_test, final_pred))

# Persist the fitted pipeline (scaler + classifier) as a single file.
joblib.dump(best_model, 'diabetes_model.pkl')


In [None]:
import joblib

# Write the model held in `best_model` to disk under the file name that the
# download step at the end of the notebook uses.
model_path = 'diabetes_model.pkl'
joblib.dump(best_model, model_path)


In [None]:
# Offer the saved model as a browser download when running on Google Colab.
# Outside Colab the google.colab package cannot be imported; the file written
# by joblib.dump is then already on the local disk, so just say where it is.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available - 'diabetes_model.pkl' is saved in the working folder.")
else:
    files.download('diabetes_model.pkl')
