In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV from the Kaggle input mount into `df`.
DATASET_CSV = '/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv'
df = pd.read_csv(DATASET_CSV)

In [None]:
# Show the (rows, columns) shape of the freshly loaded frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Remove the risk-score and stage columns before modelling (presumably because
# they are derived from the diagnosis and would leak the target — confirm
# against the dataset description).
# Reassigning with errors='ignore' instead of dropping in place keeps the cell
# idempotent: re-running it no longer raises KeyError once the columns are gone.
df = df.drop(columns=['diabetes_risk_score', 'diabetes_stage'], errors='ignore')

In [None]:
# Summary statistics of the numeric columns currently in `df`.
df.describe()

In [None]:
import pandas as pd

# Categorical columns that are expanded into one indicator column per category.
categorical_cols = [
    'gender',
    'ethnicity',
    'education_level',
    'income_level',
    'employment_status',
    'smoking_status',
]

# One-hot encode the listed columns; all other columns pass through unchanged.
df_encoded = df.pipe(pd.get_dummies, columns=categorical_cols)

# Preview the first 5 rows of the encoded frame
print(df_encoded.head())


In [None]:
import pandas as pd

def iqr_capping(df, columns):
    """Cap outliers in the given columns at the 1.5 * IQR fences.

    For each column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    Values outside the fences are replaced by the nearest fence; NaN values
    are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to cap. It is NOT modified.
    columns : iterable of str
        Names of the numeric columns to cap.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns capped.

    Notes
    -----
    A column whose quartiles coincide (IQR == 0, e.g. a 0/1 flag with fewer
    than 25% ones) has both fences at the same value, so every entry is
    collapsed to that value. Pass only genuinely continuous columns.
    """
    # Work on a copy: the previous in-place version silently altered the
    # caller's frame, so the "raw" and "capped" frames were the same object.
    capped = df.copy()
    for col in columns:
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        iqr = q3 - q1
        # Vectorised clip replaces the per-element apply(lambda ...) call.
        capped[col] = capped[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return capped

# Cap only genuinely continuous numeric features. Two-valued (0/1) columns and
# the `diagnosed_diabetes` target are left out: for a flag with fewer than 25%
# ones both quartiles are 0, so the IQR fences would both be 0 and capping
# would overwrite every 1 with 0.
numerical_cols = [
    col
    for col in df_encoded.select_dtypes(include=['int64', 'float64']).columns
    if col != 'diagnosed_diabetes' and df_encoded[col].nunique() > 2
]

df_capped = iqr_capping(df_encoded, numerical_cols)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target and features
y = df_capped['diagnosed_diabetes']
X = df_capped.drop(columns=['diagnosed_diabetes'])

# Split BEFORE resampling so the held-out set keeps the original class ratio.
# Undersampling the whole frame first would evaluate the models on an
# artificially balanced test set and misstate precision/recall.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_test = X_test.copy()  # independent frame, so the column assignment below is safe

# Undersample the TRAINING rows only: every class is cut down to the size of
# the smallest training class.
train_df = X_train_full.assign(diagnosed_diabetes=y_train_full)
min_count = train_df['diagnosed_diabetes'].value_counts().min()
balanced_df = train_df.groupby('diagnosed_diabetes').sample(n=min_count, random_state=42)

# Separate features and target again
X_train = balanced_df.drop(columns=['diagnosed_diabetes'])
y_train = balanced_df['diagnosed_diabetes']

# Scale numerical columns; the scaler is fit on the training rows only
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X_train[num_cols] = pd.DataFrame(
    scaler.fit_transform(X_train[num_cols]), columns=num_cols, index=X_train.index
)
X_test[num_cols] = pd.DataFrame(
    scaler.transform(X_test[num_cols]), columns=num_cols, index=X_test.index
)

# Class distribution: balanced for training, original ratio for testing
print(y_train.value_counts())
print(y_test.value_counts())


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Random forest: 100 trees, each grown on an 80% row sample and considering
# 70% of the features per split.
rf_candidate = RandomForestClassifier(
    n_estimators=100, random_state=42, bootstrap=True, max_samples=0.8, max_features=0.7
)

# Logistic regression with a raised iteration cap.
logreg_candidate = LogisticRegression(max_iter=1000, random_state=42)

# Display name -> unfitted estimator
models = {
    'Random Forest': rf_candidate,
    'Logistic Regression': logreg_candidate,
}


In [None]:
# Fit each candidate on the training split and report its test-set metrics.
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f"---{name}---")
    for label, metric in (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    ):
        print(label, metric(y_test, y_pred))
    print("\n")

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Work on a copy so `df` itself keeps its original categorical values.
df_encoded1 = df.copy()

categorical_cols = [
    'gender', 'ethnicity', 'education_level',
    'income_level', 'employment_status',
    'smoking_status'
]

# One fitted encoder per column, keyed by column name. Re-fitting a single
# shared encoder discards every mapping except the last one, so the integer
# codes could not be decoded or re-applied to new rows at prediction time.
label_encoders = {}

for col in categorical_cols:
    if col in df_encoded1.columns:  # check if column exists
        le = LabelEncoder()
        df_encoded1[col] = le.fit_transform(df_encoded1[col])
        label_encoders[col] = le


In [None]:
# Preview the label-encoded frame.
df_encoded1.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Every int64/float64 column of the label-encoded frame
num_cols = df_encoded1.select_dtypes(include=['int64', 'float64']).columns

# One histogram (30 bins, with a KDE overlay) per numeric column
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df_encoded1[col], kde=True, bins=30, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:
# Column dtypes and non-null counts after label encoding.
df_encoded1.info()

In [None]:
# Number of missing values in each column.
df_encoded1.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations, cycle

# Numeric features to compare pairwise
num_cols = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
    'bmi',
    'waist_to_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides',
    'glucose_fasting',
    'glucose_postprandial',
    'insulin_level',
    'hba1c',
]

# Fixed random subset of 1000 rows to keep plotting fast
sample_data = df_encoded1[num_cols].sample(1000, random_state=42)

# One scatter plot per unordered pair of columns (22 columns -> 231 figures)
for col_x, col_y in combinations(num_cols, 2):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=sample_data, x=col_x, y=col_y, alpha=0.6, ax=ax)
    ax.set_xlabel(col_x)
    ax.set_ylabel(col_y)
    ax.set_title(f'Scatter plot: {col_x} vs {col_y}')
    plt.show()


In [None]:
# One horizontal boxplot per column in `num_cols`, drawn from the uncapped data.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df_encoded1[col], color='lightgreen', ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()


In [None]:
# Print the distinct values of every column, one column per line.
for column_name, column_values in df_encoded1.items():
    print(column_name, '---', column_values.unique())



In [None]:

def cap_outliers_iqr(df, cols):
    """Return a copy of ``df`` with the given columns clipped to their IQR fences.

    For each name in ``cols`` that exists in the frame, values below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are replaced by the
    corresponding fence. Names not present in the frame are skipped, and
    the input frame is left unmodified.
    """
    result = df.copy()

    for name in cols:
        if name not in result.columns:
            continue  # silently ignore columns the frame does not have

        first_quartile = result[name].quantile(0.25)
        third_quartile = result[name].quantile(0.75)
        spread = third_quartile - first_quartile

        result[name] = result[name].clip(
            lower=first_quartile - 1.5 * spread,
            upper=third_quartile + 1.5 * spread,
        )

    return result


In [None]:
# Columns to cap
num_cols = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'hba1c',
    'insulin_level',
    'glucose_postprandial',
    'glucose_fasting',
    'triglycerides',
    'cholesterol_total',
    'ldl_cholesterol',
    'hdl_cholesterol',
    'heart_rate',
    'diastolic_bp',
    'systolic_bp',
    'waist_to_hip_ratio',
    'bmi',
]
# Capped copy of the label-encoded frame (rebinds `df_capped`)
df_capped = cap_outliers_iqr(df_encoded1, cols=num_cols)

import matplotlib.pyplot as plt
import seaborn as sns

# Redraw the boxplots from the capped frame to check the effect of capping.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df_capped[col], color='lightgreen', ax=ax)
    ax.set_title(f'Boxplot of {col} after capping')
    plt.show()


In [None]:
# Column dtypes and non-null counts of the capped frame.
df_capped.info()

In [None]:
# Summary statistics of the capped frame (min/max show the applied caps).
df_capped.describe()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:

# Features and target for the second modelling pass.
# NOTE(review): this uses the uncapped `df_encoded1`, not the outlier-capped
# `df_capped` — confirm that skipping the capping here is intended.
X = df_encoded1.copy()
y = X.pop('diagnosed_diabetes')  # removes the target from X and returns it


In [None]:

# 80/20 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)



In [None]:

# Class-weighted random forest on the unscaled features.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)  # fit() returns rf_model


In [None]:

# Class-weighted logistic regression on the unscaled features.
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
y_pred_lr = lr_model.fit(X_train, y_train).predict(X_test)  # fit() returns lr_model

In [None]:

# Test-set metrics for the unscaled random forest.
print("Random Forest WITHOUT Standardization")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred_rf))



In [None]:

# Test-set metrics for the unscaled logistic regression.
print("\nLogistic Regression WITHOUT Standardization")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred_lr))

In [None]:
from sklearn.preprocessing import StandardScaler

def get_continuous_columns(df):
    """List the int64/float64 columns of ``df`` that hold more than two distinct values.

    Columns with one or two distinct values (constants, 0/1 flags) are
    excluded. The result keeps the frame's column order.
    """
    numeric_part = df.select_dtypes(include=['int64', 'float64'])
    distinct_counts = numeric_part.nunique()
    return [name for name, n_distinct in distinct_counts.items() if n_distinct > 2]



In [None]:

# Scaler plus copies of both splits, so the unscaled frames stay available.
scaler = StandardScaler()
continuous_cols = get_continuous_columns(X_train)
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()


In [None]:

# Fit the scaler on the training rows only, then apply it to both splits.
train_continuous = X_train[continuous_cols]
X_train_scaled[continuous_cols] = scaler.fit(train_continuous).transform(train_continuous)
X_test_scaled[continuous_cols] = scaler.transform(X_test[continuous_cols])

In [None]:


# Same random forest configuration, trained on the standardized features.
rf_model_scaled = RandomForestClassifier(class_weight='balanced', random_state=42)
y_pred_rf_scaled = rf_model_scaled.fit(X_train_scaled, y_train).predict(X_test_scaled)



In [None]:

# Same logistic regression configuration, trained on the standardized features.
lr_model_scaled = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
y_pred_lr_scaled = lr_model_scaled.fit(X_train_scaled, y_train).predict(X_test_scaled)


In [None]:

# Test-set metrics for the random forest trained on standardized features.
print("Random Forest WITH Standardization")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred_rf_scaled))


In [None]:

# Test-set metrics for the logistic regression trained on standardized features.
print("\nLogistic Regression WITH Standardization")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred_lr_scaled))

In [None]:

# Small forest (10 trees) with explicit row and column subsampling.
bootstrap_settings = {
    'n_estimators': 10,
    'random_state': 4,
    'bootstrap': True,           # sample rows for each tree
    'max_samples': 0.8,          # fraction of rows per tree
    'max_features': 0.7,         # fraction of columns considered per split
    'class_weight': 'balanced',
}
rf_model_bootstrap = RandomForestClassifier(**bootstrap_settings)


In [None]:

# Train on the standardized training split and predict the standardized test split.
y_pred_rf_bootstrap = rf_model_bootstrap.fit(X_train_scaled, y_train).predict(X_test_scaled)


In [None]:

# Test-set metrics for the subsampled random forest.
print("Random Forest WITH Bootstrap & Column Sampling")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred_rf_bootstrap))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator handed to the grid search.
# Rebinding `rf_model` replaces the forest fitted on the unscaled split earlier
# in the notebook; that fitted model is no longer reachable by this name.
rf_base_settings = dict(
    random_state=4,
    bootstrap=True,
    max_samples=0.8,
    max_features=0.7,
    class_weight='balanced',
)
rf_model = RandomForestClassifier(**rf_base_settings)


In [97]:
# Hyper-parameter grid: 2 * 3 * 3 * 2 = 36 combinations.
param_grid = dict(
    n_estimators=[10, 15],          # number of trees
    max_depth=[5, 10, 15],          # depth of trees
    min_samples_split=[2, 3, 4],    # minimum samples to split a node
    min_samples_leaf=[1, 2],        # minimum samples per leaf
)


In [98]:

# 5-fold grid search over `param_grid`, scored by F1, using all CPU cores.
search_options = dict(cv=5, n_jobs=-1, scoring='f1', verbose=2)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, **search_options)


In [99]:

# Fit Grid Search on training data (36 candidates x 5 folds = 180 fits).
# NOTE(review): this uses the unscaled `X_train`, not `X_train_scaled` —
# confirm that is the intended input for the tuned model.
grid_search.fit(X_train,y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   2.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   2.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   2.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   2.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   2.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=15; total time=   3.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=15; total time=   3.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=15; total time=   3.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=15; total time=   3.6s
[CV] END max_depth=5, min_samples_leaf=1, min_s

0,1,2
,estimator,RandomForestC...andom_state=4)
,param_grid,"{'max_depth': [5, 10, ...], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 3, ...], 'n_estimators': [10, 15]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.7
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [100]:


# Keep the winning estimator and show the hyper-parameters that produced it.
best_rf_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}


In [101]:
# Train the selected estimator on the full training split.
# NOTE(review): the search was created with refit=True (see its displayed
# parameters), so best_estimator_ should already be fitted on this same data;
# this call looks redundant — confirm before removing.
best_rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.7
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [103]:
# Align the test columns with the training columns (any missing column is
# filled with 0), then predict with the tuned forest.
X_test = X_test.reindex(X_train.columns, axis='columns', fill_value=0)
y_pred = best_rf_model.predict(X_test)


In [104]:

# Test-set metrics for the tuned random forest.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

Accuracy: 0.92
Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91      8000
           1       1.00      0.87      0.93     12000

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000

Confusion Matrix:
 [[ 8000     0]
 [ 1600 10400]]


In [105]:
import pickle

# Serialize the tuned forest to the working directory.
# Only the estimator is written: whatever encoded the categorical columns is
# not part of this file, so a consumer must reproduce that encoding itself.
# Unpickle this file only from a trusted source (pickle can execute code on load).
with open('best_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)


