# Diabetes Prediction

## 1. Importing Libraries

In [None]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pickle

## 2. Loading and Exploring Data

In [None]:
# Read the CSV from its fixed location into a DataFrame and preview the
# first five rows.
dataset_path = "/content/diabetes_prediction_dataset.csv"
df = pd.read_csv(dataset_path)
print("First 5 rows of the dataset:", df.head(), sep="\n")

In [None]:
# Summarise the frame's columns, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() would append a stray "None"
# line after the report.
print("\nInformation about the dataset:")
df.info()

In [None]:
# Count the null entries in each column and show the per-column totals.
missing_per_column = df.isnull().sum()
print("\nChecking for missing values:")
print(missing_per_column)

## 3. Data Preprocessing

In [None]:
# One-hot encode the two categorical columns. With drop_first=True the first
# level of each column gets no dummy column of its own (k-1 dummies for k levels).
categorical_columns = ["gender", "smoking_history"]
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
print("\nDataset after one-hot encoding:", df_encoded.head(), sep="\n")

In [None]:
# Pairwise correlations between the columns of the encoded frame, drawn as an
# annotated heatmap with two-decimal cell labels.
correlation_matrix = df_encoded.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap="Purples", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

## 4. Feature Selection and Data Splitting

In [None]:
# The eight columns used as model inputs, listed one per line.
# The order of this list is the column order of the feature matrix.
selected_features = [
    "gender_Male",
    "age",
    "hypertension",
    "heart_disease",
    "smoking_history_never",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
]

In [None]:
# Guarantee that every selected feature exists as a column of the encoded frame.
# A dummy column can be absent when its category never occurs in the data or
# was the level removed by drop_first=True. Absent columns are still created
# with a constant 0 so the feature matrix keeps its expected layout, but each
# one is reported: a constant column carries no signal, and a missing name may
# equally be a typo or a sign that the dataset changed.
missing_features = [col for col in selected_features if col not in df_encoded.columns]
for col in missing_features:
    df_encoded[col] = 0
if missing_features:
    print(f"Warning: selected features missing from the encoded data, filled with 0: {missing_features}")

In [None]:
# Target vector: the "diabetes" column of the encoded frame.
target_column = "diabetes"
y = df_encoded[target_column]

# Feature matrix: only the selected columns, in the order they are listed.
X = df_encoded[selected_features]

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the proportion of each target class the same in both
# partitions; a purely random split can leave a rare class under- or
# over-represented in one of them. random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Report the dimensions of the four partitions, one per line.
print(
    f"\nShape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)

## 5. Model Training

In [None]:
# Build and fit the Random Forest classifier on the training split.
# fit() returns the estimator itself, so construction and training are chained;
# random_state=42 seeds the forest's random number generation.
random_forest_model_selected = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print("✅ Random Forest model trained with the selected 8 features.")

In [None]:
# Sanity check: how many input columns the fitted model recorded during fit().
n_model_features = random_forest_model_selected.n_features_in_
print(f"Number of features used by the model: {n_model_features}")

## 6. Saving the Trained Model and Feature List

In [None]:
# Output files: one pickle for the classifier, one for the feature-name list.
model_filename = 'diabetes_pred.pkl'
features_filename = 'diabetes_features.pkl'

# Persist the trained Random Forest model.
with open(model_filename, 'wb') as model_file:
    pickle.dump(random_forest_model_selected, model_file)

print(f"\nRandom Forest model with 8 features saved as {model_filename}")

# Persist the feature names in their training order next to the model
# (presumably so that code loading the model can rebuild matching inputs).
with open(features_filename, 'wb') as features_file:
    pickle.dump(selected_features, features_file)

print(f"List of features used for training saved as {features_filename}")