In [11]:
# Core Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning - Preprocessing, Models, Metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# Deployment Preparation
import pickle # Used to save the model and scaler

# Notebook-wide presentation settings: plot style and warning suppression.
import warnings

# Apply seaborn's "whitegrid" style.
sns.set_style(style="whitegrid")

# Install a blanket "ignore" filter so warnings do not clutter cell output.
warnings.filterwarnings(action='ignore')

In [12]:
# Path to the dataset CSV, relative to the working directory
# (in Colab the file has to be uploaded to the session first).
file_path = 'diabetes.csv'

# Load the dataset. When the file is missing, print a readable hint and then
# re-raise: every statement below needs `df`, so carrying on would either fail
# with a confusing NameError or silently reuse a stale `df` left in the kernel.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please upload the file or correct the path.")
    raise
else:
    print("Dataset loaded successfully!")

# Display the first few rows (Data Understanding - initial look)
print("\n--- First 5 Rows of the Dataset ---")
print(df.head())

# Print column names, non-null counts and dtypes (Data Understanding - structure)
print("\n--- Dataset Info (Shape and Data Types) ---")
df.info()

# Print the shape of the dataset
print(f"\nDataset Shape: {df.shape[0]} rows, {df.shape[1]} columns")

Dataset loaded successfully!

--- First 5 Rows of the Dataset ---
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

--- Dataset Info (Shape and Data Types) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --