# Importing Necessary Libraries

In [2]:
## Importing necessary library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# **1. Data Preprocessing**
Loading and Performing Initial Data Exploration

In [5]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/diabetes (1).csv')
print(df.head())
print(df.describe())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  12

Handling Missing Data

In [6]:
df.dropna(inplace=True)
missing_cols = df.columns[df.isnull().any()].tolist()
# Display the columns with missing data
print("Columns with missing data:", missing_cols)

Columns with missing data: []


In [7]:
#   There is no missing column but still if there was missing columns we can use Simple Imputer
#   from sklearn.impute import SimpleImputer
#   imputer = SimpleImputer(strategy='mean')
#   data[missing_cols] = imputer.fit_transform(data[missing_cols])

Handling Outliers using z-score

In [9]:
from scipy import stats
z_scores = np.abs(stats.zscore(df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]))
df = df[(z_scores < 3).all(axis=1)]

Preparing the data for machine learning by encoding categorical variables

In [10]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']
print(X)
print(y)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
5              5      116             74              0        0  25.6   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


Splitting it into training and testing sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **2. Feature Scaling**

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model Building (Trying Different Model) and Model Optimization

In [14]:
## Logistic regressor Classifier
from sklearn.linear_model import LogisticRegression

# Train the model
lg_model = LogisticRegression()
lg_model.fit(X_train_scaled, y_train)

# Evaluate the model's performance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = lg_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))
print("ROC AUC: {:.2f}".format(roc_auc))

Accuracy: 0.78
Precision: 0.87
Recall: 0.49
F1 Score: 0.63
ROC AUC: 0.72


In [17]:
# Fine-tune the model parameters (hyperparameter tuning)
# Example: GridSearchCV for logistic regression
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)


from sklearn.metrics import accuracy_score


y_pred = best_model.predict(X_test_scaled)


accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.77


In [25]:
## Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train_scaled, y_train)

# Evaluate the model's accuracy
y_pred_rf = random_forest_model.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_rf

0.7101449275362319

In [26]:
from sklearn.svm import SVC

# Create and train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_scaled, y_train)

# Evaluate the model's accuracy
y_pred_svm = svm_model.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(accuracy_svm)

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100],            # Regularization parameter
    'kernel': ['linear', 'rbf'],      # Kernel type ('linear' or 'rbf')
    'gamma': ['scale', 'auto', 0.1, 1]  # Kernel coefficient for 'rbf' kernel
}
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy')
# Fit the model to the data
grid_search.fit(X_train_scaled, y_train)

from sklearn.metrics import accuracy_score


y_pred = best_model.predict(X_test_scaled)


accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: {:.2f}".format(accuracy))

0.7681159420289855
Accuracy: 0.77


In [27]:
import joblib

# Save the trained SVC model to a file
joblib.dump(lg_model, 'svc_model.pkl')

['svc_model.pkl']