In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [11]:
# Mount Google Drive so files stored there can be read from this notebook
from google.colab import drive

drive_root = '/content/drive'
drive.mount(drive_root)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Read the diabetes CSV from the mounted Drive folder and preview its first rows
csv_path = "/content/drive/MyDrive/diabetes.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:
# Treat zeros in these measurement columns as missing values (NaN)
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_means_missing] = df[zero_means_missing].replace(0, np.nan)

In [14]:
# Count the missing values in each column
missing_per_column = df.isnull().sum()
missing_per_column

Unnamed: 0,0
Pregnancies,0
Glucose,5
BloodPressure,35
SkinThickness,227
Insulin,374
BMI,11
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [15]:
# Impute each column's missing values with that column's mean.
# Reassigning instead of using inplace=True keeps the step chainable, and
# numeric_only=True keeps the mean from failing if a non-numeric column is present.
# NOTE(review): the means are computed on the full dataset before the train/test
# split, so test rows influence the imputed values — consider imputing after splitting.
df = df.fillna(df.mean(numeric_only=True))

In [16]:
# Confirm that no missing values remain after imputation
remaining_missing = df.isnull().sum()
remaining_missing

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [17]:
# Build the regression feature matrix and target (BloodPressure).
# "Unnamed: 0" appears to be the row number written out with the CSV (0, 1, 2, ...
# in the preview), so it carries no signal and is excluded from the features;
# errors="ignore" keeps the cell working if that column is absent.
# NOTE(review): missing BloodPressure values were mean-imputed earlier, so the
# target contains filled-in values — consider dropping those rows instead.
X_r = df.drop(columns=["BloodPressure", "Unnamed: 0"], errors="ignore")
y_r = df["BloodPressure"]

In [19]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train_r, X_test_r, y_train_reg, y_test_r = train_test_split(
    X_r,
    y_r,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Standardise the regression features: learn the statistics from the training
# rows only, then apply that same transformation to the train and test rows
scale = StandardScaler()
scale.fit(X_train_r)
X_train_reg_scale = scale.transform(X_train_r)
X_test_reg_scale = scale.transform(X_test_r)

In [21]:
# Fit a scikit-learn linear regression on the scaled training features
reg_model = LinearRegression().fit(X_train_reg_scale, y_train_reg)
reg_model

In [22]:
# Predict BloodPressure for the held-out test rows
y_pred_r = reg_model.predict(
    X_test_reg_scale
)

In [23]:
# Report the mean squared error of the predictions on the test set
mse_r = mean_squared_error(y_test_r, y_pred_r)
print("Mean Squared Error (MSE):", mse_r)

Mean Squared Error (MSE): 112.7205554795177


In [25]:
# Report the R-squared score of the predictions on the test set
r2_r = r2_score(y_test_r, y_pred_r)
print("R-squared:", r2_r)

R-squared: 0.2254825777184596


In [29]:
# Build the classification feature matrix and target (Outcome).
# "Unnamed: 0" appears to be the row number written out with the CSV (0, 1, 2, ...
# in the preview), so it carries no signal and is excluded from the features;
# errors="ignore" keeps the cell working if that column is absent.
X_cl = df.drop(columns=["Outcome", "Unnamed: 0"], errors="ignore")
y_cl = df["Outcome"]

In [30]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train_cl, X_test_cl, y_train_cl, y_test_cl = train_test_split(
    X_cl,
    y_cl,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Standardise the classification features with a dedicated scaler instance.
# Refitting the shared `scale` object here would overwrite the statistics it
# learned from the regression features, so any later use of `scale` on
# regression inputs would silently apply the wrong transformation.
scale_cl = StandardScaler()
X_train_clf_scale = scale_cl.fit_transform(X_train_cl)
X_test_clf_scale = scale_cl.transform(X_test_cl)

In [34]:
# Fit a scikit-learn logistic regression (up to 1000 solver iterations)
# on the scaled training features
clf_model = LogisticRegression(max_iter=1000).fit(X_train_clf_scale, y_train_cl)
clf_model

In [36]:
# Predict the Outcome class for the held-out test rows
y_pred_cl = clf_model.predict(
    X_test_clf_scale
)

In [40]:
# Report the fraction of test rows classified correctly
acc_cl = accuracy_score(y_test_cl, y_pred_cl)
print("Accuracy:", acc_cl)

Accuracy: 0.7532467532467533


In [42]:
# Print the confusion matrix (rows: true class, columns: predicted class)
cm_cl = confusion_matrix(y_test_cl, y_pred_cl)
print(cm_cl)

[[82 17]
 [21 34]]


In [37]:
# Print the classification report for the test-set predictions
report_cl = classification_report(y_test_cl, y_pred_cl)
print(report_cl)

              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154

