Importing the Dependencies

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# Load the diabetes dataset
diabetes_dataset = pd.read_csv('C:\\Users\\joelp\\OneDrive\\Desktop\\diabetes_streamlit\\diabetes.csv')

In [3]:
# Display the first 5 rows of the dataset
print(diabetes_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Get the number of rows and columns in the dataset
print("Shape of the dataset:", diabetes_dataset.shape)

Shape of the dataset: (768, 9)


In [5]:
# Display statistical measures of the data
print("Statistical measures of the data:")
print(diabetes_dataset.describe())

Statistical measures of the data:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
m

In [6]:
# Display the count of outcomes
print("Outcome counts:")
print(diabetes_dataset['Outcome'].value_counts())

Outcome counts:
0    500
1    268
Name: Outcome, dtype: int64


0 --> Non-Diabetic

1 --> Diabetic

In [7]:
# Display mean values by outcome
print("Mean values grouped by Outcome:")
print(diabetes_dataset.groupby('Outcome').mean())

Mean values grouped by Outcome:
         Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
Outcome                                                                      
0           3.298000  109.980000      68.184000      19.664000   68.792000   
1           4.865672  141.257463      70.824627      22.164179  100.335821   

               BMI  DiabetesPedigreeFunction        Age  
Outcome                                                  
0        30.304200                  0.429734  31.190000  
1        35.142537                  0.550500  37.067164  


In [8]:
# Separate data and labels
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

In [9]:
# Display input features and labels
print("Input features:")
print(X.head())
print("\nLabels:")
print(Y.head())

Input features:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  

Labels:
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [10]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [11]:
# Initialize the linear regression model
regressor = LinearRegression()

In [12]:
# Train the linear regression model
regressor.fit(X_train, Y_train)


In [13]:
# Evaluate the model on the training data
training_data_accuracy = regressor.score(X_train, Y_train)
print('Accuracy score on the training data:', training_data_accuracy)

Accuracy score on the training data: 0.30506972801106247


In [14]:
# Evaluate the model on the test data
test_data_accuracy = regressor.score(X_test, Y_test)
print('Accuracy score on the test data:', test_data_accuracy)

Accuracy score on the test data: 0.25500281176741757


In [16]:
# Save the trained linear regression model in pickle format
filename = 'trained_model_linear_regression.pkl'
with open(filename, 'wb') as file:
    pickle.dump(regressor, file)


In [18]:
# Load the saved model
with open('trained_model_linear_regression.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


In [19]:
# Make predictions on new data
input_data = np.array([5, 166, 72, 19, 175, 25.8, 0.587, 51]).reshape(1, -1)
prediction = loaded_model.predict(input_data)




In [20]:
if prediction[0] <= 0.5:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

The person is diabetic
