In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import joblib

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
146,9,57,80,37,0,32.8,0.096,41,0
300,0,167,0,0,0,32.3,0.839,30,1
361,5,158,70,0,0,29.8,0.207,63,0
117,5,78,48,0,0,33.7,0.654,25,0
41,7,133,84,0,0,40.2,0.696,37,0


In [4]:
df.info

<bound method DataFrame.info of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   5

In [5]:
df.count()

Pregnancies                 768
Glucose                     768
BloodPressure               768
SkinThickness               768
Insulin                     768
BMI                         768
DiabetesPedigreeFunction    768
Age                         768
Outcome                     768
dtype: int64

In [6]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
x = df.drop("Outcome", axis = 1)
y = df["Outcome"]

In [8]:
x.sample()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
344,8,95,72,0,0,36.8,0.485,57


In [9]:
y.sample()

353    0
Name: Outcome, dtype: int64

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.3, random_state = None)

In [11]:
scaler = StandardScaler()

In [12]:
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
model = LogisticRegression()

In [14]:
model.fit(x_train_scaled, y_train)

In [15]:
y_pred = model.predict(x_test_scaled)

In [16]:
accuracy = accuracy_score(y_pred, y_test)

In [17]:
print(f"accuracy:{accuracy:.2f}")

accuracy:0.71


In [None]:
joblib.dump(model, ".pkl")

<h1>Training Without Scaling</h1>

In [18]:
model = LogisticRegression(max_iter=2500)
model.fit(x_train, y_train)

In [19]:
y_pred = model.predict(x_test)

In [20]:
accuracy = accuracy_score(y_pred, y_test)

In [21]:
print(f"accuracy:{accuracy:.2f}")

accuracy:0.72
