In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, matthews_corrcoef

from sklearn.preprocessing import scale, StandardScaler

## Upload

In [2]:
# Load the dataset from diabetes.csv (path relative to the working directory) into a DataFrame.
diabetes = pd.read_csv('diabetes.csv',encoding='utf-8')

## Clean

### No nulls, but 0 entries could throw the model off. Should those 0s be treated as null when they are not in Pregnancies or Outcome?

In [3]:
# Summarize column dtypes and non-null counts to check for explicit missing values.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Element-wise missing-value mask (True where a value is NaN).
# NOTE(review): this displays the whole boolean frame; diabetes.isna().sum()
# would give a compact per-column count instead.
diabetes.isna()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False


In [5]:
# Zeros in these measurement columns are treated as missing values and turned
# into NaN. Pregnancies, Age and Outcome are left untouched.
# TODO: double check whether each of these columns can legitimately be zero.
zero_as_missing_cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
]

# Assign the result back rather than calling replace(..., inplace=True) on a
# column selection: that form does not reliably modify the parent frame
# (e.g. under pandas copy-on-write). np.nan is used because the np.NaN alias
# was removed in NumPy 2.0.
diabetes[zero_as_missing_cols] = diabetes[zero_as_missing_cols].replace(0, np.nan)

In [25]:
# Pairwise correlation between the columns of the frame.
# NOTE(review): this cell's execution count (25) is out of sequence, and its saved
# output lists only the columns that remain after BloodPressure, Insulin and
# SkinThickness are dropped further down, so it was run after that later cell.
# A fresh Restart & Run All would evaluate it here instead, before any rows or
# columns are dropped. Consider moving it below the column drop.
diabetes.corr()

Unnamed: 0,Pregnancies,Glucose,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.198291,-0.025347,0.007562,0.679608,0.256566
Glucose,0.198291,1.0,0.209516,0.14018,0.343641,0.515703
BMI,-0.025347,0.209516,1.0,0.158771,0.069814,0.270118
DiabetesPedigreeFunction,0.007562,0.14018,0.158771,1.0,0.085029,0.20933
Age,0.679608,0.343641,0.069814,0.085029,1.0,0.350804
Outcome,0.256566,0.515703,0.270118,0.20933,0.350804,1.0


In [6]:
# (rows, columns) of the frame before rows with missing values are dropped.
diabetes.shape

(768, 9)

In [7]:
# Remove, in place, every row that has at least one missing value.
diabetes.dropna(axis='index', how='any', inplace=True)

In [8]:
# Count the missing values of every column once, then report them column by column.
missing_counts = diabetes.isnull().sum()
for col, n_missing in missing_counts.items():
    print(f'{col} missing values: {n_missing}')

Pregnancies missing values: 0
Glucose missing values: 0
BloodPressure missing values: 0
SkinThickness missing values: 0
Insulin missing values: 0
BMI missing values: 0
DiabetesPedigreeFunction missing values: 0
Age missing values: 0
Outcome missing values: 0


In [9]:
# impute the mean for Nans in columns with Nans
# continuous_columns = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
# continuous_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# continuous_imputed_df = pd.DataFrame(continuous_imputer.fit_transform(diabetes[continuous_columns]), columns=continuous_columns)

In [10]:
# for col in continuous_columns:
#     diabetes[col] = continuous_imputed_df[col]

In [11]:
# Preview the first rows that remain after dropping rows with missing values.
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1


In [12]:
# Discard, in place, the features that are not passed on to the model,
# then preview what is left.
unused_features = ['BloodPressure', 'Insulin', 'SkinThickness']
diabetes.drop(columns=unused_features, inplace=True)
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89.0,28.1,0.167,21,0
4,0,137.0,43.1,2.288,33,1
6,3,78.0,31.0,0.248,26,1
8,2,197.0,30.5,0.158,53,1
13,1,189.0,30.1,0.398,59,1


## Split

In [13]:
# Separate the feature matrix (every column except the label) from the target label.
X, y = diabetes.drop(columns='Outcome'), diabetes['Outcome']

In [14]:
# scaler = StandardScaler() #doesn't return pandas anymore
# scaled = scaler.fit_transform(diabetes)
# print(scaled)

In [15]:
# X , y = scale(diabetes.drop(columns=["Outcome"])), diabetes["Outcome"]

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=0,
)

In [17]:
#scaler = scale(X_train) #doesn't return pandas anymore
# xscaled = scaler.fit(X_train) # figure out how to scale data
# xtrainscaled = xscaled.transform(X_train) # apply it to the data
# xtestscaled = xscaled.transform(X_test)
# #don't usually have to do it for y, b/c that's the goal
# print(xtrainscaled)


In [18]:
# Standardize the features using statistics learned from the training split only,
# so that nothing about the test split influences the scaler.
scaler = StandardScaler()

# fit followed by transform on the training data
# (same result as scaler.fit_transform(X_train)).
scaleModel = scaler.fit(X_train)
X_train_scaled = scaleModel.transform(X_train)

# Reuse the already fitted scaler for the test data; it is not refitted on X_test.
X_test_scaled = scaler.transform(X_test)

## Training

In [19]:
# Fit a logistic regression with default settings on the standardized training data.
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)

# Inspect the fitted model.
print(lr.coef_)       # slopes
print(lr.intercept_)  # intercepts
print(lr.n_iter_)     # number of solver iterations

[[0.06100285 1.13957396 0.4509802  0.34776172 0.44631099]]
[-0.93806354]
[10]


In [20]:
# Score the fitted model on the (scaled) training split.
score = lr.score(X_train_scaled,y_train)
print(score)

0.7795527156549521


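I think 2 performs best as a `Cs` hyperparameter, with a score of around 0.76. (`Cs` is a `LogisticRegressionCV` parameter; the model fitted above is a plain `LogisticRegression` with default settings.)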

## Testing

In [21]:
# Predict class labels for the (scaled) held-out test split.
predictions = lr.predict(X_test_scaled)
# print(predictions)
# print(y_test)

In [22]:
# Score the fitted model on the (scaled) test split; this overwrites the
# training score stored earlier under the same name.
score = lr.score(X_test_scaled,y_test)
print(score)

0.7974683544303798


In [23]:
# Confusion matrix of true labels (first argument) against predicted labels;
# in scikit-learn's layout, rows are true classes and columns are predicted classes.
confusion_matrix(y_test, predictions)

array([[50,  4],
       [12, 13]])

In [24]:
# Matthews correlation coefficient between true and predicted labels:
# +1 is a perfect prediction, 0 is no better than chance, and -1 is completely wrong.
matthews_corrcoef(y_test, predictions)

0.5046721063439498