### Import packages

syntax:
- `from ... import ...`
- `import ... (as ...)`

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

### Load Dataset

syntax:
- `df = pd.read_csv("filename or path to file")`

In [2]:
# Read the CSV that sits next to the notebook into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./diabetes_dataset__2019.csv")

In [3]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Age,Gender,Family_Diabetes,highBP,PhysicallyActive,BMI,Smoking,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregancies,Pdiabetes,UriationFreq,Diabetic
0,50-59,Male,no,yes,one hr or more,39.0,no,no,8.0,6.0,no,occasionally,sometimes,high,0.0,0,not much,no
1,50-59,Male,no,yes,less than half an hr,28.0,no,no,8.0,6.0,yes,very often,sometimes,normal,0.0,0,not much,no
2,40-49,Male,no,no,one hr or more,24.0,no,no,6.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no
3,50-59,Male,no,no,one hr or more,23.0,no,no,8.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no
4,40-49,Male,no,no,less than half an hr,27.0,no,no,8.0,8.0,no,occasionally,sometimes,normal,0.0,0,not much,no


In [4]:
# Column dtypes and non-null counts; a non-null count below the number of
# entries means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               953 non-null    object 
 1   Gender            952 non-null    object 
 2   Family_Diabetes   952 non-null    object 
 3   highBP            952 non-null    object 
 4   PhysicallyActive  952 non-null    object 
 5   BMI               948 non-null    float64
 6   Smoking           952 non-null    object 
 7   Alcohol           952 non-null    object 
 8   Sleep             952 non-null    float64
 9   SoundSleep        952 non-null    float64
 10  RegularMedicine   952 non-null    object 
 11  JunkFood          952 non-null    object 
 12  Stress            952 non-null    object 
 13  BPLevel           952 non-null    object 
 14  Pregancies        910 non-null    float64
 15  Pdiabetes         951 non-null    object 
 16  UriationFreq      952 non-null    object 
 1

In [5]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(953, 18)

#### syntax of retrieving one column of data
- `df_name["column_name"]`

In [6]:
# Selecting a single column by its name returns a pandas Series.
df["Pdiabetes"]

0        0
1        0
2        0
3        0
4        0
      ... 
948      0
949      0
950      0
951      0
952    NaN
Name: Pdiabetes, Length: 953, dtype: object

### Data Cleaning

#### drop missing values (whole row/observation)
- `df = df_name.dropna()`

In [7]:
# Remove every row (observation) that has a missing value in any column.
# The arguments are the dropna() defaults, spelled out for readers.
df = df.dropna(axis=0, how="any")

In [8]:
# Re-check the shape to see how many rows were removed by dropna().
df.shape

(905, 18)

#### Example of using `pd.get_dummies()`

In [9]:
# Tiny one-column frame used to demonstrate pd.get_dummies() below.
sex = ['male', 'male', 'female', 'male', 'female']
s = pd.DataFrame({'sex': sex})

In [10]:
# Show the toy frame before it is encoded.
s

Unnamed: 0,sex
0,male
1,male
2,female
3,male
4,female


In [11]:
# Replace the 'sex' column with one indicator column per category
# (sex_female, sex_male), then display the encoded frame.
s = pd.get_dummies(data=s, columns=['sex'])
s

Unnamed: 0,sex_female,sex_male
0,0,1
1,0,1
2,1,0
3,0,1
4,1,0


#### Converting categorical variables into dummy variables

- `df_name = pd.get_dummies(df_name, columns = [column_names])`



In [12]:
# One-hot encode the 'Gender' and 'Smoking' columns: each is replaced by
# indicator columns named <column>_<category>. All other columns are unchanged.
df = pd.get_dummies(data=df, columns=['Gender', 'Smoking'])
df

Unnamed: 0,Age,Family_Diabetes,highBP,PhysicallyActive,BMI,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregancies,Pdiabetes,UriationFreq,Diabetic,Gender_Female,Gender_Male,Smoking_no,Smoking_yes
0,50-59,no,yes,one hr or more,39.0,no,8.0,6.0,no,occasionally,sometimes,high,0.0,0,not much,no,0,1,1,0
1,50-59,no,yes,less than half an hr,28.0,no,8.0,6.0,yes,very often,sometimes,normal,0.0,0,not much,no,0,1,1,0
2,40-49,no,no,one hr or more,24.0,no,6.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no,0,1,1,0
3,50-59,no,no,one hr or more,23.0,no,8.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no,0,1,1,0
4,40-49,no,no,less than half an hr,27.0,no,8.0,8.0,no,occasionally,sometimes,normal,0.0,0,not much,no,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947,less than 40,yes,no,more than half an hr,25.0,no,8.0,6.0,no,often,sometimes,normal,0.0,0,not much,yes,0,1,1,0
948,60 or older,yes,yes,more than half an hr,27.0,no,6.0,5.0,yes,occasionally,sometimes,high,0.0,0,quite often,yes,0,1,1,0
949,60 or older,no,yes,none,23.0,no,6.0,5.0,yes,occasionally,sometimes,high,0.0,0,not much,no,0,1,1,0
950,60 or older,no,yes,less than half an hr,27.0,yes,6.0,5.0,yes,occasionally,very often,high,0.0,0,not much,no,0,1,1,0


#### Converting target variable (yes/no) into 0s and 1s



In [13]:
# Encode the target column as integers: 'no' -> 0, 'yes' -> 1.
#
# The raw labels are not perfectly clean (the previous version of this cell had
# to special-case ' no' with a leading space), so normalise whitespace and case
# once instead of listing every variant by hand.
diabetic_labels = df['Diabetic'].astype(str).str.strip().str.lower()

# '0' and '1' are accepted as well so that re-running this cell after the
# column has already been encoded is a no-op rather than an error.
label_to_int = {'no': 0, 'yes': 1, '0': 0, '1': 1}

# Fail loudly on anything unexpected instead of silently leaving strings in
# the target column, which would only surface later as a model-fitting error.
unknown_labels = sorted(set(diabetic_labels) - set(label_to_int))
if unknown_labels:
    raise ValueError(f"Unexpected values in 'Diabetic': {unknown_labels}")

df['Diabetic'] = diabetic_labels.map(label_to_int).astype('int64')
df['Diabetic']

0      0
1      0
2      0
3      0
4      0
      ..
947    1
948    1
949    0
950    0
951    1
Name: Diabetic, Length: 905, dtype: int64

### Split data into training and testing

- `train, test = train_test_split(df_name, test_size = 0.2, shuffle = True)` (`test_size` must be passed by keyword; `0.2` puts 20% of the rows in the test set)



In [14]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the shuffle deterministic, so the split, and every
# score computed from it, is the same on each Restart & Run All.
RANDOM_STATE = 42
train, test = train_test_split(
    df, test_size = 0.2, shuffle = True, random_state = RANDOM_STATE
)

In [15]:
# Inspect the training portion of the split (row labels keep their original index values).
train

Unnamed: 0,Age,Family_Diabetes,highBP,PhysicallyActive,BMI,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregancies,Pdiabetes,UriationFreq,Diabetic,Gender_Female,Gender_Male,Smoking_no,Smoking_yes
478,40-49,no,no,one hr or more,24.0,no,6.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,0,0,1,1,0
937,40-49,yes,yes,more than half an hr,33.0,no,7.0,2.0,yes,occasionally,sometimes,high,0.0,0,quite often,1,1,0,1,0
102,less than 40,no,yes,less than half an hr,24.0,no,8.0,8.0,no,occasionally,very often,normal,0.0,0,not much,0,0,1,1,0
312,50-59,yes,yes,none,31.0,yes,7.0,6.0,yes,occasionally,always,high,0.0,0,not much,1,0,1,1,0
736,less than 40,yes,no,one hr or more,24.0,no,4.0,4.0,yes,often,always,normal,3.0,0,not much,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,less than 40,no,no,less than half an hr,29.0,yes,6.0,5.0,yes,often,very often,normal,0.0,0,not much,0,0,1,0,1
929,less than 40,yes,no,more than half an hr,29.0,no,6.0,3.0,no,occasionally,sometimes,normal,0.0,0,quite often,0,1,0,1,0
678,less than 40,no,no,one hr or more,25.0,no,6.0,6.0,no,always,always,normal,0.0,0,not much,0,0,1,1,0
666,less than 40,no,no,less than half an hr,24.0,yes,10.0,7.0,no,always,always,normal,0.0,0,not much,0,0,1,0,1


In [16]:
# Inspect the held-out test portion of the split.
test

Unnamed: 0,Age,Family_Diabetes,highBP,PhysicallyActive,BMI,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregancies,Pdiabetes,UriationFreq,Diabetic,Gender_Female,Gender_Male,Smoking_no,Smoking_yes
949,60 or older,no,yes,none,23.0,no,6.0,5.0,yes,occasionally,sometimes,high,0.0,0,not much,0,0,1,1,0
41,60 or older,no,no,none,24.0,no,6.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,0,0,1,1,0
360,less than 40,no,yes,more than half an hr,26.0,no,6.0,6.0,no,occasionally,sometimes,normal,0.0,0,quite often,0,0,1,1,0
797,less than 40,no,no,less than half an hr,28.0,yes,4.0,4.0,no,occasionally,very often,normal,0.0,0,not much,0,0,1,0,1
497,less than 40,yes,no,one hr or more,24.0,no,4.0,4.0,yes,often,always,normal,3.0,0,quite often,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,less than 40,no,no,more than half an hr,24.0,no,8.0,5.0,no,often,not at all,normal,0.0,0,not much,0,0,1,1,0
709,less than 40,yes,no,more than half an hr,22.0,no,8.0,6.0,no,often,sometimes,normal,0.0,0,not much,1,0,1,1,0
791,50-59,no,yes,none,20.0,no,7.0,4.0,yes,occasionally,sometimes,high,1.0,0,not much,1,1,0,1,0
600,50-59,no,yes,less than half an hr,33.0,yes,5.0,5.0,yes,occasionally,sometimes,high,0.0,0,not much,0,0,1,1,0


### Model fitting

#### example of using logistic regression

In [17]:
# Feature columns fed to the model; here BMI is the only predictor.
xcols = ['BMI']

# Separate features and target for each side of the split.
X_train, y_train = train[xcols], train['Diabetic']
X_test, y_test = test[xcols], test['Diabetic']

# Fit on the training rows (fit() returns the estimator itself), then report
# mean accuracy on the held-out test rows.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.7348066298342542

In [19]:
# Class balance of the target after cleaning.
#   count: rows labelled 0 (non-diabetic)
#   coun:  all remaining rows (every label that is not 0)
# count / len(df) is the accuracy of always predicting 0, which is the
# baseline the model score above should be compared against.
count = int((df['Diabetic'] == 0).sum())
coun = len(df) - count


In [20]:
# Number of rows whose 'Diabetic' label is 0.
count

642

In [21]:
# Number of rows whose 'Diabetic' label is not 0.
coun

263