In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.api import OLS
from statsmodels.formula.api import ols
from statsmodels.api import add_constant
raw_file = Path.cwd() / 'datasets/raw_data.csv'

In [2]:
df = pd.read_csv(raw_file)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5109 non-null   int64  
 1   gender             5109 non-null   object 
 2   age                5109 non-null   float64
 3   hypertension       5109 non-null   int64  
 4   heart_disease      5109 non-null   int64  
 5   ever_married       5109 non-null   object 
 6   work_type          5109 non-null   object 
 7   Residence_type     5109 non-null   object 
 8   avg_glucose_level  5109 non-null   float64
 9   bmi                5109 non-null   float64
 10  smoking_status     5109 non-null   object 
 11  stroke             5109 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 518.9+ KB


## Cleaning up the data

In [4]:
#we have gender unknown to clean up (the number is only one), and BMI nan is replaced by the average of BMI

In [7]:
df.bmi.fillna(28.1, inplace=True)

In [None]:
df.drop(df.loc[df['gender']=='Other'].index, inplace=True)

## Assumpsions

In [6]:
sns.set()

In [16]:
df['age_bin']=pd.cut(df.age,[0,30,40,50,70,100]).cat.codes
#or we can do df['age_bin']=pd.cut(df.age,5).cat.codes as well

### ols

In [18]:
model=ols('avg_glucose_level ~ C(age_bin)', data=df)
model_fit=model.fit()
model_fit.summary()

0,1,2,3
Dep. Variable:,avg_glucose_level,R-squared:,0.063
Model:,OLS,Adj. R-squared:,0.062
Method:,Least Squares,F-statistic:,85.15
Date:,"Fri, 12 Mar 2021",Prob (F-statistic):,4.0400000000000004e-70
Time:,10:20:58,Log-Likelihood:,-26564.0
No. Observations:,5109,AIC:,53140.0
Df Residuals:,5104,BIC:,53170.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,94.2275,1.107,85.093,0.000,92.057,96.398
C(age_bin)[T.1],4.7107,2.020,2.332,0.020,0.750,8.671
C(age_bin)[T.2],6.7837,1.957,3.466,0.001,2.947,10.620
C(age_bin)[T.3],22.2468,1.607,13.840,0.000,19.095,25.398
C(age_bin)[T.4],29.7905,1.984,15.016,0.000,25.901,33.680

0,1,2,3
Omnibus:,1008.788,Durbin-Watson:,1.968
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1730.445
Skew:,1.304,Prob(JB):,0.0
Kurtosis:,4.153,Cond. No.,5.0


In [19]:
#with log BMI

In [20]:
model=ols('avg_glucose_level ~ np.log(bmi) + hypertension + heart_disease + stroke +age ', data=df)
model_fit=model.fit()
model_fit.summary()

0,1,2,3
Dep. Variable:,avg_glucose_level,R-squared:,0.091
Model:,OLS,Adj. R-squared:,0.09
Method:,Least Squares,F-statistic:,101.9
Date:,"Fri, 12 Mar 2021",Prob (F-statistic):,8.810000000000001e-103
Time:,10:22:27,Log-Likelihood:,-26486.0
No. Observations:,5109,AIC:,52980.0
Df Residuals:,5103,BIC:,53020.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.0002,8.067,4.463,0.000,20.186,51.814
np.log(bmi),16.7311,2.542,6.582,0.000,11.748,21.714
hypertension,15.8457,2.130,7.438,0.000,11.669,20.022
heart_disease,20.4752,2.785,7.352,0.000,15.015,25.935
stroke,14.4386,2.913,4.956,0.000,8.728,20.150
age,0.2566,0.032,8.141,0.000,0.195,0.318

0,1,2,3
Omnibus:,958.156,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1616.096
Skew:,1.245,Prob(JB):,0.0
Kurtosis:,4.179,Cond. No.,684.0


In [23]:
#for categorical datas we use dummies
df=pd.get_dummies(data=df, columns=['work_type','Residence_type','ever_married','smoking_status','gender'], drop_first=True)

In [27]:
df

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,age_bin,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,ever_married_Yes,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,gender_Male
0,9046,67.0,0,1,228.69,36.6,1,3,0,1,0,0,1,1,1,0,0,1
1,51676,61.0,0,0,202.21,28.1,1,3,0,0,1,0,0,1,0,1,0,0
2,31112,80.0,0,1,105.92,32.5,1,4,0,1,0,0,0,1,0,1,0,1
3,60182,49.0,0,0,171.23,34.4,1,2,0,1,0,0,1,1,0,0,1,0
4,1665,79.0,1,0,174.12,24.0,1,4,0,0,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,83.75,28.1,0,4,0,1,0,0,1,1,0,1,0,0
5106,44873,81.0,0,0,125.20,40.0,0,4,0,0,1,0,1,1,0,1,0,0
5107,19723,35.0,0,0,82.99,30.6,0,1,0,0,1,0,0,1,0,1,0,0
5108,37544,51.0,0,0,166.29,25.6,0,3,0,1,0,0,0,1,1,0,0,1


### OLS 

In [30]:
OLS(df.avg_glucose_level, df._get_numeric_data().drop(['avg_glucose_level','id','age_bin'], axis=1)).fit().summary()

0,1,2,3
Dep. Variable:,avg_glucose_level,R-squared (uncentered):,0.855
Model:,OLS,Adj. R-squared (uncentered):,0.855
Method:,Least Squares,F-statistic:,2010.0
Date:,"Fri, 12 Mar 2021",Prob (F-statistic):,0.0
Time:,11:53:34,Log-Likelihood:,-26567.0
No. Observations:,5109,AIC:,53160.0
Df Residuals:,5094,BIC:,53260.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,0.6681,0.043,15.561,0.000,0.584,0.752
hypertension,9.7818,2.164,4.520,0.000,5.539,14.025
heart_disease,15.2717,2.854,5.351,0.000,9.676,20.867
bmi,1.6116,0.069,23.501,0.000,1.477,1.746
stroke,10.5350,2.972,3.545,0.000,4.708,16.362
work_type_Never_worked,33.5423,9.518,3.524,0.000,14.884,52.201
work_type_Private,14.3803,1.700,8.457,0.000,11.047,17.714
work_type_Self-employed,8.6595,2.241,3.863,0.000,4.265,13.054
work_type_children,51.9296,2.176,23.866,0.000,47.664,56.195

0,1,2,3
Omnibus:,623.541,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,882.238
Skew:,0.943,Prob(JB):,2.66e-192
Kurtosis:,3.765,Cond. No.,868.0


In [32]:
df2 = pd.read_csv('/Users/teahupoo20/Documents/GitHub/STROKE_PROJECT/datasets/raw_data.csv')
df2.drop(['id','smoking_status'], axis=1, inplace=True)
df2.bmi.fillna(28.1, inplace=True)
df2.drop(df2.loc[df2['gender']=='Other'].index, inplace=True)
df2=pd.get_dummies(data=df2, columns=['Residence_type','gender','ever_married','work_type'], drop_first=True)

In [35]:
X, y = add_constant(df2).drop('avg_glucose_level', axis=1), df2.avg_glucose_level # features, target
model_fit=OLS(y,X).fit()
model_fit.summary()

0,1,2,3
Dep. Variable:,avg_glucose_level,R-squared:,0.101
Model:,OLS,Adj. R-squared:,0.098
Method:,Least Squares,F-statistic:,47.48
Date:,"Fri, 12 Mar 2021",Prob (F-statistic):,4.9e-108
Time:,11:55:07,Log-Likelihood:,-26459.0
No. Observations:,5109,AIC:,52940.0
Df Residuals:,5096,BIC:,53030.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,59.2239,3.688,16.058,0.000,51.993,66.454
age,0.4061,0.045,8.979,0.000,0.317,0.495
hypertension,14.4107,2.133,6.757,0.000,10.230,18.592
heart_disease,18.4361,2.799,6.587,0.000,12.950,23.923
bmi,0.7543,0.088,8.535,0.000,0.581,0.928
stroke,13.1029,2.914,4.497,0.000,7.390,18.815
Residence_type_Urban,-0.6784,1.204,-0.563,0.573,-3.039,1.682
gender_Male,3.9853,1.233,3.233,0.001,1.568,6.402
ever_married_Yes,0.5705,1.786,0.319,0.749,-2.931,4.072

0,1,2,3
Omnibus:,919.757,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1524.009
Skew:,1.208,Prob(JB):,0.0
Kurtosis:,4.148,Cond. No.,886.0


In [37]:
#From the first impression, we will clean up the variables which have p>0.05
columns_to_drop=['work_type_Never_worked','work_type_Private','work_type_Self-employed','work_type_children','Residence_type_Urban']
model_fit=OLS(y, X.drop(columns_to_drop, axis=1)).fit()
model_fit.summary()

0,1,2,3
Dep. Variable:,avg_glucose_level,R-squared:,0.095
Model:,OLS,Adj. R-squared:,0.093
Method:,Least Squares,F-statistic:,76.24
Date:,"Fri, 12 Mar 2021",Prob (F-statistic):,1.68e-105
Time:,11:56:16,Log-Likelihood:,-26475.0
No. Observations:,5109,AIC:,52970.0
Df Residuals:,5101,BIC:,53020.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.6831,2.487,28.819,0.000,66.807,76.559
age,0.2860,0.040,7.224,0.000,0.208,0.364
hypertension,15.2686,2.131,7.165,0.000,11.091,19.446
heart_disease,19.4796,2.800,6.957,0.000,13.990,24.969
bmi,0.6083,0.084,7.208,0.000,0.443,0.774
stroke,14.2838,2.914,4.903,0.000,8.572,19.996
gender_Male,4.6018,1.232,3.737,0.000,2.187,7.016
ever_married_Yes,-0.9235,1.766,-0.523,0.601,-4.386,2.539

0,1,2,3
Omnibus:,940.518,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1573.456
Skew:,1.228,Prob(JB):,0.0
Kurtosis:,4.165,Cond. No.,275.0
