In [1]:
import random
random.seed(2024)

import missingno as msno
import numpy as np
from scipy.stats import shapiro
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from fasteda import fast_eda

## Loading the Diabetes Data set
#### Load the dataset from scikit-learn and show the description of the data

In [2]:
# Fetch features and target as pandas objects; scaled=False is passed so the
# raw (unscaled) feature values are returned
diabetes_X, diabetes_y = load_diabetes(return_X_y=True, as_frame=True, scaled=False)

# Attach the response to the feature frame as a trailing 'target' column
diabetes = diabetes_X.assign(target=diabetes_y)

# Load the dataset a second time with default arguments; this object is only
# used here for the dataset description it carries
diabetes_default = load_diabetes()

# Display the dataset description
print(diabetes_default.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

#### Looking at 5 randomly selected rows in the data set

In [4]:
# Peek at five rows drawn at random (no random_state is passed, so the rows
# shown change from run to run)
diabetes.sample(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
306,51.0,2.0,26.2,101.0,161.0,99.6,48.0,3.0,4.2047,88.0,44.0
429,37.0,1.0,23.3,88.0,223.0,142.0,65.0,3.4,4.3567,82.0,94.0
81,52.0,2.0,24.3,86.0,197.0,133.6,44.0,5.0,4.5747,91.0,51.0
246,60.0,1.0,23.4,76.67,247.0,148.0,65.0,3.8,5.1358,77.0,78.0
210,51.0,1.0,27.7,99.0,229.0,145.6,69.0,3.0,4.2767,77.0,191.0


#### The diabetes dataset has 10 features and a *target* column
#### Our task is to predict the measure of disease progression on the basis of the 10 observed features
#### The dataset features are mostly numeric. Only the *sex* feature is categorical (it is stored as the numeric codes 1 and 2).

## Check whether there are any missing values

In [5]:
# True if at least one cell anywhere in the frame is missing:
# first reduce each column to "has any NaN", then reduce across columns
diabetes.isna().any(axis=0).any()

False

#### The original dataset does not contain any missing values. To make this project more challenging, let's introduce some missing values.

In [8]:
# Blank out 3 randomly chosen feature columns in 10% of randomly chosen rows

# Re-seed the stdlib generator so the same cells are blanked on every run
random.seed(2024)

# Pick 3 column positions at random; the range stops one short of the column
# count, so the last column ('target') can never be selected
# NOTE(review): 'sex' is among the candidates; with a different seed it could
# be blanked before the one-hot encoding step - confirm that is acceptable
missing_cols = random.sample(range(len(diabetes.columns) - 1), 3)

# Pick 10% of the rows at random, *by position*: .iloc below is a positional
# indexer, so sampling index labels would only be correct while the index
# happens to be 0..n-1
missing_rows = random.sample(range(len(diabetes)), int(np.round(len(diabetes) / 10)))

# Overwrite the selected (row, column) cells with NaN
diabetes.iloc[missing_rows, missing_cols] = np.nan

#### Which columns were randomly selected?

In [9]:
# Translate the sampled column positions back to names and list them alphabetically
print(sorted(diabetes.columns[pos] for pos in missing_cols))

['bmi', 's1', 's4']


## Initial Data Preprocessing
### Using one-hot encoding for the *sex* column

In [10]:
# Initialise the encoder; drop=None keeps one indicator column per category
enc1 = OneHotEncoder(handle_unknown='ignore', drop=None)

# One-hot encode 'sex'; .toarray() turns the encoder output into a dense numpy array
encoded_sex = enc1.fit_transform(diabetes[['sex']]).toarray()

# Wrap the array in a DataFrame whose column names come from the category
# labels ('sex' + integer label). The frame reuses diabetes' own index so that
# the index-aligned concat below pairs every encoded row with its source row,
# whatever index the frame carries.
encoded_sex = pd.DataFrame(
    encoded_sex,
    columns=['sex' + str(int(x)) for x in enc1.categories_[0]],
    index=diabetes.index,
)

# Horizontally concatenate the original 'diabetes' frame with the one-hot columns
diabetes = pd.concat([diabetes, encoded_sex], axis=1)

# Show 10 random rows, restricted to 'sex', 'sex1' and 'sex2' for readability
diabetes[['sex', 'sex1', 'sex2']].sample(10)

Unnamed: 0,sex,sex1,sex2
426,2.0,0.0,1.0
181,1.0,1.0,0.0
243,2.0,0.0,1.0
362,2.0,0.0,1.0
10,1.0,1.0,0.0
89,1.0,1.0,0.0
147,1.0,1.0,0.0
259,1.0,1.0,0.0
227,2.0,0.0,1.0
400,1.0,1.0,0.0


#### We only need to keep one column, *sex1*. The other two columns, *sex* and *sex2*, will be dropped.

In [12]:
# Keep 'sex1' as the single indicator; remove the original 'sex' and the other indicator 'sex2'
diabetes = diabetes.drop(columns=['sex', 'sex2'])
# Re-select the columns so 'sex1' sits where 'sex' used to be and 'target' stays last
diabetes = diabetes[['age', 'sex1', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'target']]
# Show a random sample of 5 rows
diabetes.sample(n=5)

Unnamed: 0,age,sex1,bmi,bp,s1,s2,s3,s4,s5,s6,target
111,54.0,1.0,,87.0,,122.0,68.0,,4.382,80.0,59.0
41,21.0,1.0,20.1,63.0,135.0,69.0,54.0,3.0,4.0943,89.0,55.0
330,51.0,0.0,29.2,107.0,187.0,139.0,32.0,6.0,4.382,95.0,244.0
365,58.0,1.0,22.8,91.0,196.0,118.8,48.0,4.0,4.9836,115.0,206.0
139,55.0,1.0,32.1,110.0,164.0,84.2,42.0,4.0,5.2417,90.0,281.0


#### Before performing EDA on the diabetes dataset, we first need to split the data
## Train-test split

In [13]:
# Split into train and test sets: every column except 'target' is a feature,
# and the target is kept as a one-column DataFrame; 33% of rows go to the test set
# NOTE(review): random_state is 2204 while the seed used elsewhere in the
# notebook is 2024 - confirm this is intended and not a transposition
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.drop(columns='target'),
    diabetes[['target']],
    test_size=0.33,
    random_state=2204,
)

## Perform EDA

In [14]:
# First five rows of the full frame (not just the training split)
diabetes.head(n=5)

Unnamed: 0,age,sex1,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,59.0,0.0,32.1,101.0,157.0,93.2,38.0,4.0,4.8598,87.0,151.0
1,48.0,1.0,21.6,87.0,183.0,103.2,70.0,3.0,3.8918,69.0,75.0
2,72.0,0.0,30.5,93.0,156.0,93.6,41.0,4.0,4.6728,85.0,141.0
3,24.0,1.0,25.3,84.0,198.0,131.4,40.0,5.0,4.8903,89.0,206.0
4,50.0,1.0,23.0,101.0,192.0,125.4,52.0,4.0,4.2905,80.0,135.0


In [15]:
# Last five rows of the full frame (not just the training split)
diabetes.tail(n=5)

Unnamed: 0,age,sex1,bmi,bp,s1,s2,s3,s4,s5,s6,target
437,60.0,0.0,28.2,112.0,185.0,113.8,42.0,4.0,4.9836,93.0,178.0
438,47.0,0.0,24.9,75.0,225.0,166.0,42.0,5.0,4.4427,102.0,104.0
439,60.0,0.0,24.9,99.67,162.0,106.6,43.0,3.77,4.1271,95.0,132.0
440,36.0,1.0,30.0,95.0,201.0,125.2,42.0,4.79,5.1299,85.0,220.0
441,36.0,1.0,19.6,71.0,250.0,133.2,97.0,3.0,4.5951,92.0,57.0


In [16]:
# Summary statistics for every column of the full frame; the 'count' row gives
# the number of non-missing values per column
diabetes.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,sex1,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,398.0,442.0,398.0,442.0,442.0,398.0,442.0,442.0,442.0
mean,48.5181,0.531674,26.425879,94.647014,189.266332,115.43914,49.788462,4.075075,4.641411,91.260181,152.133484
std,13.109028,0.499561,4.445336,13.831283,34.391866,30.413081,12.934202,1.282526,0.522391,11.496335,77.093005
min,19.0,0.0,18.0,62.0,97.0,41.6,22.0,2.0,3.2581,58.0,25.0
25%,38.25,0.0,23.125,84.0,165.0,96.05,40.25,3.0,4.2767,83.25,87.0
50%,50.0,1.0,25.8,93.0,186.0,113.0,48.0,4.0,4.62005,91.0,140.5
75%,59.0,1.0,29.5,105.0,209.0,134.5,57.75,5.0,4.9972,98.0,211.5
max,79.0,1.0,42.2,133.0,301.0,242.4,99.0,9.09,6.107,124.0,346.0
