In [24]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
dataset = pd.read_csv('dataset.csv')

In [4]:
dataset.head()

Unnamed: 0,General_Health,Exercise,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Heart_Disease
0,Poor,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12,No
1,Very Good,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4,Yes
2,Very Good,Yes,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16,No
3,Poor,Yes,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8,Yes
4,Good,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0,No


# X features, y-dependency

In [5]:
X = dataset.iloc[:, :-1].values 
y = dataset.iloc[:, -1].values

In [8]:
print(X)

[['Poor' 'No' 'No' ... 30 16 12]
 ['Very Good' 'No' 'No' ... 30 0 4]
 ['Very Good' 'Yes' 'No' ... 12 3 16]
 ...
 ['Very Good' 'Yes' 'Yes' ... 40 8 4]
 ['Very Good' 'Yes' 'No' ... 30 12 0]
 ['Excellent' 'Yes' 'No' ... 5 12 1]]


In [9]:
print(y)

['No' 'Yes' 'No' ... 'No' 'No' 'No']


NO Missing values in Dataset

In [13]:
print(dataset.isnull().any())

General_Health                  False
Exercise                        False
Depression                      False
Diabetes                        False
Arthritis                       False
Sex                             False
Age_Category                    False
Height_(cm)                     False
Weight_(kg)                     False
BMI                             False
Smoking_History                 False
Alcohol_Consumption             False
Fruit_Consumption               False
Green_Vegetables_Consumption    False
FriedPotato_Consumption         False
Heart_Disease                   False
dtype: bool


In [14]:
dataset.General_Health.unique()

array(['Poor', 'Very Good', 'Good', 'Fair', 'Excellent'], dtype=object)

In [15]:
dataset.Age_Category.unique()

array(['70-74', '60-64', '75-79', '80+', '65-69', '50-54', '45-49',
       '18-24', '30-34', '55-59', '35-39', '40-44', '25-29'], dtype=object)

Encoding Categorical Data - One Hot Encoding for General Heath Column

It can improve model performance by providing more information to the model about the categorical variable. It can help to avoid the problem of ordinality, which can occur when a categorical variable has a natural ordering

In [27]:
one_hot_encoded_data = pd.get_dummies(dataset, columns = ['General_Health'])
print(one_hot_encoded_data)

       Exercise Depression                                    Diabetes  \
0            No         No                                          No   
1            No         No                                         Yes   
2           Yes         No                                         Yes   
3           Yes         No                                         Yes   
4            No         No                                          No   
...         ...        ...                                         ...   
308849      Yes         No                                          No   
308850      Yes         No                                         Yes   
308851      Yes        Yes  Yes, but female told only during pregnancy   
308852      Yes         No                                          No   
308853      Yes         No                                          No   

       Arthritis     Sex Age_Category  Height_(cm)  Weight_(kg)    BMI  \
0            Yes  Female        70-74

In [28]:
dataset = one_hot_encoded_data

Converting Yes to 1 and No to 0, Male -1, Female -0

In [29]:
dataset['Exercise'] = dataset['Exercise'].map({'Yes':1 ,'No':0})
dataset['Depression'] = dataset['Depression'].map({'Yes':1 ,'No':0})
dataset['Diabetes'] = dataset['Diabetes'].map({'Yes':1 ,'No':0})
dataset['Arthritis'] = dataset['Arthritis'].map({'Yes':1 ,'No':0})
dataset['Smoking_History'] = dataset['Smoking_History'].map({'Yes':1 ,'No':0})
dataset['Heart_Disease'] = dataset['Heart_Disease'].map({'Yes':1 ,'No':0})
dataset['Sex'] = dataset['Sex'].map({'Male':1 ,'Female':0})

In [30]:
print(dataset)

        Exercise  Depression  Diabetes  Arthritis  Sex Age_Category  \
0              0           0       0.0          1    0        70-74   
1              0           0       1.0          0    0        70-74   
2              1           0       1.0          0    0        60-64   
3              1           0       1.0          0    1        75-79   
4              0           0       0.0          0    1          80+   
...          ...         ...       ...        ...  ...          ...   
308849         1           0       0.0          0    1        25-29   
308850         1           0       1.0          0    1        65-69   
308851         1           1       NaN          0    0        30-34   
308852         1           0       0.0          0    1        65-69   
308853         1           0       0.0          0    0        45-49   

        Height_(cm)  Weight_(kg)    BMI  Smoking_History  Alcohol_Consumption  \
0               150        32.66  14.54                1          

In [31]:
dataset.Age_Category.unique()

array(['70-74', '60-64', '75-79', '80+', '65-69', '50-54', '45-49',
       '18-24', '30-34', '55-59', '35-39', '40-44', '25-29'], dtype=object)

In [32]:
dataset['Age_Category'] = dataset['Age_Category'].map({'18-24':21 ,'25-29':27,'30-34':32 ,'35-39':37,'40-44':42 ,'45-49':47,'50-54':52 ,'55-59':57,'60-64':62 ,'65-69':67,'70-74':72 ,'75-79':77,'80+':80})

In [33]:
print(dataset)

        Exercise  Depression  Diabetes  Arthritis  Sex  Age_Category  \
0              0           0       0.0          1    0            72   
1              0           0       1.0          0    0            72   
2              1           0       1.0          0    0            62   
3              1           0       1.0          0    1            77   
4              0           0       0.0          0    1            80   
...          ...         ...       ...        ...  ...           ...   
308849         1           0       0.0          0    1            27   
308850         1           0       1.0          0    1            67   
308851         1           1       NaN          0    0            32   
308852         1           0       0.0          0    1            67   
308853         1           0       0.0          0    0            47   

        Height_(cm)  Weight_(kg)    BMI  Smoking_History  Alcohol_Consumption  \
0               150        32.66  14.54               

In [35]:
dataset.to_csv('processedDatSet.csv')