In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

<ol>
<li>age (Age in years)</li>
<li>sex : (1 = male, 0 = female)</li>
<li>cp (Chest Pain Type): [ 0: asymptomatic, 1: atypical angina, 2:non-anginal pain, 3: typical angina]
<li>trestbps (Resting Blood Pressure in mm Hg)
<li>chol (Serum Cholesterol in mg/dl)
<li>fbs (Fasting Blood Sugar > 120 mg/dl): [0 = no, 1 = yes]
<li>restecg (Resting ECG): [0: showing probable or definite left ventricular hypertrophy by Estes’ criteria, 1: normal, 2: having ST-T wave abnormality]
<li>thalach (maximum heart rate achieved)
<li>exang (Exercise Induced Angina): [1 = yes, 0 = no]
<li>oldpeak (ST depression induced by exercise relative to rest)
<li>slope (the slope of the peak exercise ST segment): [0: downsloping; 1: flat; 2: upsloping]
<li>ca (number of major vessels, 0–3)
<li>thal : [1 = normal, 2 = fixed defect, 3 = reversible defect]
<li>target: [0 = disease, 1 = no disease]
</ol>


In [4]:
# Path to the heart-disease CSV. Set the HEART_CSV environment variable to
# point at another copy (e.g. '/content/drive/MyDrive/heart.csv') instead of
# editing this cell; when it is unset the original local path is used.
import os
import pandas as pd
import numpy as np
df=pd.read_csv(os.environ.get('HEART_CSV', '/home/siddhi/Heart.csv'))

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

# Check data type

## The variable types are
<ul>
<li>Binary: sex, fbs, exang, target
<li>Categorical: cp, restecg, slope, ca, thal
<li>Continuous: age, trestbps, chol, thalach, oldpeak
</ul>

In [None]:
# dtype of each column as read from the CSV (before any re-typing).
df.dtypes

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Re-type the binary/categorical feature columns as object dtype (note: this is
# plain `object`, not pandas' `category` dtype), then show the resulting dtypes.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df = df.astype({column: 'object' for column in categorical_columns})
df.dtypes

# Error Correction

## Check for data entry mistakes
### Feature ‘ca’ should range from 0–3, but the data contains the values 0–4 (df.nunique() reports five distinct values). So let's find the ‘4’s and change them to NaN.

In [None]:
# Distinct values currently present in 'ca'.
df['ca'].unique()

In [None]:
# Number of rows for each 'ca' value, most frequent first.
df.ca.value_counts()

In [None]:
# Rows whose 'ca' value is 4, i.e. outside the documented 0–3 range.
df[df['ca']==4]

In [None]:
# Mark the invalid code 4 in 'ca' as missing. Uses np.nan because the np.NaN
# alias was removed in NumPy 2.0 and raises AttributeError there.
df.loc[df['ca']==4,'ca']=np.nan

In [None]:
# Re-check the distinct 'ca' values; 4 should no longer appear.
df['ca'].unique()

### Feature ‘thal’ should range from 1–3, but the data contains the values 0–3. There are two rows with the value ‘0’, so let's change them to NaN.

In [None]:
# Number of rows for each 'thal' value, most frequent first.
df.thal.value_counts()

In [None]:
# Mark the invalid code 0 in 'thal' as missing. Uses np.nan because the np.NaN
# alias was removed in NumPy 2.0 and raises AttributeError there.
df.loc[df['thal']==0,'thal']=np.nan

In [None]:
# Rows still holding thal == 0; expected to be empty once the zeros are NaN.
df[df['thal']==0]

In [None]:
# Distinct values currently present in 'thal'.
df['thal'].unique()

### Check for missing values and replace them

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Fill every remaining NaN with the median of its own column, then confirm
# that no missing values are left.
column_medians = df.median()
df = df.fillna(column_medians)
df.isnull().sum()

### Check for duplicate rows

In [None]:
# Count the rows that exactly repeat an earlier row and report the result.
duplicated = df.duplicated().sum()
message = "Duplicated rows :{}".format(duplicated) if duplicated else "No duplicates"
print(message)


In [None]:
# keep=False marks every member of a duplicate group, so both the first
# occurrence and its repeats are listed.
is_repeated_row = df.duplicated(keep=False)
duplicates = df.loc[is_repeated_row]
duplicates.head()

### Statistical summary
<ol>
<li>Check the min and max values of the categorical variables (min–max): sex (0–1), cp (0–3), fbs (0–1), restecg (0–2), exang (0–1), slope (0–2), ca (0–3), thal (1–3).
<li>Observe the mean, std, 25% and 75% of the continuous variables.
</ol>

In [None]:
# Summary statistics. NOTE(review): describe() only covers numeric columns by
# default, so the columns cast to object dtype earlier will be missing from
# this table — confirm that is intended for the min/max check.
df.describe()

#### Before we plot the outliers, let's change the labeling for better visualization and interpretation.

In [None]:
# Replace the numeric codes of the categorical columns with readable labels
# for plotting. Columns not listed here (restecg, ca) keep their codes.
# NOTE(review): the data dictionary at the top of this notebook lists the
# opposite coding for target, cp and slope, and a different one for thal —
# confirm which coding is correct for this CSV before interpreting the plots.
value_labels = {
    'target': {1: "Disease", 0: "No_disease"},
    'sex': {1: "Male", 0: "Female"},
    'cp': {
        0: "typical_angina",
        1: "atypical_angina",
        2: "non-anginal pain",
        3: "asymtomatic",
    },
    'exang': {1: "Yes", 0: "No"},
    'fbs': {1: "True", 0: "False"},
    'slope': {0: "upsloping", 1: "flat", 2: "downsloping"},
    'thal': {1: "fixed_defect", 2: "reversable_defect", 3: "normal"},
}
for column, labels in value_labels.items():
    df[column] = df[column].replace(labels)

### Outlier Detection & Handling

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb
# Cholesterol distribution for each target class. x and y are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally
# (the positional form raises TypeError there).
bxplt = sb.boxplot(x=df["target"], y=df["chol"])
plt.show()

In [None]:
# ST depression (oldpeak) distribution for each target class.
sb.boxplot(x='target', y='oldpeak', data=df)

In [None]:
# Continuous (non-categorical) columns that are screened for outliers below.
continous_features = ['age','trestbps','chol','thalach','oldpeak']  
def outliers(df_out, drop = False, source_df = None):
    """Report or remove IQR outliers for every column of ``df_out``.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of its column. Missing values
    are skipped when the quartiles are computed and are never flagged
    themselves, so a single NaN no longer marks the whole column as outliers.

    Parameters
    ----------
    df_out : pandas.DataFrame
        Numeric columns to scan; the quartiles are computed from this frame.
    drop : bool, default False
        When False, print the number of outliers found in each column.
        When True, drop the offending rows (by index label) from
        ``source_df`` in place and print a confirmation per column.
    source_df : pandas.DataFrame, optional
        Frame the rows are dropped from. When omitted, the notebook-level
        ``df`` is used, which keeps existing calls working unchanged.

    Returns
    -------
    None
    """
    if drop and source_df is None:
        # Backward-compatible default: mutate the notebook's global frame.
        source_df = df
    for each_feature in df_out.columns:
        feature_data = df_out[each_feature]
        q1 = feature_data.quantile(0.25)  # 25th percentile, NaN ignored
        q3 = feature_data.quantile(0.75)  # 75th percentile, NaN ignored
        outlier_step = (q3 - q1) * 1.5    # fence distance: 1.5 * interquartile range
        is_outlier = (feature_data < q1 - outlier_step) | (feature_data > q3 + outlier_step)
        outlier_index = feature_data[is_outlier].index.tolist()
        if drop:
            # errors='ignore' because a row may already have been removed for
            # being an outlier in a previously processed column.
            source_df.drop(outlier_index, inplace = True, errors = 'ignore')
            print('Outliers from {} feature removed'.format(each_feature))
        else:
            print('For the feature {}, No of Outliers is {}'.format(each_feature, len(outlier_index)))

# Report-only pass: print the outlier count for each continuous feature.
outliers(df[continous_features])

**Drop Outliers**

In [None]:
# Remove the flagged rows from df in place. Not idempotent: re-running this
# cell recomputes the quartiles on the already-trimmed data and may drop more rows.
outliers(df[continous_features],drop=True)