In [67]:
# Imports 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


# Apply seaborn's default plot theme to figures drawn later in the notebook.
sns.set()
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [68]:
# Read the raw stroke-prediction CSV into a DataFrame, then print a summary
# of its columns (names, non-null counts, dtypes).
filepath = 'stroke_prediction_dataset.csv'
strokeData = pd.read_csv(filepath_or_buffer=filepath)
strokeData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Patient ID                15000 non-null  int64  
 1   Patient Name              15000 non-null  object 
 2   Age                       15000 non-null  int64  
 3   Gender                    15000 non-null  object 
 4   Hypertension              15000 non-null  int64  
 5   Heart Disease             15000 non-null  int64  
 6   Marital Status            15000 non-null  object 
 7   Work Type                 15000 non-null  object 
 8   Residence Type            15000 non-null  object 
 9   Average Glucose Level     15000 non-null  float64
 10  Body Mass Index (BMI)     15000 non-null  float64
 11  Smoking Status            15000 non-null  object 
 12  Alcohol Intake            15000 non-null  object 
 13  Physical Activity         15000 non-null  object 
 14  Stroke

In [69]:
# Print the column summary (names, non-null counts, dtypes) of the loaded frame.
strokeData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Patient ID                15000 non-null  int64  
 1   Patient Name              15000 non-null  object 
 2   Age                       15000 non-null  int64  
 3   Gender                    15000 non-null  object 
 4   Hypertension              15000 non-null  int64  
 5   Heart Disease             15000 non-null  int64  
 6   Marital Status            15000 non-null  object 
 7   Work Type                 15000 non-null  object 
 8   Residence Type            15000 non-null  object 
 9   Average Glucose Level     15000 non-null  float64
 10  Body Mass Index (BMI)     15000 non-null  float64
 11  Smoking Status            15000 non-null  object 
 12  Alcohol Intake            15000 non-null  object 
 13  Physical Activity         15000 non-null  object 
 14  Stroke

In [70]:
# Discard every row that contains at least one missing value
# (only rows are dropped; the columns are left as they are).
# On this dataset the row count goes from 15,000 to 12,500.
strokeData = strokeData.dropna(axis=0, how='any')
strokeData.shape[0]

12500

In [71]:
# Print the column summary again to confirm the non-null counts after dropping rows.
strokeData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12500 entries, 0 to 14999
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Patient ID                12500 non-null  int64  
 1   Patient Name              12500 non-null  object 
 2   Age                       12500 non-null  int64  
 3   Gender                    12500 non-null  object 
 4   Hypertension              12500 non-null  int64  
 5   Heart Disease             12500 non-null  int64  
 6   Marital Status            12500 non-null  object 
 7   Work Type                 12500 non-null  object 
 8   Residence Type            12500 non-null  object 
 9   Average Glucose Level     12500 non-null  float64
 10  Body Mass Index (BMI)     12500 non-null  float64
 11  Smoking Status            12500 non-null  object 
 12  Alcohol Intake            12500 non-null  object 
 13  Physical Activity         12500 non-null  object 
 14  Stroke Hist

In [72]:
# Split 'Blood Pressure Levels' (text of the form "systolic/diastolic") into two
# separate columns, then remove the original column.
# str.split produces strings, so the parts are converted with pd.to_numeric;
# otherwise the new columns would stay as object-dtype text and could not be
# used in numeric operations. pd.to_numeric raises on any part that is not a
# valid number, which surfaces malformed readings instead of hiding them.
bp_parts = strokeData['Blood Pressure Levels'].str.split('/', expand=True)
strokeData[['Systolic_BP', 'Diastolic_BP']] = bp_parts.apply(pd.to_numeric)
strokeData = strokeData.drop(columns=['Blood Pressure Levels'])
strokeData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12500 entries, 0 to 14999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Patient ID                12500 non-null  int64  
 1   Patient Name              12500 non-null  object 
 2   Age                       12500 non-null  int64  
 3   Gender                    12500 non-null  object 
 4   Hypertension              12500 non-null  int64  
 5   Heart Disease             12500 non-null  int64  
 6   Marital Status            12500 non-null  object 
 7   Work Type                 12500 non-null  object 
 8   Residence Type            12500 non-null  object 
 9   Average Glucose Level     12500 non-null  float64
 10  Body Mass Index (BMI)     12500 non-null  float64
 11  Smoking Status            12500 non-null  object 
 12  Alcohol Intake            12500 non-null  object 
 13  Physical Activity         12500 non-null  object 
 14  Stroke Hist

In [73]:
# Encode the 'Diagnosis' labels as integers: 'Stroke' -> 1, 'No Stroke' -> 0.
# Series.map turns any value that is not a key of the mapping into NaN, so
# running this cell a second time (on already-encoded data) would blank the column.
diagnosis_codes = {'Stroke': 1, 'No Stroke': 0}
encoded_diagnosis = strokeData['Diagnosis'].map(diagnosis_codes)
strokeData['Diagnosis'] = encoded_diagnosis
strokeData.head()

Unnamed: 0,Patient ID,Patient Name,Age,Gender,Hypertension,Heart Disease,Marital Status,Work Type,Residence Type,Average Glucose Level,...,Physical Activity,Stroke History,Family History of Stroke,Dietary Habits,Stress Levels,Cholesterol Levels,Symptoms,Diagnosis,Systolic_BP,Diastolic_BP
0,18153,Mamooty Khurana,56,Male,0,1,Married,Self-employed,Rural,130.91,...,Moderate,0,Yes,Vegan,3.48,"HDL: 68, LDL: 133","Difficulty Speaking, Headache",1,140,108
1,62749,Kaira Subramaniam,80,Male,0,0,Single,Self-employed,Urban,183.73,...,Low,0,No,Paleo,1.73,"HDL: 63, LDL: 70","Loss of Balance, Headache, Dizziness, Confusion",1,146,91
2,32145,Dhanush Balan,26,Male,1,1,Married,Never Worked,Rural,189.0,...,High,0,Yes,Paleo,7.31,"HDL: 59, LDL: 95","Seizures, Dizziness",1,154,97
3,6154,Ivana Baral,73,Male,0,0,Married,Never Worked,Urban,185.29,...,Moderate,0,No,Paleo,5.35,"HDL: 70, LDL: 137","Seizures, Blurred Vision, Severe Fatigue, Head...",0,174,81
4,48973,Darshit Jayaraman,51,Male,1,1,Divorced,Self-employed,Urban,177.34,...,Low,0,Yes,Pescatarian,6.84,"HDL: 65, LDL: 68",Difficulty Speaking,1,121,95


In [74]:
# Split 'Cholesterol Levels' (text of the form "HDL: <n>, LDL: <n>") into two
# columns, then remove the original column.
# str.extract returns the captured digits as strings, so they are converted with
# pd.to_numeric; otherwise the new columns would stay as object-dtype text.
# Rows that do not match the pattern yield NaN in both columns.
cholesterol_parts = strokeData['Cholesterol Levels'].str.extract(r'HDL: (\d+), LDL: (\d+)')
strokeData[['HDL_Cholesterol', 'LDL_Cholesterol']] = cholesterol_parts.apply(pd.to_numeric)
strokeData = strokeData.drop(columns=['Cholesterol Levels'])
strokeData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12500 entries, 0 to 14999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Patient ID                12500 non-null  int64  
 1   Patient Name              12500 non-null  object 
 2   Age                       12500 non-null  int64  
 3   Gender                    12500 non-null  object 
 4   Hypertension              12500 non-null  int64  
 5   Heart Disease             12500 non-null  int64  
 6   Marital Status            12500 non-null  object 
 7   Work Type                 12500 non-null  object 
 8   Residence Type            12500 non-null  object 
 9   Average Glucose Level     12500 non-null  float64
 10  Body Mass Index (BMI)     12500 non-null  float64
 11  Smoking Status            12500 non-null  object 
 12  Alcohol Intake            12500 non-null  object 
 13  Physical Activity         12500 non-null  object 
 14  Stroke Hist

In [75]:
# Handling outliers
# Average Glucose Level: rows strictly above the 99th percentile of the column
# are treated as outliers and removed from the frame.
glucose_cutoff = strokeData['Average Glucose Level'].quantile(0.99)
glucose_outliers = strokeData[strokeData['Average Glucose Level'] > glucose_cutoff]
strokeData = strokeData.drop(index=glucose_outliers.index)

strokeData.info()


<class 'pandas.core.frame.DataFrame'>
Index: 12375 entries, 0 to 14999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Patient ID                12375 non-null  int64  
 1   Patient Name              12375 non-null  object 
 2   Age                       12375 non-null  int64  
 3   Gender                    12375 non-null  object 
 4   Hypertension              12375 non-null  int64  
 5   Heart Disease             12375 non-null  int64  
 6   Marital Status            12375 non-null  object 
 7   Work Type                 12375 non-null  object 
 8   Residence Type            12375 non-null  object 
 9   Average Glucose Level     12375 non-null  float64
 10  Body Mass Index (BMI)     12375 non-null  float64
 11  Smoking Status            12375 non-null  object 
 12  Alcohol Intake            12375 non-null  object 
 13  Physical Activity         12375 non-null  object 
 14  Stroke Hist

In [76]:
# Standardization/Normalization
# Age: fit a StandardScaler on the column and overwrite it with the scaled values.
scaler = StandardScaler()
age_features = strokeData[['Age']]  # 2-D selection, as the scaler expects a table
strokeData['Age'] = scaler.fit_transform(age_features)
strokeData.head()
                                                    

Unnamed: 0,Patient ID,Patient Name,Age,Gender,Hypertension,Heart Disease,Marital Status,Work Type,Residence Type,Average Glucose Level,...,Stroke History,Family History of Stroke,Dietary Habits,Stress Levels,Symptoms,Diagnosis,Systolic_BP,Diastolic_BP,HDL_Cholesterol,LDL_Cholesterol
0,18153,Mamooty Khurana,0.096744,Male,0,1,Married,Self-employed,Rural,130.91,...,0,Yes,Vegan,3.48,"Difficulty Speaking, Headache",1,140,108,68,133
1,62749,Kaira Subramaniam,1.238886,Male,0,0,Single,Self-employed,Urban,183.73,...,0,No,Paleo,1.73,"Loss of Balance, Headache, Dizziness, Confusion",1,146,91,63,70
2,32145,Dhanush Balan,-1.330935,Male,1,1,Married,Never Worked,Rural,189.0,...,0,Yes,Paleo,7.31,"Seizures, Dizziness",1,154,97,59,95
3,6154,Ivana Baral,0.905761,Male,0,0,Married,Never Worked,Urban,185.29,...,0,No,Paleo,5.35,"Seizures, Blurred Vision, Severe Fatigue, Head...",0,174,81,70,137
4,48973,Darshit Jayaraman,-0.141203,Male,1,1,Divorced,Self-employed,Urban,177.34,...,0,Yes,Pescatarian,6.84,Difficulty Speaking,1,121,95,65,68


In [77]:
# One hot encoding for marital status
# NOTE(review): no encoding is performed in this cell — it only previews the
# first rows of the frame; 'Marital Status' is left unchanged. TODO: implement
# the encoding or remove this heading.
strokeData.head()

Unnamed: 0,Patient ID,Patient Name,Age,Gender,Hypertension,Heart Disease,Marital Status,Work Type,Residence Type,Average Glucose Level,...,Stroke History,Family History of Stroke,Dietary Habits,Stress Levels,Symptoms,Diagnosis,Systolic_BP,Diastolic_BP,HDL_Cholesterol,LDL_Cholesterol
0,18153,Mamooty Khurana,0.096744,Male,0,1,Married,Self-employed,Rural,130.91,...,0,Yes,Vegan,3.48,"Difficulty Speaking, Headache",1,140,108,68,133
1,62749,Kaira Subramaniam,1.238886,Male,0,0,Single,Self-employed,Urban,183.73,...,0,No,Paleo,1.73,"Loss of Balance, Headache, Dizziness, Confusion",1,146,91,63,70
2,32145,Dhanush Balan,-1.330935,Male,1,1,Married,Never Worked,Rural,189.0,...,0,Yes,Paleo,7.31,"Seizures, Dizziness",1,154,97,59,95
3,6154,Ivana Baral,0.905761,Male,0,0,Married,Never Worked,Urban,185.29,...,0,No,Paleo,5.35,"Seizures, Blurred Vision, Severe Fatigue, Head...",0,174,81,70,137
4,48973,Darshit Jayaraman,-0.141203,Male,1,1,Divorced,Self-employed,Urban,177.34,...,0,Yes,Pescatarian,6.84,Difficulty Speaking,1,121,95,65,68


In [78]:
# Exporting Dataset
# Write the cleaned frame to CSV without the index column.
# A PermissionError on this write typically means the target file is locked by
# another program or the folder is not writable, so the same exception type is
# re-raised with a message that says what to check.
output_path = 'cleaned_stroke_data.csv'
try:
    strokeData.to_csv(output_path, index=False)
except PermissionError as exc:
    raise PermissionError(
        exc.errno,
        f"{exc.strerror} - cannot write the cleaned dataset. Close the file if it is "
        "open in another program (e.g. a spreadsheet) and check that the folder is writable",
        output_path,
    ) from exc


PermissionError: [Errno 13] Permission denied: 'cleaned_stroke_data.csv'