In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("brain_stroke.csv")

In [3]:
df.shape

(4981, 11)

In [4]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [6]:
df.isnull().sum()

Unnamed: 0,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,0
smoking_status,0


In [7]:
df.duplicated().sum()

0

In [8]:
df['hypertension'].unique()

array([0, 1])

In [9]:
df['heart_disease'].unique()

array([1, 0])

In [10]:
cat_col = df.select_dtypes(object).columns
cat_col

Index(['gender', 'ever_married', 'work_type', 'Residence_type',
       'smoking_status'],
      dtype='object')

In [13]:
for col in cat_col:
    print(col, df[col].unique())

gender ['Male' 'Female']
ever_married ['Yes' 'No']
work_type ['Private' 'Self-employed' 'Govt_job' 'children']
Residence_type ['Urban' 'Rural']
smoking_status ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cat_col:
    df[col] = le.fit_transform(df[col])

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   int64  
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   int64  
 5   work_type          4981 non-null   int64  
 6   Residence_type     4981 non-null   int64  
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   int64  
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 428.2 KB


In [16]:
df['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4733
1,248


In [23]:
from imblearn.over_sampling import SMOTE

In [33]:
smote = SMOTE(random_state=4)

In [39]:
X, y = smote.fit_resample(df.drop('stroke', axis=1), df['stroke'])

In [40]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.000000,0,1,1,1,1,228.690000,36.600000,1
1,1,80.000000,0,1,1,1,0,105.920000,32.500000,2
2,0,49.000000,0,0,1,1,1,171.230000,34.400000,3
3,0,79.000000,1,0,1,2,0,174.120000,24.000000,2
4,1,81.000000,0,0,1,1,1,186.210000,29.000000,1
...,...,...,...,...,...,...,...,...,...,...
9461,1,58.636296,0,0,1,1,0,72.241404,29.427334,0
9462,1,58.982718,0,0,0,1,0,92.276049,33.916299,1
9463,0,75.903171,0,0,1,1,0,199.489015,27.232219,0
9464,1,80.287395,0,0,1,1,0,104.026067,32.844874,2


In [41]:
y.value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
1,4733
0,4733


In [42]:
temp_df = pd.concat([X, y], axis=1)

In [44]:
temp_df = temp_df[temp_df['stroke'] == 1]

In [45]:
temp_df.shape

(4733, 11)

In [50]:
(len(df) * 60) / 100

2988.6

In [52]:
temp_df = temp_df.sample(n=2900)

In [53]:
temp_df['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
1,2900


In [54]:
final_df = pd.concat([df, temp_df], axis=0)

In [55]:
final_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.000000,0,1,1,1,1,228.690000,36.600000,1,1
1,1,80.000000,0,1,1,1,0,105.920000,32.500000,2,1
2,0,49.000000,0,0,1,1,1,171.230000,34.400000,3,1
3,0,79.000000,1,0,1,2,0,174.120000,24.000000,2,1
4,1,81.000000,0,0,1,1,1,186.210000,29.000000,1,1
...,...,...,...,...,...,...,...,...,...,...,...
9135,0,50.728965,0,0,1,0,0,103.085786,28.411242,1,1
6689,0,41.350049,0,0,1,1,0,83.373169,25.594985,0,1
5820,0,79.489115,0,0,1,1,0,79.975777,20.425398,1,1
8915,0,56.235656,0,0,1,1,0,188.023792,39.009631,1,1


In [56]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7881 entries, 0 to 5216
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             7881 non-null   int64  
 1   age                7881 non-null   float64
 2   hypertension       7881 non-null   int64  
 3   heart_disease      7881 non-null   int64  
 4   ever_married       7881 non-null   int64  
 5   work_type          7881 non-null   int64  
 6   Residence_type     7881 non-null   int64  
 7   avg_glucose_level  7881 non-null   float64
 8   bmi                7881 non-null   float64
 9   smoking_status     7881 non-null   int64  
 10  stroke             7881 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 738.8 KB


In [57]:
final_df['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4733
1,3148


In [58]:
X, y = final_df.drop('stroke', axis=1), final_df['stroke']

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [60]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [61]:
from sklearn.ensemble import AdaBoostClassifier

In [62]:
model = AdaBoostClassifier()

In [64]:
model.fit(X_train, y_train)

In [65]:
y_pred = model.predict(X_test)

In [66]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8294229549778059


In [68]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5, 1.0],
    'algorithm': ['SAMME', 'SAMME.R'],
}

In [69]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

In [70]:
grid.fit(X_train, y_train)

In [71]:
grid.best_params_

{'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 400}

In [72]:
grid.best_score_

0.8351824578628703

In [73]:
y_pred_grid = grid.predict(X_test)

In [75]:
print(accuracy_score(y_test, y_pred_grid))

0.8325935320228282
