In [1]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# getting the dataset
dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# getting the amount of data
dataset.shape

(5110, 12)

In [5]:
# trying to see which columns have null values
# as observed only bmi has null values, and there are only 201. Hence we can just remove these 201 columns.
dataset.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [6]:
# Removing the columns which have null values
dataset = dataset.dropna().reset_index(drop=True)

In [7]:
# We don't need the ids as they are of no use to us
dataset = dataset.drop(columns=['id'])

In [8]:
# Getting all the unique values of the the smoking status
dataset['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [9]:
# removed all the columnes which had smoking status as 'Unknown'. Did not know how to create another model to get those values
dataset = dataset[(dataset.smoking_status != 'Unknown')].reset_index(drop=True)

In [10]:
# removing the columns that have gender as 'Other'
dataset = dataset[(dataset.gender != 'Other')].reset_index(drop=True)

In [11]:
dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [12]:
# Splitting the data 
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=['stroke']), dataset['stroke'], test_size = 0.2)

In [18]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2740, 10)
(685, 10)
(2740,)
(685,)


COLUMN Transformer

In [19]:
from sklearn.compose import ColumnTransformer

In [20]:
# very effective method to perform all the transformating in one go. I have used the 'Column Transformer'.
# In the dataset, smoking_status has been tranformed using the Ordinal Encoder
# gender, ever_marrind, work_type and residence_type has been transformed using the one hot encoder
transformer = ColumnTransformer(transformers=[
    ('tnf1', OrdinalEncoder(categories=[['never smoked', 'formerly smoked', 'smokes']]),['smoking_status']),
    ('tnf2', OneHotEncoder(sparse=False, drop='first'),['gender', 'ever_married', 'work_type', 'Residence_type'])
], remainder='passthrough')

In [21]:
# transforming X_train
X_train = transformer.fit_transform(X_train)



In [22]:
# transforming X_test
X_test = transformer.transform(X_test)

In [23]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2740, 13)
(685, 13)
(2740,)
(685,)


In [24]:
scaler = StandardScaler()

In [25]:
scaler.fit(X_train)

In [26]:
X_train = scaler.transform(X_train)

In [28]:
scaler.fit(X_test)
X_test= scaler.transform(X_test)
print(X_test)

[[-0.84110982  1.30671722  0.5812836  ... -0.24902912 -0.15810742
   1.34562885]
 [ 1.61586343 -0.76527652  0.5812836  ... -0.24902912  0.91701821
  -0.44416204]
 [-0.84110982  1.30671722  0.5812836  ... -0.24902912 -0.20750835
  -0.41724789]
 ...
 [ 0.3873768  -0.76527652  0.5812836  ... -0.24902912  1.38602462
  -0.32304837]
 [-0.84110982 -0.76527652 -1.72033066 ... -0.24902912  2.48675808
  -1.02281623]
 [-0.84110982 -0.76527652 -1.72033066 ... -0.24902912 -0.12443413
   0.69968929]]


In [29]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2740, 13)
(685, 13)
(2740,)
(685,)


In [30]:
from imblearn.over_sampling import SMOTE

# Create a SMOTE object
smote = SMOTE(random_state=42)

# Fit the SMOTE model on the training data
X_train, y_train = smote.fit_resample(X_train, y_train)

In [31]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5182, 13)
(685, 13)
(5182,)
(685,)


In [32]:
# creating the SVC classifier
classifier = svm.SVC(kernel='linear')

In [33]:
# providing the data to the classifier
classifier.fit(X_train, y_train)

In [34]:
# getting the accuracy score
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, y_test)

In [35]:
print(testing_data_accuracy)

0.7065693430656934


In [36]:
dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [37]:
# Define a function to get user input and make predictions
def get_user_input():
    gender = input("Enter your gender: ")
    age = int(input("Enter your age: "))
    hypertension = int(input("Do you have hypertension (0 if NO and 1 if YES): "))
    heart_disease = int(input("Enter your heart disease (0 if NO and 1 if YES): "))
    ever_married = input("Were you married: ")
    work_type = input("Enter your work type: ")
    Residence_type = input("Enter your residence type: ")
    avg_glucose_level = float(input("Enter your average glucose level: "))
    bmi = float(input("Enter your bmi: "))
    smoking_status = input("Enter your smoking status: ")

    # Create a list with user input
    input_data = [gender, age, hypertension, heart_disease, ever_married, work_type, Residence_type, avg_glucose_level, bmi, smoking_status]

    return input_data

# Define a function to preprocess the input and make predictions
def predict(input_data):
    # Assuming you have a transformer and scaler defined
    # transformer = ...
    # scaler = ...

    # Assuming you have a trained classifier
    # classifier = ...

    # Convert the list to a pandas DataFrame with named columns
    columns = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
    input_df = pd.DataFrame([input_data], columns=columns)

    # Preprocess input data
    input_df_transformed = transformer.transform(input_df)
    input_df_scaled = scaler.transform(input_df_transformed)

    # Make prediction
    prediction = classifier.predict(input_df_scaled)

    print(prediction)

# Get user input
user_input = get_user_input()

# Make prediction
predict(user_input)


ValueError: invalid literal for int() with base 10: ''