In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Load the data from the CSV file
file_path = 'cleaning_data.csv'
data = pd.read_csv(file_path)

In [3]:
# Define features and target
X = data.drop(columns=['id', 'stroke'])  
y = data['stroke'] 



In [4]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:

# Define preprocessing for numerical and categorical features
numeric_features = ['age', 'avg_glucose_level', 'bmi']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [7]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(random_state=42))])

In [8]:

# Train the model
clf.fit(X_train, y_train)


In [9]:

# Predict on the test set
y_pred = clf.predict(X_test)


In [10]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy*100:.2f}%")

Test set accuracy: 93.93%


In [11]:
if accuracy >= 0.75:
    print("Classification accuracy meets the 75% threshold!")
else:
    print("Classification accuracy does not meet the 75% threshold.")

Classification accuracy meets the 75% threshold!
