In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load the data
file_path = 'cleaning_data.csv'
data = pd.read_csv(file_path)

# Split features and target
X = data.drop(columns=['stroke'])
y = data['stroke']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a function to create and evaluate the model
def evaluate_model(preprocessor, model):
    # Create a pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Predict on test set
    y_pred = pipeline.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Initialize an empty DataFrame to log changes and performance
log_data = pd.DataFrame(columns=['Changes Made', 'Accuracy'])

# List of different preprocessing strategies
strategies = ['mean', 'median', 'most_frequent']

for strategy in strategies:
    # Preprocessing for numeric columns
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=strategy)),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical columns
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=strategy)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformation for numeric and categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, X.select_dtypes(include=['float64', 'int64']).columns),
            ('cat', categorical_transformer, X.select_dtypes(include=['object']).columns)
        ])

    # Evaluate the model using this strategy
    accuracy = evaluate_model(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))

    # Log changes and accuracy
    log_data = log_data.append({'Changes Made': f'Strategy: {strategy}', 'Accuracy': accuracy}, ignore_index=True)

# Save log_data to CSV
log_data.to_csv('model_performance.csv', index=False)


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'Male'