In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier 


# Load the data
file_path = 'cleaning_data.csv'
data = pd.read_csv(file_path)




       



In [2]:
# Split features and target
X = data.drop(columns=['stroke'])
y = data['stroke']





In [3]:
# List of numeric and categorical columns
numeric_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
categorical_columns = ['gender']  

# Initialize an empty DataFrame to log changes and performance
log_data = pd.DataFrame(columns=['Changes Made', 'Accuracy'])

# Loop through different strategies for handling missing values
strategies = ['most_frequent']

for strategy in strategies:
    # Preprocessing for numeric columns
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical columns
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=strategy)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformation for numeric and categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_columns),
            ('cat', categorical_transformer, categorical_columns)
        ])

    # Pipeline with preprocessing and model
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

  

In [4]:
  # Train the model
model.fit(X, y)

   

In [5]:
 # Predict on the same data for simplicity (you might use cross-validation for evaluation)
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)

    

In [6]:
# Log changes and accuracy
log_data = log_data.append({'Changes Made': f'Strategy: {strategy}', 'Accuracy': accuracy}, ignore_index=True)



  log_data = log_data.append({'Changes Made': f'Strategy: {strategy}', 'Accuracy': accuracy}, ignore_index=True)


In [7]:
# Save log_data to CSV
log_data.to_csv('model_evaluations.csv', index=False)


In [8]:
# Create a new model
model_2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=126)) 
])


In [9]:
# Train the new model
model_2.fit(X, y)

In [10]:
# Predict using the new model
y_pred_new = model_2.predict(X)
accuracy_new = accuracy_score(y, y_pred_new)

In [11]:
# Log changes and accuracy for the new model
log_data = log_data.append({'Changes Made': 'model_2', 'Accuracy': accuracy_new}, ignore_index=True)

  log_data = log_data.append({'Changes Made': 'model_2', 'Accuracy': accuracy_new}, ignore_index=True)


In [12]:
# Save log_data to CSV
log_data.to_csv('model_2_evaluations.csv', index=False)

In [13]:
#  log_data contains the results for the first model
log_data = pd.DataFrame(columns=['Changes Made', 'Accuracy'])
log_data = log_data.append({'Changes Made': 'Strategy: most_frequent', 'Accuracy': 1.0}, ignore_index=True)

# add the second model results
log_data = log_data.append({'Changes Made': 'model_2', 'Accuracy': 0.9569471624266145}, ignore_index=True)

# Save log_data to CSV with both models' performance
log_data.to_csv('compare_model_evaluations.csv', index=False)


  log_data = log_data.append({'Changes Made': 'Strategy: most_frequent', 'Accuracy': 1.0}, ignore_index=True)
  log_data = log_data.append({'Changes Made': 'model_2', 'Accuracy': 0.9569471624266145}, ignore_index=True)
