In [3]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, make_scorer, recall_score
import time

data = pd.read_csv("Week6_7/stroke_classification.csv")
data['gender'] = data['gender'].map({'Male': 0, 'Female': 1})

target = "stroke"
redundant = "pat_id"
x = data.drop([target, redundant], axis=1)
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OrdinalEncoder()),
])
					 		
preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ['age', 'avg_glucose_level', 'bmi']),
    ("nom_features", nom_transformer, ['gender', 'hypertension', 'heart_disease', 'work_related_stress', 'urban_residence', 'smokes']),
])

# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', RandomForestClassifier(random_state=2024))
# ])

# params = {
#     "classifier__n_estimators": [50, 100, 200, 500],
#     "classifier__criterion": ["gini", "entropy", "log_loss"],
#     "classifier__max_depth": [None, 2, 5, 10]
# }

# model = GridSearchCV(pipeline, param_grid=params, scoring="recall", cv=6, verbose=1, n_jobs=4)
# model.fit(x_train, y_train)

# print("Best score: {}".format(model.best_score_))
# print("Best param: {}".format(model.best_params_))

# y_predict = model.predict(x_test)
# print(classification_report(y_test, y_predict))

# Define parameters for both models
rf_params = {
    "classifier__n_estimators": [50, 100, 200, 500],
    "classifier__criterion": ["gini", "entropy", "log_loss"],
    "classifier__max_depth": [None, 2, 5, 10]
}

svm_params = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__gamma': ['scale', 'auto', 0.1, 0.01],
    'classifier__kernel': ['rbf', 'linear']
}

# Function to train and evaluate model
def train_and_evaluate(model, params, model_name):
    print(f"\nTraining {model_name}...")
    start_time = time.time()
    
    # Grid search
    grid_search = GridSearchCV(
        model, 
        param_grid=params, 
        scoring="recall",
        cv=6, 
        verbose=0, 
        n_jobs=4
    )
    
    # Fit model
    grid_search.fit(x_train, y_train)
    
    # Training time
    train_time = time.time() - start_time
    
    # Make predictions
    y_pred = grid_search.predict(x_test)
    
    # Print results
    print(f"\n{model_name} Results:")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best score: {grid_search.best_score_:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return grid_search, y_pred, train_time

# Train and evaluate both models
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=2024))
])
svm_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=2024, probability=True))
])

rf_results = train_and_evaluate(rf_model, rf_params, "Random Forest")
svm_results = train_and_evaluate(svm_model, svm_params, "Support Vector Machine")


Training Random Forest...

Random Forest Results:
Best parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__n_estimators': 100}
Best score: 0.0152
Training time: 20.86 seconds

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.20      0.02      0.04        50

    accuracy                           0.95      1022
   macro avg       0.58      0.51      0.50      1022
weighted avg       0.92      0.95      0.93      1022


Training Support Vector Machine...

Support Vector Machine Results:
Best parameters: {'classifier__C': 100, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}
Best score: 0.0101
Training time: 614.57 seconds

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       972
           1       0.12      0.02      0.03        50

    accuracy                