In [15]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, make_scorer, recall_score
import time

# Load the raw dataset; the relative path is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer="Week6_7/diabetes_data.csv")

def create_target_label(row):
    """Return the combined health label for one record.

    ``row`` is indexed by the keys ``'DiabeticClass'`` and ``'Obesity'``.
    The result is ``'Both'``, ``'OnlyDiabetes'``, ``'OnlyObesity'`` or
    ``'Healthy'``; any combination that is not one of the three explicit
    cases falls through to ``'Healthy'``.
    """
    diabetic_class = row['DiabeticClass']

    if diabetic_class == 'Positive':
        obesity = row['Obesity']
        if obesity == 'Yes':
            return 'Both'
        if obesity == 'No':
            return 'OnlyDiabetes'
        # Positive with an unrecognised obesity value is treated as the default.
        return 'Healthy'

    if diabetic_class == 'Negative' and row['Obesity'] == 'Yes':
        return 'OnlyObesity'

    return 'Healthy'

# Build the four-way target column by applying the labelling function row by row.
data['HealthStatus'] = data.apply(create_target_label, axis=1)

# Columns used as model inputs.
features = [
    'Age', 'Gender', 'ExcessUrination', 'Polydipsia', 'WeightLossSudden',
    'Fatigue', 'Polyphagia', 'GenitalThrush', 'BlurredVision', 'Itching',
    'Irritability', 'DelayHealing', 'PartialPsoriasis', 'MuscleStiffness',
    'Alopecia',
]

# Gender is encoded first so the Yes/No pass below no longer sees it as text.
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

# Every remaining object-typed feature column is mapped from Yes/No to 1/0.
for col in features:
    if data[col].dtype != 'object':
        continue
    data[col] = data[col].map({'Yes': 1, 'No': 0})

x, y = data[features], data['HealthStatus']

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2024,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# model = RandomForestClassifier(n_estimators=200, 
#                              max_depth=10,
#                              random_state=2024)
# model.fit(x_train, y_train)

# params = {
#     "n_estimators": [50, 100, 200, 500],
#     "criterion": ["gini", "entropy", "log_loss"],
#     "max_depth": [None, 2, 5, 10]
# }

# model = GridSearchCV(RandomForestClassifier(random_state=2024), param_grid=params, scoring="accuracy", cv=6, verbose=1, n_jobs=-1)
# model.fit(x_train, y_train)

# print("Best score: {}".format(model.best_score_))
# print("Best param: {}".format(model.best_params_))

# y_predict = model.predict(x_test)
# print(classification_report(y_test, y_predict))

# Hyper-parameter grids searched for each model.
rf_params = dict(
    n_estimators=[50, 100, 200, 500],
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[None, 2, 5, 10],
)

svm_params = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.1, 0.01],
    kernel=['rbf', 'linear'],
)

# Function to train and evaluate model
def train_and_evaluate(model, params, model_name, *, scoring="accuracy", cv=6, n_jobs=4):
    """Grid-search ``model`` over ``params`` and report its test-set performance.

    The search is fitted on the module-level ``x_train`` / ``y_train`` and the
    refitted best estimator is evaluated on the module-level ``x_test`` /
    ``y_test``; those names must exist before this function is called.

    Args:
        model: Estimator instance passed to ``GridSearchCV``.
        params: Parameter grid passed as ``param_grid``.
        model_name: Label used in the printed progress and result messages.
        scoring: Scoring passed to ``GridSearchCV`` (keyword-only).
        cv: Cross-validation setting passed to ``GridSearchCV`` (keyword-only).
        n_jobs: Parallel job count passed to ``GridSearchCV`` (keyword-only).

    Returns:
        A ``(grid_search, y_pred, train_time)`` tuple: the fitted search
        object, its predictions for ``x_test``, and the elapsed seconds spent
        constructing and fitting the search.
    """
    print(f"\nTraining {model_name}...")
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Grid search
    grid_search = GridSearchCV(
        model,
        param_grid=params,
        scoring=scoring,
        cv=cv,
        verbose=0,
        n_jobs=n_jobs,
    )

    # Fit model
    grid_search.fit(x_train, y_train)

    # Training time (covers the whole search, not a single fit)
    train_time = time.perf_counter() - start_time

    # Make predictions with the best estimator found by the search
    y_pred = grid_search.predict(x_test)

    # Print results
    print(f"\n{model_name} Results:")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best score: {grid_search.best_score_:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return grid_search, y_pred, train_time

# Build both classifiers with the same seed, then run the grid search for each.
rf_model = RandomForestClassifier(random_state=2024)
svm_model = SVC(probability=True, random_state=2024)

# Each result is the (grid_search, y_pred, train_time) tuple returned by train_and_evaluate.
rf_results = train_and_evaluate(
    model=rf_model,
    params=rf_params,
    model_name="Random Forest",
)
svm_results = train_and_evaluate(
    model=svm_model,
    params=svm_params,
    model_name="Support Vector Machine",
)




Training Random Forest...

Random Forest Results:
Best parameters: {'criterion': 'gini', 'max_depth': None, 'n_estimators': 50}
Best score: 0.9135
Training time: 8.60 seconds

Classification Report:
              precision    recall  f1-score   support

        Both       0.73      0.85      0.79        13
     Healthy       0.90      0.97      0.93        29
OnlyDiabetes       0.96      0.87      0.91        55
 OnlyObesity       0.88      1.00      0.93         7

    accuracy                           0.90       104
   macro avg       0.87      0.92      0.89       104
weighted avg       0.91      0.90      0.90       104


Training Support Vector Machine...

Support Vector Machine Results:
Best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best score: 0.8919
Training time: 3.29 seconds

Classification Report:
              precision    recall  f1-score   support

        Both       0.65      0.85      0.73        13
     Healthy       0.97      0.97      0.97        29
Only