In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from statsmodels.stats.weightstats import ztest
df = pd.read_csv("/content/diabetes_data_upload.csv")
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
X = df.drop(columns=["class"])
y = df["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier()
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    type_i_error = fp / (fp + tn)
    type_ii_error = fn / (fn + tp)

    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "Type I Error": type_i_error,
        "Type II Error": type_ii_error
    }
results_df = pd.DataFrame(results).T
print(results_df)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
misclassified_indices = (y_test != y_pred)
correctly_classified_ages = X_test.loc[~misclassified_indices, "Age"]
misclassified_ages = X_test.loc[misclassified_indices, "Age"]
z_stat, p_value = ztest(correctly_classified_ages, misclassified_ages)
print(f"Z-Test for Mean Age Difference: Z-Statistic={z_stat}, P-Value={p_value}")
rf_fp_rate = results["Random Forest"]["Type I Error"]
if rf_fp_rate > 0.20:
    z_stat, p_value = ztest([rf_fp_rate], value=0.20)
    print(f"Z-Test for Type I Error in Random Forest: Z-Statistic={z_stat}, P-Value={p_value}")
z_stat, p_value = ztest([results["SVM"]["Type II Error"]], [results["KNN"]["Type II Error"]])
print(f"Z-Test for Type II Error between SVM and KNN: Z-Statistic={z_stat}, P-Value={p_value}")


                     Accuracy  Precision    Recall  Type I Error  \
Logistic Regression  0.923077   0.931507  0.957746      0.151515   
Decision Tree        0.961538   1.000000  0.943662      0.000000   
Random Forest        1.000000   1.000000  1.000000      0.000000   
Gradient Boosting    0.971154   1.000000  0.957746      0.000000   
SVM                  0.682692   0.682692  1.000000      1.000000   
KNN                  0.855769   0.951613  0.830986      0.090909   

                     Type II Error  
Logistic Regression       0.042254  
Decision Tree             0.056338  
Random Forest             0.000000  
Gradient Boosting         0.042254  
SVM                       0.000000  
KNN                       0.169014  
Z-Test for Mean Age Difference: Z-Statistic=1.7686101020183942, P-Value=0.07695896375275615
Z-Test for Type II Error between SVM and KNN: Z-Statistic=nan, P-Value=nan


  var /= nobs1 + nobs2 - 2 * ddof
