In [19]:
import pandas as pd, numpy as np
from scipy.stats import skew, kurtosis
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, roc_auc_score

# ---------------- Load Data ----------------
# Both CSV files are addressed by relative path, so they are resolved against
# the current working directory.
uci, pima = (
    pd.read_csv(csv_path)
    for csv_path in (
        "diabetes_merged_date-time-sorted-includes-patient-id.csv",
        "PIMAdiabetes.csv",
    )
)

# Label -> DataFrame mapping iterated by every analysis step below.
datasets = dict(UCI=uci, Pima=pima)

# ---------------- a) Univariate ----------------
def univariate(df, name):
    """Print descriptive statistics for every numeric column of ``df``.

    For each numeric column the mean, median, first mode, variance, standard
    deviation, skewness and kurtosis are printed. Missing values are ignored.
    A column without any non-missing value reports ``nan`` for the mode, skew
    and kurtosis instead of raising.

    Args:
        df: DataFrame to summarise.
        name: Label printed in the section header.
    """
    print(f"\n=== {name} Univariate ===")
    for col in df.select_dtypes(include=np.number).columns:
        # Drop missing values once; pandas' reducers skip them anyway, and
        # scipy's skew/kurtosis need them removed explicitly.
        values = df[col].dropna()
        has_data = not values.empty
        # mode() returns an empty Series for an all-missing column, so the
        # first element must not be indexed unconditionally.
        modes = values.mode()

        print(f"\n{col}:")
        print("Mean:", values.mean())
        print("Median:", values.median())
        print("Mode:", modes.iloc[0] if has_data else np.nan)
        print("Var:", values.var())
        print("Std:", values.std())
        print("Skew:", skew(values) if has_data else np.nan)
        print("Kurtosis:", kurtosis(values) if has_data else np.nan)

# Print the univariate summary for each entry of `datasets` (label -> DataFrame).
for name,df in datasets.items():
    univariate(df,name)

# ---------------- b) Bivariate ----------------
def bivariate(df, name):
    """Fit and report one simple linear and one simple logistic regression.

    The dataset flavour is detected from its column names, ignoring case:

    * an ``Outcome`` column selects BMI -> Glucose (linear) and
      Glucose -> Outcome (logistic);
    * a ``Y`` column selects bmi -> s5 (linear) and bmi -> Y (logistic).

    Each model is trained on an 80/20 split with ``random_state=0``. Rows with
    missing values in the two columns a model uses are dropped before the
    split. The logistic model is only fitted when its target has exactly two
    distinct values.

    Args:
        df: DataFrame holding the predictor and target columns.
        name: Label printed in the section header.

    Returns:
        None. Results are printed. If the target or a required column cannot
        be found, a message is printed and the function returns early.
    """
    print(f"\n=== {name} Bivariate ===")

    # Map lower-cased names back to the real labels so that detection and
    # indexing agree whatever capitalisation the file uses.
    actual = {str(c).lower(): c for c in df.columns}

    # (linear X, linear y, logistic X, logistic y), as lower-cased names.
    if "outcome" in actual:
        wanted = ("bmi", "glucose", "glucose", "outcome")
    elif "y" in actual:
        wanted = ("bmi", "s5", "bmi", "y")   # example: bmi -> s5
    else:
        print("Could not detect target column!")
        return

    missing = sorted({w for w in wanted if w not in actual})
    if missing:
        print("Missing required column(s):", ", ".join(missing))
        return
    lin_X, lin_y, log_X, log_y = (actual[w] for w in wanted)

    # Linear regression (the estimators reject NaN, so drop incomplete rows).
    pair = df[[lin_X, lin_y]].dropna()
    Xtr, Xte, ytr, yte = train_test_split(
        pair[[lin_X]], pair[lin_y], test_size=0.2, random_state=0
    )
    lm = LinearRegression().fit(Xtr, ytr)
    print(f"Linear ({lin_X}->{lin_y}) R2:", r2_score(yte, lm.predict(Xte)))

    # Logistic regression (only if the target is binary).
    pair = df[[log_X, log_y]].dropna()
    if pair[log_y].nunique() == 2:
        Xtr, Xte, ytr, yte = train_test_split(
            pair[[log_X]], pair[log_y], test_size=0.2, random_state=0
        )
        clf = LogisticRegression(max_iter=1000).fit(Xtr, ytr)
        print(f"Logistic ({log_X}->{log_y}) Acc:", accuracy_score(yte, clf.predict(Xte)))
        # Column 1 of predict_proba is the probability of the second class.
        print("AUC:", roc_auc_score(yte, clf.predict_proba(Xte)[:, 1]))
    else:
        print("Target not binary → skipping logistic regression.")

# Run the bivariate models for each entry of `datasets` (label -> DataFrame).
for name,df in datasets.items():
    bivariate(df,name)

# ---------------- c) Multiple Regression ----------------
def multiple(df,name):
    """Fit a multi-feature model predicting the dataset's target column.

    The target is the column named ``Outcome`` or, failing that, ``Y``
    (matched ignoring case). All remaining numeric columns are used as
    features; non-numeric columns and rows with missing values are excluded
    because the estimators cannot consume them. The model is trained on an
    80/20 split with ``random_state=0``.

    A target with exactly two distinct values is modelled with logistic
    regression (accuracy and AUC are printed). Any other target is modelled
    with linear regression (R2 and RMSE are printed).

    Args:
        df: DataFrame holding features and the target column.
        name: Label printed in the section header.

    Returns:
        None. Results are printed. If no target or no numeric feature column
        is found, a message is printed and the function returns early.
    """
    print(f"\n=== {name} Multiple Regression ===")

    # Map lower-cased names back to the real labels so that detection and
    # indexing agree whatever capitalisation the file uses.
    actual = {str(c).lower(): c for c in df.columns}
    if "outcome" in actual:
        target = actual["outcome"]
    elif "y" in actual:
        target = actual["y"]
    else:
        print("Target not found.")
        return

    features = df.drop(columns=[target]).select_dtypes(include=np.number)
    if features.shape[1] == 0:
        print("No numeric feature columns available.")
        return

    # Keep only rows that are complete in every feature and in the target.
    complete = features.notna().all(axis=1) & df[target].notna()
    X, y = features[complete], df.loc[complete, target]
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=0)

    if y.nunique() == 2:  # classification
        clf = LogisticRegression(max_iter=2000).fit(Xtr, ytr)
        print("Accuracy:", accuracy_score(yte, clf.predict(Xte)))
        print("AUC:", roc_auc_score(yte, clf.predict_proba(Xte)[:, 1]))
    else:  # regression
        lm = LinearRegression().fit(Xtr, ytr)
        predicted = lm.predict(Xte)
        print("R2:", r2_score(yte, predicted))
        # The `squared=False` keyword is deprecated/removed in recent
        # scikit-learn releases; taking the root explicitly works everywhere.
        print("RMSE:", np.sqrt(mean_squared_error(yte, predicted)))

# Fit the multi-feature model for each entry of `datasets` (label -> DataFrame).
for name,df in datasets.items():
    multiple(df,name)

# ---------------- d) Comparison ----------------
# NOTE(review): this summary is fixed text, not derived from the metrics
# computed earlier in the script — confirm each statement against the actual
# run output (e.g. that the UCI file really contains a continuous `Y` column).
print(
    "\n=== Comparison Summary ===",
    "- UCI dataset uses regression (target Y is continuous).",
    "- Pima dataset is binary classification (Outcome 0/1).",
    "- Linear regression fits UCI better, Logistic regression fits Pima better.",
    sep="\n",
)



=== UCI Univariate ===

patient_id:
Mean: 36.51611459265891
Median: 34.0
Mode: 55
Var: 404.79125919807706
Std: 20.119424922151158
Skew: -0.0584245700012577
Kurtosis: -1.1086049214631442

code:
Mean: 46.37387232284278
Median: 48.0
Mode: 33
Var: 177.53302208326696
Std: 13.324151833541487
Skew: 0.16126658251045617
Kurtosis: -1.792406954081909

value:
Mean: 0.5031678259073067
Median: 0.0
Mode: 0
Var: 0.6372441913161795
Std: 0.7982757614484981
Skew: 1.5450097815876316
Kurtosis: 1.733812011563705

=== Pima Univariate ===

Pregnancies:
Mean: 3.8450520833333335
Median: 3.0
Mode: 1
Var: 11.35405632062142
Std: 3.3695780626988623
Skew: 0.8999119408414357
Kurtosis: 0.15038273760948462

Glucose:
Mean: 120.89453125
Median: 117.0
Mode: 99
Var: 1022.2483142519557
Std: 31.97261819513622
Skew: 0.17341395519987735
Kurtosis: 0.6288133337300685

BloodPressure:
Mean: 69.10546875
Median: 72.0
Mode: 70
Var: 374.6472712271838
Std: 19.355807170644777
Skew: -1.8400052311728738
Kurtosis: 5.138690662525438

SkinT