In [4]:

# Cell 1: Mount Google Drive and load the dataset
from google.colab import drive
drive.mount('/content/drive')
data_path = '/content/drive/MyDrive/fl1/diabetes.csv'
import pandas as pd
df = pd.read_csv(data_path)
df.head()

# Cell 2: Import necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt
import seaborn as sns

# Cell 3: Preprocess categorical columns
# Convert categorical columns to numerical
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
binary_columns = ['Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia',
                  'Genital thrush', 'visual blurring', 'Itching', 'Irritability',
                  'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity']
for col in binary_columns:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

# Convert target variable 'class' to numerical
df['class'] = df['class'].map({'Positive': 1, 'Negative': 0})

# Check for missing or infinite values
if df.isnull().sum().sum() > 0 or np.isinf(df).sum().sum() > 0:
    print("Warning: Dataset contains missing or infinite values. Handling them...")
    df = df.fillna(df.mean())  # Simple imputation for numerical columns
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

df.head()

# Cell 4: Define feature and target variables
X = df.drop(columns='class').values  # Features
Y = df['class'].values  # Target
feature_names = df.drop(columns='class').columns

# Standardize numerical features (e.g., Age)
scaler = StandardScaler()
X[:, 0] = scaler.fit_transform(X[:, 0].reshape(-1, 1)).flatten()  # Standardize Age column

# Cell 5: Feature Selection Functions
def select_features(X, y, method="anova", k=10, feature_names=None):
    if method == "chi2":
        # Ensure non-negative values for chi2
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
        selector = SelectKBest(score_func=chi2, k=k)
        X_new = selector.fit_transform(X_scaled, y)
        indices = selector.get_support(indices=True)
    elif method == "anova":
        selector = SelectKBest(score_func=f_classif, k=k)
        X_new = selector.fit_transform(X, y)
        indices = selector.get_support(indices=True)
    elif method == "correlation":
        # Correlation-based feature selection
        df_temp = pd.DataFrame(X, columns=feature_names)
        df_temp['target'] = y
        # Compute absolute correlation with target
        corr_with_target = df_temp.corr()['target'].abs().drop('target')
        # Sort by correlation and select top k
        top_features = corr_with_target.sort_values(ascending=False).head(k).index
        # Check for multicollinearity (remove one of highly correlated pairs)
        corr_matrix = df_temp[top_features].corr().abs()
        to_drop = set()
        for i in range(len(top_features)):
            for j in range(i + 1, len(top_features)):
                if corr_matrix.iloc[i, j] > 0.7:  # Threshold for multicollinearity
                    # Drop the feature with lower correlation to target
                    if corr_with_target[top_features[i]] > corr_with_target[top_features[j]]:
                        to_drop.add(top_features[j])
                    else:
                        to_drop.add(top_features[i])
        selected_features = [f for f in top_features if f not in to_drop][:k]
        indices = [list(feature_names).index(f) for f in selected_features]
        X_new = X[:, indices]
    else:
        raise ValueError("Unsupported FS method")
    return X_new, indices

# Cell 6: Feature Selection with Scores
def get_feature_scores(X, y, method="anova", k=10, feature_names=None):
    if method == "chi2":
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
        selector = SelectKBest(score_func=chi2, k=k)
        selector.fit(X_scaled, y)
        scores = selector.scores_
        indices = selector.get_support(indices=True)
    elif method == "anova":
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X, y)
        scores = selector.scores_
        indices = selector.get_support(indices=True)
    elif method == "correlation":
        df_temp = pd.DataFrame(X, columns=feature_names)
        df_temp['target'] = y
        corr_with_target = df_temp.corr()['target'].abs().drop('target')
        top_features = corr_with_target.sort_values(ascending=False).head(k).index
        corr_matrix = df_temp[top_features].corr().abs()
        to_drop = set()
        for i in range(len(top_features)):
            for j in range(i + 1, len(top_features)):
                if corr_matrix.iloc[i, j] > 0.7:
                    if corr_with_target[top_features[i]] > corr_with_target[top_features[j]]:
                        to_drop.add(top_features[j])
                    else:
                        to_drop.add(top_features[i])
        selected_features = [f for f in top_features if f not in to_drop][:k]
        indices = [list(feature_names).index(f) for f in selected_features]
        scores = [corr_with_target[f] for f in selected_features]
    else:
        raise ValueError("Unsupported method")
    return indices, scores

# Get top 10 features for ANOVA, Chi-squared, and Correlation
k = 10
anova_indices, anova_scores = get_feature_scores(X, Y, method="anova", k=k, feature_names=feature_names)
chi2_indices, chi2_scores = get_feature_scores(X, Y, method="chi2", k=k, feature_names=feature_names)
corr_indices, corr_scores = get_feature_scores(X, Y, method="correlation", k=k, feature_names=feature_names)

# Get selected feature names and scores
anova_features = [(feature_names[i], score) for i, score in zip(anova_indices, anova_scores)]
chi2_features = [(feature_names[i], score) for i, score in zip(chi2_indices, chi2_scores)]
corr_features = [(feature_names[i], score) for i, score in zip(corr_indices, corr_scores)]

# Print selected features with scores
print("\nTop 10 Features - ANOVA:")
for feature, score in anova_features:
    print(f"{feature}: {score:.4f}")

print("\nTop 10 Features - Chi-squared:")
for feature, score in chi2_features:
    print(f"{feature}: {score:.4f}")

print("\nTop 10 Features - Correlation:")
for feature, score in corr_features:
    print(f"{feature}: {score:.4f}")


# Cell 7: Feature Extraction Function
def extract_features(X, y, method="lda", n_components=1):
    if method == "pca":
        extractor = PCA(n_components=n_components)
        X_new = extractor.fit_transform(X)
    elif method == "lda":
        n_components = min(n_components, len(np.unique(y))-1)  # Ensure valid n_components for LDA
        extractor = LDA(n_components=n_components)
        X_new = extractor.fit_transform(X, y)
    else:
        raise ValueError("Unsupported FE method")
    return X_new

# Cell 8: MLP Model Definition
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 100)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(100, 2)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

# Cell 9: Training Function
def train(model, loader, criterion, optimizer, epochs=20):
    model.train()
    for _ in range(epochs):
        for X_batch, y_batch in loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()

# Cell 10: Model Aggregation Function
def average_models(models):
    new_model = MLP(models[0].fc1.in_features)
    new_state_dict = new_model.state_dict()
    for key in new_state_dict:
        new_state_dict[key] = torch.stack([m.state_dict()[key] for m in models], 0).mean(0)
    new_model.load_state_dict(new_state_dict)
    return new_model

# Cell 11: Evaluation Function
def evaluate(model, loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            output = model(X_batch)
            preds = torch.argmax(output, dim=1)
            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0)
    }

# Cell 12: Federated Learning Pipeline
fs_methods = ["chi2", "anova", "correlation"]
fe_methods = ["pca", "lda"]
k_features = 10
results = []

for fs_method in fs_methods:
    for fe_method in fe_methods:
        print(f"\nRunning FS: {fs_method.upper()} + FE: {fe_method.upper()}")
        try:
            # Split data into 3 clients
            client_splits = np.array_split(np.random.permutation(len(X)), 3)
            client_datasets = []

            for split in client_splits:
                X_client = X[split]
                Y_client = Y[split]
                X_fs, _ = select_features(X_client, Y_client, method=fs_method, k=k_features, feature_names=feature_names)
                n_components = 5 if fe_method == "pca" else 1  # Adjust based on method
                X_fe = extract_features(X_fs, Y_client, method=fe_method, n_components=n_components)
                X_train, X_val, y_train, y_val = train_test_split(X_fe, Y_client, test_size=0.2, random_state=42)
                train_loader = DataLoader(TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                                        torch.tensor(y_train, dtype=torch.long)), batch_size=16, shuffle=True)
                val_loader = DataLoader(TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                                                      torch.tensor(y_val, dtype=torch.long)), batch_size=64, shuffle=False)
                client_datasets.append((train_loader, val_loader))

            # Train local models
            local_models = []
            for train_loader, _ in client_datasets:
                model = MLP(n_components)  # Use n_components as input_dim
                optimizer = optim.SGD(model.parameters(), lr=0.01)
                criterion = nn.CrossEntropyLoss()
                train(model, train_loader, criterion, optimizer)
                local_models.append(model)

            # Aggregate global model
            global_model = average_models(local_models)

            # Evaluate on combined validation data
            combined_X, combined_y = [], []
            for _, val_loader in client_datasets:
                for X_batch, y_batch in val_loader:
                    combined_X.append(X_batch)
                    combined_y.append(y_batch)
            X_all = torch.cat(combined_X)
            y_all = torch.cat(combined_y)
            val_loader = DataLoader(TensorDataset(X_all, y_all), batch_size=64)
            metrics = evaluate(global_model, val_loader)
            results.append({
                "FS": fs_method.upper(),
                "FE": fe_method.upper(),
                "Accuracy": round(metrics['accuracy'], 4),
                "Precision": round(metrics['precision'], 4),
                "Recall": round(metrics['recall'], 4),
                "F1 Score": round(metrics['f1'], 4),
            })
            print(f"Completed FS: {fs_method.upper()}, FE: {fe_method.upper()}")
        except Exception as e:
            print(f"Error for FS: {fs_method.upper()}, FE: {fe_method.upper()}: {str(e)}")

# Display results
print("\nSummary Results Table:")
df_results = pd.DataFrame(results)
print(df_results.to_string(index=False))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Top 10 Features - ANOVA:
Gender: 5.2086
Polyuria: 130.9688
Polydipsia: 412.7384
sudden weight loss: 376.4226
weakness: 121.9737
Polyphagia: 32.5851
visual blurring: 68.8418
Irritability: 6.3782
partial paresis: 34.9178
Alopecia: 0.0928

Top 10 Features - Chi-squared:
Gender: 0.2110
Polyuria: 38.7476
Polydipsia: 116.1846
sudden weight loss: 120.7855
weakness: 57.7493
Polyphagia: 12.7243
visual blurring: 33.1984
Irritability: 4.9140
partial paresis: 18.1246
Alopecia: 0.0478

Top 10 Features - Correlation:
Polyuria: 0.6659
Polydipsia: 0.6487
Gender: 0.4492
sudden weight loss: 0.4366
partial paresis: 0.4323
Polyphagia: 0.3425
Irritability: 0.2995
Alopecia: 0.2675
visual blurring: 0.2513
weakness: 0.2433

Running FS: CHI2 + FE: PCA
Completed FS: CHI2, FE: PCA

Running FS: CHI2 + FE: LDA
Completed FS: CHI2, FE: LDA

Running FS: ANOVA + FE: PCA
Completed FS: ANOVA,