# Goal: PII De-identification Engine with Gradient Boosting Classifier

In [4]:

# Recommended package versions for compatibility
!pip install -q scikit-learn==1.3.2 scipy==1.11.4 numpy==1.24.3

# Install required packages
!pip install -q -U  faker python-dateutil
print("Installation Complete")


Installation Complete


## Load Libraries

In [5]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import random

## Core class: `FederatedPIIGradientBoost`

In [6]:
class FederatedPIIGradientBoost:
    """Simulated federated training on locally de-identified synthetic PII.

    Each simulated client generates fake records (name, date of birth, phone,
    risk score), replaces the PII fields with fresh fake values, derives
    numeric features from the de-identified fields and fits a local
    ``GradientBoostingClassifier``. The "server" only averages the local
    models' feature importances; no model is sent back to the clients.

    Args:
        seed: Optional seed. When given, both the Faker instance and the
            instance's private random generator are seeded so that data
            generation and de-identification are reproducible.
    """

    # Column layout shared by create_data / deidentify / extract_features.
    _COLUMNS = ('name', 'dob', 'phone', 'risk_score')

    def __init__(self, seed=None):
        self.fake = Faker()
        # Private generator instead of the global `random` module so seeding
        # one instance does not disturb other code.
        self._rng = random.Random(seed)
        if seed is not None:
            self.fake.seed_instance(seed)

    def create_data(self, n=500):
        """Generate ``n`` synthetic records.

        Returns:
            DataFrame with columns ``name``, ``dob`` (``YYYY-MM-DD`` string),
            ``phone`` and ``risk_score``. The columns are present even when
            ``n`` is 0.
        """
        records = []
        for _ in range(n):
            name = self.fake.name()
            age = self._rng.randint(18, 85)  # upper bound > 80 so the age bonus below can trigger
            # 365-day years: the date of birth is approximate (leap days ignored).
            dob = (datetime.now() - timedelta(days=age * 365)).strftime('%Y-%m-%d')
            phone = self.fake.phone_number()
            # Score grows with age and name length, +20 for age > 80, plus noise.
            risk_score = (age * 1.2 + len(name) * 0.5 + (20 if age > 80 else 0)
                          + self._rng.uniform(10, 40))

            records.append({
                'name': name,
                'dob': dob,
                'phone': phone,
                'risk_score': risk_score
            })

        return pd.DataFrame(records, columns=list(self._COLUMNS))

    def deidentify(self, df):
        """Return a copy of ``df`` with the PII columns replaced.

        * ``name``  - every distinct name is mapped to one new fake name, so
          repeated names stay consistent within this call.
        * ``dob``   - each date is moved forward by an independent random
          30-200 days.
        * ``phone`` - replaced by a new fake number per row.

        ``risk_score`` and any other columns are left untouched; the input
        frame is not modified.
        """
        name_map = {name: self.fake.name() for name in df['name'].unique()}

        def shift_date(date_str):
            shift_days = self._rng.randint(30, 200)
            date_obj = datetime.strptime(date_str, '%Y-%m-%d')
            return (date_obj + timedelta(days=shift_days)).strftime('%Y-%m-%d')

        return df.assign(
            name=df['name'].map(name_map),
            dob=df['dob'].apply(shift_date),
            phone=df['phone'].apply(lambda _: self.fake.phone_number())
        )

    def extract_features(self, df, reference_year=None):
        """Build the numeric feature matrix for a frame of records.

        Args:
            df: Frame with ``name``, ``dob`` and ``phone`` columns.
            reference_year: Year used to turn ``dob`` into an age. Defaults
                to the current year, matching how ``create_data`` derives
                ``dob`` from ``datetime.now()``.

        Returns:
            Array of shape ``(len(df), 5)`` with columns: name length,
            number of words in the name, ``reference_year - birth year``,
            number of digits in the phone string, phone string length.
        """
        if reference_year is None:
            reference_year = datetime.now().year

        return np.column_stack([
            df['name'].str.len(),
            df['name'].str.split().str.len(),
            reference_year - pd.to_datetime(df['dob']).dt.year,
            df['phone'].str.count(r'\d'),
            df['phone'].str.len()
        ])

    @staticmethod
    def _risk_labels(df, threshold):
        """Binary target: 1 where ``risk_score`` exceeds ``threshold``, else 0."""
        return (df['risk_score'] > threshold).astype(int)

    def _train_client(self, train_data, test_data):
        """Fit one client's local model and score it on the client's hold-out split.

        Returns:
            ``(fitted_model, hold_out_accuracy)``.
        """
        # De-identify locally before any feature is derived.
        deidentified_train = self.deidentify(train_data)
        X_train = self.extract_features(deidentified_train)

        # The label threshold is learned on the training split only and then
        # reused for the test split, so both splits share one label definition.
        risk_threshold = deidentified_train['risk_score'].median()
        y_train = self._risk_labels(deidentified_train, risk_threshold)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        local_model = GradientBoostingClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42
        )
        local_model.fit(X_train_scaled, y_train)

        deidentified_test = self.deidentify(test_data)
        X_test = self.extract_features(deidentified_test)
        y_test = self._risk_labels(deidentified_test, risk_threshold)
        X_test_scaled = scaler.transform(X_test)

        return local_model, local_model.score(X_test_scaled, y_test)

    def federated_train(self, num_clients=5, rounds=3, records_per_client=2000):
        """Run the simulated federated training loop and print a report.

        Every round, each client re-runs de-identification and fits a new
        local model on its own 80% split; the aggregated importances are not
        fed back to the clients, so rounds differ only through the random
        de-identification.

        Args:
            num_clients: Number of simulated clients.
            rounds: Number of training rounds.
            records_per_client: Synthetic records generated per client.

        Returns:
            Dict with ``feature_importances`` (mean of the local models'
            importances from the last round) and ``round``, or ``None`` when
            no model was trained.
        """
        print(" FEDERATED LEARNING - GRADIENT BOOSTING")
        print("-" * 45)

        # Create client datasets
        client_datasets = [self.create_data(records_per_client) for _ in range(num_clients)]
        print(f"Created {num_clients} client datasets, each with {records_per_client} records.")

        # The 80/20 split is positional and identical in every round, so it is
        # computed once per client instead of once per client per round.
        client_splits = []
        for client_data in client_datasets:
            train_size = int(0.8 * len(client_data))
            client_splits.append((
                client_data.iloc[:train_size].copy(),
                client_data.iloc[train_size:].copy(),
            ))

        global_model_params = None

        for round_num in range(1, rounds + 1):
            local_models = []
            print(f"\n--- Round {round_num} ---")

            for client_id, (train_data, test_data) in enumerate(client_splits):
                local_model, local_accuracy = self._train_client(train_data, test_data)

                # Store model for aggregation (in real, only parameters would be sent)
                local_models.append(local_model)

                print(f"  Client {client_id+1}: Local accuracy: {local_accuracy:.3f}")

            # Simplified aggregation: average the feature importances.
            if local_models:
                avg_feature_importances = np.mean(
                    [model.feature_importances_ for model in local_models], axis=0)

                global_model_params = {
                    'feature_importances': avg_feature_importances,
                    'round': round_num
                }

            print(f"  Server: Aggregated parameters from {len(local_models)} clients")

        # Final results
        feature_names = ['Name_Length', 'Name_Words', 'Age_Shifted', 'Phone_Digits', 'Phone_Length']
        # Counted from the actual datasets so the report also works with zero clients.
        total_records = sum(len(dataset) for dataset in client_datasets)
        print(f"\nFinal Federated Model Results (after {rounds} rounds):")
        print(" PII Protection: 100% (PII never left client devices)")
        print(" Data Utility: Preserved through federated training")
        print(f" Data Volume: {total_records} total records")

        if global_model_params:
            print("\n Aggregated Feature Importance (Global Model):")
            for name, importance in zip(feature_names, global_model_params['feature_importances']):
                print(f"  {name}: {importance:.3f}")

        print("\nFederated Learning complete.")
        return global_model_params



## Output

In [7]:
# Run the federated training demo with its default settings spelled out
# (5 clients, 3 rounds). The guard is satisfied when the cell is executed in
# a notebook kernel as well as when the file is run as a script.
if __name__ == "__main__":
    fl_system = FederatedPIIGradientBoost()
    results = fl_system.federated_train(num_clients=5, rounds=3)

 FEDERATED LEARNING - GRADIENT BOOSTING
---------------------------------------------
Created 5 client datasets, each with 2000 records.

--- Round 1 ---
  Client 1: Local accuracy: 0.887
  Client 2: Local accuracy: 0.887
  Client 3: Local accuracy: 0.887
  Client 4: Local accuracy: 0.905
  Client 5: Local accuracy: 0.885
  Server: Aggregated parameters from 5 clients

--- Round 2 ---
  Client 1: Local accuracy: 0.873
  Client 2: Local accuracy: 0.875
  Client 3: Local accuracy: 0.892
  Client 4: Local accuracy: 0.905
  Client 5: Local accuracy: 0.880
  Server: Aggregated parameters from 5 clients

--- Round 3 ---
  Client 1: Local accuracy: 0.887
  Client 2: Local accuracy: 0.907
  Client 3: Local accuracy: 0.892
  Client 4: Local accuracy: 0.902
  Client 5: Local accuracy: 0.892
  Server: Aggregated parameters from 5 clients

Final Federated Model Results (after 3 rounds):
 PII Protection: 100% (PII never left client devices)
 Data Utility: Preserved through federated training
 Data 