# Goal: PII De-identification Engine with Gradient Boosting Classifier

In [1]:
# Install required packages
!pip install -q -U pandas numpy faker python-dateutil scikit-learn
print("Installation Complete")


Installation Complete


## Load Libraries

In [2]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import random

## Core class

In [14]:
class FederatedPIIGradientBoost:
    """Synthetic PII de-identification demo scored with a gradient boosting model.

    The workflow is: generate fake records (name, dob, phone, risk_score),
    replace the PII columns with surrogate values, derive numeric features
    from the de-identified columns, and repeatedly fit a
    GradientBoostingClassifier on random 80% subsets ("clients") to predict
    whether a record's risk_score is above the median.
    """

    def __init__(self, seed=None):
        """Create the Faker generator, the classifier and the feature scaler.

        Args:
            seed: Optional integer. When given, the Faker instance and the
                global ``random`` / ``numpy.random`` generators are seeded so
                repeated runs produce the same data and client samples.
                ``None`` (the default) leaves all generators unseeded.
        """
        self.fake = Faker()
        self.model = GradientBoostingClassifier(n_estimators=100, max_depth=4, learning_rate=0.96, random_state=42)
        self.scaler = StandardScaler()
        if seed is not None:
            self.fake.seed_instance(seed)
            random.seed(seed)
            np.random.seed(seed)

    ##  Generate TEST (healthcare data) with 4 features only for Adults 18 and over.
    def create_data(self, n=500):
        """Build ``n`` synthetic records.

        Args:
            n: Number of records to generate.

        Returns:
            DataFrame with columns ``name``, ``dob`` (``YYYY-MM-DD`` string),
            ``phone`` and ``risk_score``. The columns are present even when
            ``n`` is 0.
        """
        records = []
        for _ in range(n):
            name = self.fake.name()
            age = random.randint(18, 80)
            # Age is converted to a birth date using 365-day years (leap days ignored).
            dob = (datetime.now() - timedelta(days=age * 365)).strftime('%Y-%m-%d')
            phone = self.fake.phone_number()
            # NOTE(review): randint(18, 80) is inclusive and never exceeds 80, so the
            # +20 bonus for ``age > 80`` currently never applies - confirm the
            # intended threshold before changing it.
            risk_score = age * 1.2 + len(name) * 0.5 + (20 if age > 80 else 0) + random.uniform(10, 40)
            records.append({
                'name': name,
                'dob': dob,
                'phone': phone,
                'risk_score': risk_score,
            })
        # Explicit columns keep the schema stable for an empty result.
        return pd.DataFrame(records, columns=['name', 'dob', 'phone', 'risk_score'])

    ## De-identify PII while preserving data relationships
    def deidentify(self, df):
        """Return a copy of ``df`` with name, dob and phone replaced.

        * ``name``: every distinct name is mapped to one generated surrogate,
          so equal input names stay equal in the output.
        * ``dob``: every date is shifted forward by the same 100 days.
        * ``phone``: each row receives a freshly generated number.

        All other columns (e.g. ``risk_score``) are carried over unchanged and
        the input frame is not modified.
        """
        name_map = {name: self.fake.name() for name in df['name'].unique()}

        def shift_dob(value):
            shifted = datetime.strptime(value, '%Y-%m-%d') + timedelta(days=100)
            return shifted.strftime('%Y-%m-%d')

        return df.assign(
            name=df['name'].map(name_map),
            dob=df['dob'].apply(shift_dob),
            phone=df['phone'].apply(lambda _: self.fake.phone_number())
        )

    ## Extract optimized features from 4 core fields: Name, DoB, Phone, Risk
    def extract_features(self, df, reference_year=None):
        """Turn the name, dob and phone columns into a numeric feature matrix.

        Args:
            df: DataFrame with string columns ``name``, ``dob`` and ``phone``.
            reference_year: Year used to turn a birth year into an age.
                Defaults to the current calendar year, matching how
                ``create_data`` derives birth dates from ``datetime.now()``.

        Returns:
            2-D array with one row per record and five columns: name length,
            name word count, age in years, phone digit count, phone length.
        """
        if reference_year is None:
            reference_year = datetime.now().year
        return np.column_stack([
            df['name'].str.len(),                                    # Name length
            df['name'].str.split().str.len(),                       # Name word count
            reference_year - pd.to_datetime(df['dob']).dt.year,     # Age from DoB
            df['phone'].str.count(r'\d'),                           # Phone digit count
            df['phone'].str.len()                                   # Phone length
        ])

    ##  Gradient Boosting federated learning with 4 constant features
    def federated_train(self, rounds=3):
        """Run the end-to-end demo and print a report.

        Generates 10000 records, de-identifies them, and for each round fits
        the model on a random 80% sample of the de-identified features. Each
        round is scored on the 20% of records that were not part of that
        round's sample, so the reported accuracy is not inflated by rows the
        model was trained on. The model is refit from scratch every round.

        Args:
            rounds: Number of training rounds; must be at least 1.

        Raises:
            ValueError: If ``rounds`` is less than 1.
        """
        if rounds < 1:
            raise ValueError("rounds must be at least 1")

        print(" GRADIENT BOOST FEDERATED LEARNING - 4 CORE FEATURES")
        print("-" * 55)

        # Create the data once; the same frame is used for the samples and training.
        original_data = self.create_data(10000)

        print(f"Enhanced Dataset: {len(original_data)} records")
        print(f"Features: Name, DoB, Phone, Risk_Score (constant)")
        print(f"\n PII De-identification Sample:")
        print(f" Original Data Sample:\n{original_data.head(5)}\n")

        # De-identify data
        deidentified_data = self.deidentify(original_data)
        print(f" De-identified Data Sample:\n{deidentified_data.head(5)}\n")

        # Feature extraction and scaling
        X = self.extract_features(deidentified_data)
        # Binary target: 1 when the record's risk score is above the median.
        y = (deidentified_data['risk_score'] > deidentified_data['risk_score'].median()).astype(int).to_numpy()
        X_scaled = self.scaler.fit_transform(X)

        # Report the hyper-parameters actually configured on the model.
        print(f"\n Gradient Boosting Training ({self.model.n_estimators} estimators, depth={self.model.max_depth}):")
        accuracies, feature_importances = [], []

        n_samples = len(X_scaled)
        for num in range(1, rounds + 1):
            # Simulate federated client with 80% random sample
            client_idx = np.random.choice(n_samples, size=int(n_samples * 0.8), replace=False)
            holdout_mask = np.ones(n_samples, dtype=bool)
            holdout_mask[client_idx] = False

            # Train on client data
            self.model.fit(X_scaled[client_idx], y[client_idx])

            # Evaluate only on records this round's client did not train on
            accuracy = accuracy_score(y[holdout_mask], self.model.predict(X_scaled[holdout_mask]))
            accuracies.append(accuracy)
            feature_importances.append(self.model.feature_importances_.copy())

            print(f"  Round {num}: Accuracy = {accuracy:.3f}")

        # Performance analysis with enhanced metrics
        avg_accuracy = np.mean(accuracies)
        best_accuracy = max(accuracies)
        stability = np.std(accuracies)

        print(f"\n Enhanced Performance Analysis:")
        print(f" Average Accuracy: {avg_accuracy:.3f}")
        print(f" Best Round: {best_accuracy:.3f}")
        print(f" Performance Range: {min(accuracies):.3f} - {max(accuracies):.3f}")
        print(f" Stability (StdDev): {stability:.3f}")

        # Average feature importance across all rounds
        avg_feature_importance = np.mean(feature_importances, axis=0)
        feature_names = ['Name_Length', 'Name_Words', 'Age_Shifted', 'Phone_Digits', 'Phone_Length']

        print(f"\n Average Feature Importance (Gradient Boost):")
        for name, importance in zip(feature_names, avg_feature_importance):
            print(f"  {name}: {importance:.3f}")

        print(f"\n Enhanced Federated Learning Complete!")
        print(f" Gradient Boosting: {best_accuracy:.3f} peak accuracy")
        print(f" PII Protection: 100% (Name, DoB, Phone de-identified)")
        print(f" Data Utility: Preserved for ML with {len(original_data)} records")



## Output

In [15]:
# Run the demo end to end with the default three training rounds; the report is printed.
FederatedPIIGradientBoost().federated_train(rounds=3)

 GRADIENT BOOST FEDERATED LEARNING - 4 CORE FEATURES
-------------------------------------------------------
Enhanced Dataset: 10000 records
Features: Name, DoB, Phone, Risk_Score (constant)

 PII De-identification Sample:
 Original Data Sample:
                 name         dob               phone  risk_score
0        Erin Mcclure  1987-08-27        207.582.0790   76.102502
1           Jodi Soto  2002-08-23     +1-775-429-9682   56.759028
2  Stephanie Martinez  2007-08-22     +1-678-409-4485   44.483516
3      Olivia Simpson  1999-08-24  865.589.6733x72498   59.065638
4    Patricia Hopkins  1950-09-05     +1-570-954-1219  123.119355

 De-identified Data Sample:
                 name         dob                   phone  risk_score
0     Veronica Arroyo  1987-12-05        408-494-1856x670   76.102502
1       Laura Stewart  2002-12-01      200-306-2757x77764   56.759028
2       Brenda Barker  2007-11-30       (853)837-5366x121   44.483516
3           Jason Lee  1999-12-02    +1-743-820-7