In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import sys
import traceback


# prep
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler

# models
from torch import nn
from torch import optim
from torch.autograd import Variable
import os
import sys

import torchcsprng as csprng

from opacus import PrivacyEngine
from collections import OrderedDict

ModuleNotFoundError: No module named 'torch'

In [2]:
# Re-running the notebook in a live kernel can leave a PrivacyEngine from a
# previous run attached to the optimizer; detach it if one exists, otherwise
# just report that there is nothing to detach.
if "privacy_engine" in globals():
    privacy_engine.detach()
else:
    print("The Privacy Engine is already Detached")

The Privacy Engine is already Detached


In [3]:
    # Load this hospital's local survey data.
    train_df = pd.read_csv('hospital1.csv')


    ########## START DATA CLEANING ###############


    # Dealing with missing data:
    # drop the "Timestamp", "comments" and "state" columns up front so they
    # do not have to be cleaned or encoded.
    train_df = train_df.drop(columns=['comments', 'state', 'Timestamp'])

    # Default fill value for each data type
    defaultInt = 0
    defaultString = 'NaN'
    defaultFloat = 0.0

    # Column names grouped by data type
    intFeatures = ['Age']
    stringFeatures = ['Gender', 'Country', 'self_employed', 'family_history', 'treatment', 'work_interfere',
                     'no_employees', 'remote_work', 'tech_company', 'anonymity', 'leave', 'mental_health_consequence',
                     'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview',
                     'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options', 'wellness_program',
                     'seek_help']
    floatFeatures = []

    # Clean the NaN's: pick the default matching the column's type and fill.
    # Columns that are in none of the lists are reported and left untouched.
    for feature in train_df.columns:
        if feature in intFeatures:
            fillValue = defaultInt
        elif feature in stringFeatures:
            fillValue = defaultString
        elif feature in floatFeatures:
            fillValue = defaultFloat
        else:
            # The original called an undefined log_msg() here, which would
            # raise NameError instead of reporting the unknown column.
            print('Error: Feature %s not recognized.' % feature)
            continue
        train_df[feature] = train_df[feature].fillna(fillValue)

    #clean 'Gender'
    #Unique raw spellings as they appear in the data (kept for inspection)
    gender = train_df['Gender'].unique()

    #Made gender groups: known spellings, compared against the lower-cased value
    male_str = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "Cis Male", "cis male"]
    trans_str = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means"]
    female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]

    # Single lookup table: lower-cased spelling -> canonical label.
    genderLookup = {}
    for canonical, spellings in (('male', male_str), ('female', female_str), ('trans', trans_str)):
        genderLookup.update(dict.fromkeys(spellings, canonical))

    # Vectorised replacement for the former row-by-row loop, which used
    # chained inplace Series.replace (quadratic, and it stops updating the
    # frame under pandas Copy-on-Write). Values whose lower-cased form is not
    # in the lookup keep their original text.
    genderCanonical = train_df['Gender'].str.lower().map(genderLookup)
    train_df['Gender'] = genderCanonical.fillna(train_df['Gender'])

    #Drop rows whose Gender is a junk answer
    stk_list = ['A little about you', 'p']
    # .copy() so later column assignments act on an independent frame rather
    # than on a filtered view of the previous one.
    train_df = train_df[~train_df['Gender'].isin(stk_list)].copy()

    #Complete missing age with the median.
    #Assigned back explicitly: a chained fillna(..., inplace=True) on
    #train_df['Age'] is not guaranteed to update the frame.
    train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

    # Replace implausible ages (< 18, then > 120) with the median.
    # The median is recomputed before each step, as in the original code.
    train_df.loc[train_df['Age'] < 18, 'Age'] = train_df['Age'].median()
    train_df.loc[train_df['Age'] > 120, 'Age'] = train_df['Age'].median()

    #Ranges of Age
    # NOTE(review): ages up to 120 are kept above, but the last bin ends at
    # 100, so ages 101-120 get no age_range (NaN) - confirm this is intended.
    train_df['age_range'] = pd.cut(train_df['Age'], [0,20,30,65,100], labels=["0-20", "21-30", "31-65", "66-100"], include_lowest=True)

    # Missing 'work_interfere' answers currently hold the defaultString
    # placeholder written during NaN filling; relabel them as "Don't know".

    train_df['work_interfere'] = train_df['work_interfere'].replace(defaultString, "Don't know")

    # Label-encode every column (including Age and age_range) and remember,
    # per column, the original class values in encoded order.
    labelDict = {}
    for feature in train_df.columns:
        le = preprocessing.LabelEncoder()
        encodedColumn = le.fit_transform(train_df[feature])
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        train_df[feature] = encodedColumn
        # Keys of the mapping are the original labels, ordered by their code
        labelDict['label_' + feature] = list(le_name_mapping)

    # 'Country' is not used as a feature, so drop it
    train_df = train_df.drop(columns=['Country'])

    # Rescale the (already label-encoded) Age column to the [0, 1] range
    scaler = MinMaxScaler()
    train_df['Age'] = scaler.fit_transform(train_df[['Age']])

    # Define the feature matrix X and the target y
    feature_cols = ['Age', 'Gender', 'family_history', 'benefits', 'care_options', 'anonymity', 'leave', 'work_interfere']
    X = train_df[feature_cols]
    y = train_df.treatment

    # No train/test split is performed here: all rows are used for training.
    X_train, y_train = X, y

    # Transform the pandas data to float32 torch tensors for DL
    x_train_data = torch.from_numpy(X_train.values).float()

    # One target per row as a column vector of shape (n_rows, 1), so it lines
    # up with the model's single output unit. Converted in one vectorised
    # step instead of appending one-element lists in a Python loop.
    y_train_data = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

    len(y_train_data)

333

In [4]:
# We could move this block to a Researcher's notebook

# Fully connected 8 -> 4 -> 2 -> 1 network with a sigmoid after every linear
# layer. The 8 inputs correspond to the 8 columns in feature_cols.
# (Variants with ReLU hidden activations, optionally with named layers via
# OrderedDict, were tried earlier and are not used.)
layer_stack = [
    nn.Linear(8, 4),
    nn.Sigmoid(),
    nn.Linear(4, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1),
    nn.Sigmoid(),
]
model = nn.Sequential(*layer_stack)

# Persist the untrained model object as a whole (not just a state_dict).
torch.save(model, "untrained_model.pt")

# In a Researcher's notebook, after saving the model, we have to send it to the Hospitals

In [5]:
# The Hospitals would receive the model and print it to check that everything works fine.
# Here the in-memory `model` built in the previous cell is printed directly;
# no file is loaded in this cell. The output lists each layer of the
# Sequential container with its input/output sizes.
print(model)

Sequential(
  (0): Linear(in_features=8, out_features=4, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=4, out_features=2, bias=True)
  (3): Sigmoid()
  (4): Linear(in_features=2, out_features=1, bias=True)
  (5): Sigmoid()
)


In [6]:
# Training Logic
# Plain SGD over all model parameters.
opt = optim.SGD(params=model.parameters(), lr=0.1)

# Apply Differential Privacy from Opacus library (Renyi DP)
#
# Every training step feeds the complete local dataset through the model, so
# both the batch size and the sample size are the number of local rows.
# The privacy accountant derives its sampling rate from batch_size /
# sample_size; the previous hard-coded sample_size=1000 (with only 333 rows
# actually loaded) made it assume a rate of ~0.33 and under-report the
# privacy cost.
# NOTE(review): if 1000 was meant to be the size of a larger pooled dataset,
# confirm which population the privacy guarantee should be stated over.
n_train_samples = len(x_train_data)

privacy_engine = PrivacyEngine(
    model,
    batch_size=n_train_samples,
    sample_size=n_train_samples,
    alphas=range(2, 32),      # Renyi orders evaluated by the accountant
    noise_multiplier=1.3,     # noise std relative to the clipping norm
    max_grad_norm=1.0,        # per-sample gradient clipping threshold
)

# From here on opt.step() applies clipping + noise via the attached engine.
privacy_engine.attach(opt)



In [7]:
def train(num_iterations=50000, log_every=5000):
    """Run full-batch gradient descent on the notebook-level model.

    Uses the notebook globals ``model``, ``opt``, ``x_train_data`` and
    ``y_train_data``. Every iteration feeds the whole training set through
    the model, computes the mean squared error and takes one optimizer step.

    Args:
        num_iterations: Number of optimisation steps to run (default 50000,
            the value that was previously hard-coded).
        log_every: Print the loss every this many steps (default 5000).
            A value of 0 or None disables progress output.

    Returns:
        None. Progress is reported via ``print``.
    """
    # `step` instead of `iter`, which shadowed the Python builtin.
    for step in range(num_iterations):

        # 1) erase previous gradients (if they exist)
        opt.zero_grad()

        # 2) make a prediction
        pred = model(x_train_data)

        # 3) calculate how much we missed (squared error averaged over rows)
        loss = (((y_train_data - pred) ** 2).sum()) / len(x_train_data)

        # 4) figure out which weights caused us to miss
        loss.backward()

        # 5) change those weights
        opt.step()

        # 6) print our progress; detach() gives a graph-free tensor to print
        if log_every and step % log_every == 0:
            print("loss at epoch ", step, ": ", loss.detach())

In [8]:
# Run the training loop defined in the previous cell; it prints the loss
# periodically while it runs.
train()

loss at epoch  0 :  tensor(0.2560)
loss at epoch  5000 :  tensor(0.1331)
loss at epoch  10000 :  tensor(0.1229)
loss at epoch  15000 :  tensor(0.1140)
loss at epoch  20000 :  tensor(0.1120)
loss at epoch  25000 :  tensor(0.1109)
loss at epoch  30000 :  tensor(0.1102)
loss at epoch  35000 :  tensor(0.1094)
loss at epoch  40000 :  tensor(0.1088)
loss at epoch  45000 :  tensor(0.1084)
