## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.dask

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight
from sklearn import metrics

import scipy

import dask
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client, SSHCluster
import dask.distributed

import dask_ml
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import LabelEncoder

import xgboost as xgb

import mpl_scatter_density
import matplotlib.colors as colors

import csv

from dask_ml.preprocessing import OneHotEncoder 

## Setting up the cluster

In [2]:
# SSHCluster starts the scheduler on the first host and workers on the others,
# so "bhbh-1" is listed twice to serve as both scheduler and worker host.
cluster_hosts = ["bhbh-1", "bhbh-1", "bhbh-2", "bhbh-3", "bhbh-4", "bhbh-5"]
ssh_options = {"client_keys": "/home/ubuntu/private/tbertola_key.pem"}
# Four single-threaded worker processes per host, because each bhbh-* has 4 cores.
per_host_workers = {"n_workers": 4, "nthreads": 1}
scheduler_settings = {"port": 8786, "dashboard_address": ":8787"}

cluster = SSHCluster(
    cluster_hosts,
    connect_options=ssh_options,
    worker_options=per_host_workers,
    scheduler_options=scheduler_settings,
)

2023-06-29 11:01:44,142 - distributed.deploy.ssh - INFO - 2023-06-29 11:01:44,140 - distributed.scheduler - INFO - State start
2023-06-29 11:01:44,144 - distributed.deploy.ssh - INFO - 2023-06-29 11:01:44,142 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-1gfsd3q_', purging
2023-06-29 11:01:44,146 - distributed.deploy.ssh - INFO - 2023-06-29 11:01:44,144 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-7au9o9a1', purging
2023-06-29 11:01:44,147 - distributed.deploy.ssh - INFO - 2023-06-29 11:01:44,144 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-scratch-space/worker-_rlfq4no', purging
2023-06-29 11:01:44,151 - distributed.deploy.ssh - INFO - 2023-06-29 11:01:44,150 - distributed.scheduler - INFO -   Scheduler at:   tcp://10.67.22.140:8786
2023-06-29 11:01:45,080 - distributed.deploy.ssh - INFO - 2023-06-29 11:01:45,079 - distributed.nanny - INFO -

In [3]:
# Connect a client to the SSH cluster created above.
client = Client(cluster)


+---------+--------+-----------+------------------+
| Package | Client | Scheduler | Workers          |
+---------+--------+-----------+------------------+
| tornado | 6.3.2  | 6.3.2     | {'6.2', '6.3.2'} |
+---------+--------+-----------+------------------+


In [4]:
# Bare expression: shows the client summary (dashboard address, workers, threads, memory) as the cell output.
client

0,1
Connection method: Cluster object,Cluster type: distributed.SpecCluster
Dashboard: http://10.67.22.140:8787/status,

0,1
Dashboard: http://10.67.22.140:8787/status,Workers: 16
Total threads: 16,Total memory: 31.02 GiB

0,1
Comm: tcp://10.67.22.140:8786,Workers: 16
Dashboard: http://10.67.22.140:8787/status,Total threads: 16
Started: Just now,Total memory: 31.02 GiB

0,1
Comm: tcp://10.67.22.140:36603,Total threads: 1
Dashboard: http://10.67.22.140:33115/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.140:42429,
Local directory: /tmp/dask-scratch-space/worker-cv_lgt3u,Local directory: /tmp/dask-scratch-space/worker-cv_lgt3u

0,1
Comm: tcp://10.67.22.140:37033,Total threads: 1
Dashboard: http://10.67.22.140:40745/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.140:40699,
Local directory: /tmp/dask-scratch-space/worker-50of4cjr,Local directory: /tmp/dask-scratch-space/worker-50of4cjr

0,1
Comm: tcp://10.67.22.140:37167,Total threads: 1
Dashboard: http://10.67.22.140:44453/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.140:39293,
Local directory: /tmp/dask-scratch-space/worker-y4y13x5p,Local directory: /tmp/dask-scratch-space/worker-y4y13x5p

0,1
Comm: tcp://10.67.22.140:42213,Total threads: 1
Dashboard: http://10.67.22.140:43969/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.140:40547,
Local directory: /tmp/dask-scratch-space/worker-2qcgap2e,Local directory: /tmp/dask-scratch-space/worker-2qcgap2e

0,1
Comm: tcp://10.67.22.21:41331,Total threads: 1
Dashboard: http://10.67.22.21:45469/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.21:43013,
Local directory: /tmp/dask-scratch-space/worker-jaeo1dkq,Local directory: /tmp/dask-scratch-space/worker-jaeo1dkq

0,1
Comm: tcp://10.67.22.21:42341,Total threads: 1
Dashboard: http://10.67.22.21:34483/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.21:44627,
Local directory: /tmp/dask-scratch-space/worker-q2q3_4rp,Local directory: /tmp/dask-scratch-space/worker-q2q3_4rp

0,1
Comm: tcp://10.67.22.21:43339,Total threads: 1
Dashboard: http://10.67.22.21:34741/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.21:34921,
Local directory: /tmp/dask-scratch-space/worker-db85u2o_,Local directory: /tmp/dask-scratch-space/worker-db85u2o_

0,1
Comm: tcp://10.67.22.21:46413,Total threads: 1
Dashboard: http://10.67.22.21:34687/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.21:42099,
Local directory: /tmp/dask-scratch-space/worker-wmv5ukwx,Local directory: /tmp/dask-scratch-space/worker-wmv5ukwx

0,1
Comm: tcp://10.67.22.220:34925,Total threads: 1
Dashboard: http://10.67.22.220:38535/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.220:39531,
Local directory: /tmp/dask-scratch-space/worker-1fgo9t7g,Local directory: /tmp/dask-scratch-space/worker-1fgo9t7g

0,1
Comm: tcp://10.67.22.220:35913,Total threads: 1
Dashboard: http://10.67.22.220:36743/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.220:35039,
Local directory: /tmp/dask-scratch-space/worker-p3hlvu98,Local directory: /tmp/dask-scratch-space/worker-p3hlvu98

0,1
Comm: tcp://10.67.22.220:42157,Total threads: 1
Dashboard: http://10.67.22.220:33757/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.220:34893,
Local directory: /tmp/dask-scratch-space/worker-4miy35sa,Local directory: /tmp/dask-scratch-space/worker-4miy35sa

0,1
Comm: tcp://10.67.22.220:42759,Total threads: 1
Dashboard: http://10.67.22.220:39999/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.220:34649,
Local directory: /tmp/dask-scratch-space/worker-59gfr1cw,Local directory: /tmp/dask-scratch-space/worker-59gfr1cw

0,1
Comm: tcp://10.67.22.31:34085,Total threads: 1
Dashboard: http://10.67.22.31:46091/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.31:34863,
Local directory: /tmp/dask-scratch-space/worker-xntm4wfm,Local directory: /tmp/dask-scratch-space/worker-xntm4wfm

0,1
Comm: tcp://10.67.22.31:42307,Total threads: 1
Dashboard: http://10.67.22.31:37215/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.31:36315,
Local directory: /tmp/dask-scratch-space/worker-5ojfun5l,Local directory: /tmp/dask-scratch-space/worker-5ojfun5l

0,1
Comm: tcp://10.67.22.31:42921,Total threads: 1
Dashboard: http://10.67.22.31:35009/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.31:41599,
Local directory: /tmp/dask-scratch-space/worker-bon1zl6g,Local directory: /tmp/dask-scratch-space/worker-bon1zl6g

0,1
Comm: tcp://10.67.22.31:45431,Total threads: 1
Dashboard: http://10.67.22.31:43577/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.31:41827,
Local directory: /tmp/dask-scratch-space/worker-csddfpdp,Local directory: /tmp/dask-scratch-space/worker-csddfpdp


## Data

In [5]:
# Threshold on q: rows with q <= q_drop are discarded
q_drop = 0.2

# Read every parquet file of the dataset into a Dask dataframe
bhbh = dd.read_parquet('/home/ubuntu/data/bag_all_dataset_with_no_kick/*')

# Keep only the rows whose q is strictly above the threshold
above_threshold = bhbh['q'] > q_drop
bhbh = bhbh[above_threshold]
#bhbh = bhbh[ bhbh['No_Kick'] == 2] #taking only the binaries without kick

## Discretizing and introducing labels

In [6]:
# Add a 'label' column holding the index of the q-bin each row falls into
n_bins = 100
# Define the edges between bins: n_bins bins over the range (q_drop, 1.0).
# `counts` is not used in this cell.
counts, bin_edges = da.histogram(bhbh.q.values, bins=n_bins, range=(q_drop,1.0))

# pd.cut each partition. With right=True and include_lowest=False every bin is
# open on the left and closed on the right, i.e. (edge_k, edge_k+1].
# labels=False stores the integer bin index rather than an Interval.
# NOTE(review): a q outside (q_drop, 1.0] would get a NaN label -- confirm q never exceeds 1.
bhbh['label'] = bhbh['q'].map_partitions(pd.cut, bin_edges, right=True, labels=False, include_lowest=False)

## Splitting the data into train and test sets

## Sample weights

In [8]:
# Number of training rows in each label bin (computed eagerly) and their total
train_labels = bhbh_train['label']
countings = train_labels.to_frame().groupby('label').size().compute()
total_counts = countings.sum()


def assign_weight(i):
    """Return the weight of label ``i``: log(total_counts / (n_bins * count_i) + 1).

    Labels with fewer training rows receive a larger weight.
    """
    balance_ratio = total_counts/(n_bins*countings[i])
    return np.log(balance_ratio+1)


# Store the per-row weight in a new column
bhbh_train['weight'] = train_labels.map(assign_weight)

## Plot of weights with respect to labels

## Rename of useful columns

In [9]:
# Input columns shared by the training and the test feature tables
feature_columns = ['Mass_0', 'Z_0', 'Mass_1',  'a', 'e', 'alpha']

# Training set: features, target labels and per-row weights
features = bhbh_train[feature_columns]
labels = bhbh_train['label']
weights = bhbh_train['weight']

# Test set: features and target labels
features_test = bhbh_test[feature_columns]
label_test = bhbh_test['label']

In [10]:
def one_hot_enc(i, n_classes=None):
    """Return a one-hot vector with a 1 at position ``i``.

    Parameters
    ----------
    i : int or integer-valued float
        Class index to flag. Integer-valued floats (e.g. ``3.0``) are accepted
        so that a float-typed label column can be encoded as well.
    n_classes : int, optional
        Length of the returned vector. Defaults to the module-level ``n_bins``.

    Returns
    -------
    np.ndarray
        Float array of length ``n_classes``: zeros everywhere except a 1 at ``i``.

    Raises
    ------
    ValueError
        If ``i`` is a float that is NaN, infinite or not integer-valued.
    IndexError
        If ``i`` lies outside the vector.
    """
    if n_classes is None:
        n_classes = n_bins
    if isinstance(i, (float, np.floating)):
        # A float cannot be used as an array index; convert it only when lossless.
        if not np.isfinite(i) or i != int(i):
            raise ValueError(f"class index must be integer-valued, got {i!r}")
        i = int(i)
    c = np.zeros(n_classes)
    c[i] = 1
    return c

# Map every label to its one-hot vector; each element of the resulting Series
# is the NumPy array of length n_bins returned by one_hot_enc.
one_hot_labels = labels.map(one_hot_enc)

## Consistency of partitions

In [11]:
# Print the type and partition count of every collection handed to the model,
# so that mismatching partitioning can be spotted before training.
print('Consistency check')
for title, collection in (
    ('features:', features),
    ('label', labels),
    ('weight', weights),
    ('one hot encoded label', one_hot_labels),
):
    print(title, type(collection), collection.npartitions)

Consistency check
features: <class 'dask.dataframe.core.DataFrame'> 240
label <class 'dask.dataframe.core.Series'> 240
weight <class 'dask.dataframe.core.Series'> 240
one hot encoded label <class 'dask.dataframe.core.Series'> 240


## XGBoost

## Custom objective function EMD

In [12]:
#EMD loss function
def cdf_emd(x, y):
    """Compute the CDF-EMD loss between two probability distributions.

    The loss is the mean squared difference between the cumulative sums of
    ``x`` and ``y``: ``(1/n) * sum_k (sum_{j<=k} (x_j - y_j))**2``.

    Parameters
    ----------
    x, y : np.ndarray
        Arrays of equal length ``n`` (the number of bins).

    Returns
    -------
    float
        The scalar loss value.
    """
    n = x.shape[0]
    # Multiplying by a lower-triangular matrix of ones is a running sum, so
    # take the cumulative sum directly instead of building an n x n matrix.
    res = np.cumsum(x - y, axis=0, dtype=float)
    res = (1 / n) * np.dot(res, res)
    return res

#first and second derivatives
def cdf_emd_grad(x, y):
    """Compute the gradient of the CDF-EMD loss with respect to ``x``.

    Equals ``(2/n) * A.T @ A @ (x - y)`` with ``A`` the lower-triangular matrix
    of ones, evaluated here without materialising ``A``.

    Parameters
    ----------
    x, y : np.ndarray
        Arrays of equal length ``n`` (the number of bins).

    Returns
    -------
    np.ndarray
        Gradient vector of length ``n``.
    """
    n = x.shape[0]
    # A @ v is the running sum of v ...
    cdf_diff = np.cumsum(x - y, axis=0, dtype=float)
    # ... and A.T @ v is the running sum taken from the last element backwards.
    tail_sums = np.cumsum(cdf_diff[::-1], axis=0)[::-1]
    res = (2 / n) * tail_sums
    return res

def cdf_emd_hess(x, y):
    """Return the Hessian matrix of the CDF-EMD loss.

    The result depends only on the length of ``x``; the values of ``x`` and
    ``y`` are not read.
    """
    size = x.shape[0]
    # Lower-triangular matrix of ones, diagonal included
    lower_ones = np.tri(size)
    scaled_transpose = (2 / size) * lower_ones.T
    return scaled_transpose @ lower_ones

#Custom objective function (it uses loss function, first and second derivatives)
def CDF_EMD_LOSS(data: np.ndarray, predt: np.ndarray):
    """Custom objective: gradient and diagonal Hessian of the CDF-EMD loss.

    For every row the gradient is ``(2/n) * A.T @ A @ (predt_row - data_row)``
    and the Hessian entry of class ``j`` is the ``j``-th diagonal element of
    ``(2/n) * A.T @ A``, with ``A`` the lower-triangular matrix of ones and
    ``n`` the number of classes. Both are computed for all rows at once with
    cumulative sums instead of building ``A`` inside a per-row loop.

    Parameters
    ----------
    data : np.ndarray
        Targets; reshaped to ``predt.shape``, so it must hold one value per
        (row, class) pair.
    predt : np.ndarray
        Predictions of shape ``(n_rows, n_classes)``.
        NOTE(review): the predictions are compared with the targets as-is, with
        no softmax applied -- confirm this is the intended input scale.

    Returns
    -------
    grad, hess : np.ndarray
        Two float arrays of shape ``(n_rows * n_classes, 1)``.
    """
    data = data.reshape(predt.shape)
    kRows, kClasses = predt.shape
    scale = 2 / kClasses

    # A @ d: running sum of the differences along the class axis.
    cdf_diff = np.cumsum(predt - data, axis=1, dtype=float)
    # A.T @ (A @ d): running sum taken from the last class backwards.
    grad = scale * np.cumsum(cdf_diff[:, ::-1], axis=1)[:, ::-1]

    # diag(A.T @ A)[j] = n - j, identical for every row.
    hess_diag = scale * np.arange(kClasses, 0, -1, dtype=float)
    hess = np.tile(hess_diag, (kRows, 1))

    grad = grad.reshape((kRows * kClasses, 1))
    hess = hess.reshape((kRows * kClasses, 1))
    return grad, hess

In [16]:
# Hyper-parameters of the boosted trees
max_depth = 8
max_leaves = 400
learning_rate = 0.2
grow_policy = 'lossguide'
n_estimators = 5

# The same settings gathered in a single dictionary
param = dict(
    objective=CDF_EMD_LOSS,
    tree_method='hist',
    n_estimators=n_estimators,  # equivalent to num_boost_round, lets keep it in params
    grow_policy=grow_policy,
    max_depth=max_depth,
    learning_rate=learning_rate,
    max_leaves=max_leaves,
    num_class=n_bins,
    n_jobs=-1,
)

In [17]:
# Distributed classifier trained with the custom CDF-EMD objective.
# tree_method and max_leaves are passed explicitly so the estimator uses the
# same values as the `param` dictionary and as the max_leaves written into the
# saved model's file name.
xgbclassifier = xgb.dask.DaskXGBClassifier(objective = CDF_EMD_LOSS,
                                             tree_method = 'hist',
                                             n_estimators = n_estimators, #equivalent to num_boost_round, lets keep it in params
                                             grow_policy = grow_policy,
                                             max_depth = max_depth,
                                             max_leaves = max_leaves,
                                             learning_rate = learning_rate,
                                             num_class =  n_bins,
                                             n_jobs = -1,
                                             verbosity = 2)
# Bind the estimator to the cluster client created above.
xgbclassifier.client = client

In [18]:
# Train on the feature table with the one-hot encoded labels as target.
# NOTE(review): the `weights` series built earlier is not passed to fit() -- confirm whether that is intended.
# NOTE(review): `one_hot_labels` is a Series whose elements are arrays -- confirm fit() accepts this target layout.
xgbclass = xgbclassifier.fit(X=features, y=one_hot_labels)

KeyboardInterrupt: 

## Saving the model

In [None]:
# Save the trained booster as JSON; the file name records max_depth and max_leaves.
# The booster is taken from `xgbclass`, the fitted estimator returned by fit().
xgbclass.get_booster().save_model(f'custom_objective_bst_{max_depth}_{max_leaves}_new.json')