In [1]:
import numpy as np
import xgboost as xgb
import numpy as np
import pandas as pd
import cudf
import time
import os

## Download the Higgs Challenge data from CERN

## Helper functions

In [2]:
import gzip
def load_data(cached = '/rapids/notebooks/data/atlas-higgs-challenge-2014-v2.csv.gz'):
    if os.path.exists(cached):
        print("decompressing CERN's Higgs data")
        with gzip.open(cached) as f:
            #X = pd.read_csv(cached)
            X = cudf.read_csv(cached)
    else:
        try:
            %cd /rapids/notebooks/cuml/data
            !wget http://opendata.cern.ch/record/328/files/atlas-higgs-challenge-2014-v2.csv.gz
            print("decompressing CERN's Higgs data")
            with gzip.open(cached) as f:
                X = pd.read_csv(cached)
        except:
            print("Please first download the data from http://opendata.cern.ch/record/328/files/atlas-higgs-challenge-2014-v2.csv.gz using the above cell and rerun this cell.  If you can't connect to the datasource, please raise an issue in RAPIDS notebooks-extended repo")
    return X

## Load in training data using Pandas

In [3]:
df = load_data()
print ('Finished loading from csv ')

decompressing CERN's Higgs data
Finished loading from csv 


In [4]:
df.dtypes

EventId                          int64
DER_mass_MMC                   float64
DER_mass_transverse_met_lep    float64
DER_mass_vis                   float64
DER_pt_h                       float64
DER_deltaeta_jet_jet           float64
DER_mass_jet_jet               float64
DER_prodeta_jet_jet            float64
DER_deltar_tau_lep             float64
DER_pt_tot                     float64
DER_sum_pt                     float64
DER_pt_ratio_lep_tau           float64
DER_met_phi_centrality         float64
DER_lep_eta_centrality         float64
PRI_tau_pt                     float64
PRI_tau_eta                    float64
PRI_tau_phi                    float64
PRI_lep_pt                     float64
PRI_lep_eta                    float64
PRI_lep_phi                    float64
PRI_met                        float64
PRI_met_phi                    float64
PRI_met_sumet                  float64
PRI_jet_num                      int64
PRI_jet_leading_pt             float64
PRI_jet_leading_eta      

## Let's look at our two "object" type data columns

In [5]:
Label_pdf =df['Label'].to_pandas()
Label_pdf.unique()

array(['s', 'b'], dtype=object)

In [6]:
KaggleSet_pdf =df['KaggleSet'].to_pandas()
KaggleSet_pdf.unique()

array(['t', 'b', 'v', 'u'], dtype=object)

In [7]:
# Let's convert these values to a numeric format before going further
df['Label'] = df.Label.str.replace('b', '0')
df['Label'] = df.Label.str.replace('s', '1')
df['Label'] = df['Label'].str.stoi()
df['KaggleSet'] = df.KaggleSet.str.replace('b', '0')
df['KaggleSet'] = df.KaggleSet.str.replace('t', '1')
df['KaggleSet'] = df.KaggleSet.str.replace('v', '2')
df['KaggleSet'] = df.KaggleSet.str.replace('u', '3')
df['KaggleSet'] = df['KaggleSet'].str.stoi()

## Let's look at the data again

In [8]:
Label_pdf =df['Label'].to_pandas()
Label_pdf.unique()

array([1, 0])

In [9]:
KaggleSet_pdf =df['KaggleSet'].to_pandas()
KaggleSet_pdf.unique()

array([1, 0, 2, 3])

In [10]:
# Since the dataset is unbalanced, we can provide XGBoost with the ratio of negative class to positive class to weight the observations 
df_train = df[df['KaggleSet']==1]
df_test = df[df['KaggleSet']==0]
sample_size = len(df)
test_size = len(df_train)
train_size = len(df_test)

# rescale weight to make it same as test set
weight = df_train['Weight']
print(weight)
weight = weight * (float(test_size) / float(train_size))
print(weight)
print(sample_size)
print(train_size)
print(test_size)
sum_wpos = sum(weight[df_train.Label == 1])
sum_wneg = sum(weight[df_train.Label == 0])

# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

0          0.00081448039868
1        0.6810419068060001
2        0.7157420063490002
3             1.66065435355
4        1.9042634411800001
5           0.0254337596084
6          0.00081448039868
7          0.00572068250088
8        1.6148034660000001
9    0.00046102535673400003
[249990 more rows]
Name: Weight, dtype: float64
0         0.0020362009967
1      1.7026047670150002
2      1.7893550158725005
3      4.1516358838750005
4           4.76065860295
5     0.06358439902100001
6         0.0020362009967
7    0.014301706252200001
8             4.037008665
9       0.001152563391835
[249990 more rows]
dtype: float64
818238
100000
250000
weight statistics: wpos=531.045, wneg=313295, ratio=589.959


In [None]:
# convert the panda dataframe into a gpu dataframe
## We drop the first column and the last column.
gdf_train = df_train.drop(["DER_mass_MMC","Label"])
gdf_test = df_test.drop(["DER_mass_MMC","Label"])

In [11]:
# You can visualize GDF just like you do with DF
print(gdf_train.dtypes)

NameError: name 'gdf_train' is not defined

In [12]:
#Here is an example of type conversion
#it's also important to note that the XGBoost DMatrix constructor expects all data to be of the same type
df_train['PRI_jet_num'] = df_train['PRI_jet_num'].astype(np.float64)
df_test['PRI_jet_num'] = df_test['PRI_jet_num'].astype(np.float64)

In [13]:
print(df_train.head())

   EventId  DER_mass_MMC  DER_mass_transverse_met_lep       DER_mass_vis             DER_pt_h  DER_deltaeta_jet_jet    DER_mass_jet_jet ...           KaggleWeight
0   100000        138.47                       51.655             97.827   27.979999999999997                  0.91  124.71100000000001 ...  0.0026533113373299996
1   100001       160.937                       68.768            103.235               48.146                -999.0              -999.0 ...     2.2335844871700004
2   100002        -999.0                      162.172            125.953   35.635000000000005                -999.0              -999.0 ...          2.34738894364
3   100003       143.905            81.41700000000002  80.94300000000001  0.41400000000000003                -999.0              -999.0 ...      5.446378211920001
4   100004       175.864                       16.915            134.805   16.404999999999998                -999.0              -999.0 ...          6.24533268686
[27 more columns]


In [15]:
# Check that the conversion happened
df_train.dtypes.PRI_jet_num

dtype('float64')

In [16]:
y_train = df_train[['Label']] #Note that y is pygdf dataframe, not a pygdf series
df_train.drop_column('Label')
df_train.drop_column('Weight')
X_train = df_train

xgmat_train = xgb.DMatrix(data=X_train,label=y_train, missing = -999.0, weight=weight)

In [17]:
nTrees = 1000

# setup parameters for xgboost
param = {}

# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos

param['eta']              = 0.01
param['max_depth']        = 10
param['min_child_weight'] = 100
param['colsampleby_tree'] = 0.5
param['gamma']            = 0.1
param['sub_sample']       = 0.9
param['silent']           = 1
param['nthread']          = 8
param['verbose']          = 0
param['eval_metric']      = 'ams@0.15'
param['tree_method'] = 'gpu_hist'
param['objective'] = 'gpu:binary:logitraw'
watchlist = [(xgmat_train,'train')]

In [18]:
tmp = time.time()
bst = xgb.train(param, xgmat_train, nTrees, watchlist, verbose_eval=10);
print ("XGBoost training: %s seconds" % (str(time.time() - tmp)))

[0]	train-ams@0.15:17.7275
[10]	train-ams@0.15:17.7275
[20]	train-ams@0.15:17.7275
[30]	train-ams@0.15:17.7275
[40]	train-ams@0.15:25.24
[50]	train-ams@0.15:25.2921
[60]	train-ams@0.15:25.3215
[70]	train-ams@0.15:25.3215
[80]	train-ams@0.15:25.3215
[90]	train-ams@0.15:25.3215
[100]	train-ams@0.15:25.3215
[110]	train-ams@0.15:25.3215
[120]	train-ams@0.15:25.3215
[130]	train-ams@0.15:25.3215
[140]	train-ams@0.15:25.3215
[150]	train-ams@0.15:25.3215
[160]	train-ams@0.15:25.3215
[170]	train-ams@0.15:25.3215
[180]	train-ams@0.15:25.3215
[190]	train-ams@0.15:25.3215
[200]	train-ams@0.15:25.3215
[210]	train-ams@0.15:25.3215
[220]	train-ams@0.15:29.5981
[230]	train-ams@0.15:29.598
[240]	train-ams@0.15:29.6086
[250]	train-ams@0.15:29.606
[260]	train-ams@0.15:29.606
[270]	train-ams@0.15:29.606
[280]	train-ams@0.15:29.606
[290]	train-ams@0.15:29.6001
[300]	train-ams@0.15:29.6001
[310]	train-ams@0.15:29.6144
[320]	train-ams@0.15:29.6031
[330]	train-ams@0.15:29.6239
[340]	train-ams@0.15:31.1043
[35

In [20]:
#Note that y is pygdf dataframe, not a pygdf series
y_t = df_test[['Label']] 
df_test.drop_column('Label')
df_test.drop_column('Weight')

X_t = df_test

xgmat_t = xgb.DMatrix(data=X_t,label=y_t, missing = -999.0, weight=weight)
y_pred = bst.predict(xgmat_t)
print(y_pred)

[-8.579436 -8.738063 -8.769301 ... -7.978996 -8.570828  8.126481]
