In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import os

# Restrict this kernel to a single GPU by index. The variable is set here,
# ahead of the cell that imports torch/cudf.
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = f"{GPU_id}"

In [3]:
import sys

# Put the parent directory on the import path (presumably so the local
# `nv_tabular` package imported below resolves -- confirm against repo layout).
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate '../' entries in sys.path.
if '../' not in sys.path:
    sys.path.insert(1, '../')

In [4]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf

from nv_tabular.preproc import Workflow
from nv_tabular.ops import Normalize, FillMissing, Categorify, Moments, Median, Encoder, LogOp, ZeroFill
from nv_tabular.dl_encoder import DLLabelEncoder
from nv_tabular.ds_iterator import GPUDatasetIterator
from nv_tabular.batchloader import FileItrDataset, DLCollator, DLDataLoader
import warnings

import matplotlib.pyplot as plt
%matplotlib inline

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


In [5]:
# Display the torch and cudf versions this notebook is running against.
torch.__version__, cudf.__version__

('1.5.0.dev20200212', '0.13.0a+1692.g11a0e42cd')

In [6]:
# Optional: load the snakeviz IPython extension; only needed if you want to
# run profiling. Skip this cell otherwise.
%load_ext snakeviz

In [7]:
# Flag forwarded to Workflow(to_cpu=...) further down. It has to be defined
# here, otherwise that cell raises NameError on a fresh kernel.
# NOTE(review): True is the value that was previously commented out -- confirm
# it is still the intended setting.
to_cpu = True

# <h3> Dataset Gathering: Define files in the training and validation datasets. </h3>

In [8]:
# Root directory holding the per-day Criteo files, plus optional
# sub-directories (empty here) for the training and validation splits.
data_path = '/datasets/criteo/raw_csvs/split_train_data/'
df_valid = ''
df_train = ''

# Days [0, split) are used for training, days [split, fin) for validation.
split = 8
fin = 25

train_days = ["day_" + str(x) for x in range(split)]
valid_days = ["day_" + str(x) for x in range(split, fin)]
print(train_days, valid_days)


def _list_day_files(directory, days):
    """Return the paths of the files in `directory` that belong to `days`.

    A file belongs to a day when its name starts with "day" and the part of
    the name before "_part" equals one of the entries in `days`.

    The directory listing is sorted because os.listdir() returns entries in
    arbitrary order; sorting makes the file order (and therefore
    `train_set[0]` and the order files are streamed in) reproducible.

    Args:
        directory: directory path, expected to end with a path separator
            because file names are appended by plain string concatenation.
        days: iterable of day prefixes such as "day_0".

    Returns:
        List of `directory + file_name` strings in sorted file-name order.
    """
    wanted = set(days)
    return [
        directory + fname
        for fname in sorted(os.listdir(directory))
        if fname.startswith("day") and fname.split("_part")[0] in wanted
    ]


train_set = _list_day_files(data_path + df_train, train_days)
valid_set = _list_day_files(data_path + df_valid, valid_days)

['day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7'] ['day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24']


In [9]:
# Number of files selected for the training and validation splits.
len(train_set), len(valid_set)

(716, 1277)

In [15]:
# Read a 1000-row slice of one Parquet part file, skipping its first 1000 rows.
# NOTE(review): `chunk_p` is not referenced again in the visible cells.
chunk_p = cudf.read_parquet(
    "/datasets/criteo/raw_csvs/split_train_data_parquet/part.1.parquet",
    skip_rows=1000,
    num_rows=1000,
)

In [16]:
# Peek at the first 100 rows of the first training file.
# Every other reader of these files in this notebook passes sep='\t' and
# supplies column names explicitly, so read the sample the same way:
# tab-separated, with no header row. With the default comma separator and
# header inference the sample would not be split into the real columns.
# Columns come back with positional names here because `cols` is only
# defined in a later cell.
chunk = cudf.read_csv(train_set[0], sep='\t', header=None, nrows=100)

<h4>Grab column information</h4>

In [None]:
# Continuous feature columns are I1..I13; categorical ones are C1..C26.
cont_names = [f"I{idx}" for idx in range(1, 14)]
cat_names = [f"C{idx}" for idx in range(1, 27)]
cat_names, cont_names

In [None]:
# Full column order used when reading the raw files: the label first, then
# the continuous columns, then the categorical columns.
cols = ['label', *cont_names, *cat_names]
cols

<h3>Preprocessing:</h3> <p>Select the operations to perform, create the Workflow (preprocessor) object, create the dataset iterator object, and collect statistics on the training dataset.</p>

In [None]:
%%time
a,b = Normalize(), Categorify()
procs = [a, b]

In [None]:
%%time
proc = Workflow(cat_names=cat_names, cont_names=cont_names, label_name=['label'], feat_ops=[ZeroFill(), LogOp()], df_ops=procs, to_cpu=to_cpu)
# proc = Preprocessor(cat_names=cat_names, cont_names=cont_names, label_name=['label'], config=config, to_cpu=to_cpu)

In [None]:
%%time
# Register the feature ops (ZeroFill, LogOp) and the preprocessing ops
# (Normalize, Categorify) on the workflow.
# NOTE(review): the Workflow(...) constructor call in the previous cell is
# already given feat_ops=[ZeroFill(), LogOp()] and df_ops=[Normalize(),
# Categorify()]. If these add_* methods append rather than replace, every op
# ends up registered twice -- confirm against the Workflow API and keep only
# one of the two registration styles.
proc.add_features([ZeroFill(), LogOp()])
proc.add_preprocessing(Normalize())
proc.add_preprocessing(Categorify())

In [None]:
%%time
trains_itrs = GPUDatasetIterator(train_set, names=cols, engine='csv', sep='\t')

In [None]:
%%time
# Pass the training iterator through the workflow to collect statistics
# (presumably what populates `proc.stats`, which later cells read).
proc.update_stats(trains_itrs)

In [None]:
# Inspect the statistics held by the workflow.
proc.stats

<h5>Compute the embedding sizes from the category statistics gathered during the statistics-collection (read) phase.</h5>

In [None]:
# Ask the Categorify op for embedding-size entries based on the collected
# category statistics, then keep the element at index 1 of each entry.
embeddings = [
    entry[1]
    for entry in b.get_emb_sz(proc.stats["categories"], proc.cat_names)
]

<h5>Create the file iterators using the FileItrDataset Class.</h5>

In [None]:
%%time
t_batch_sets = [FileItrDataset(x, names=cols, engine='csv', batch_size=10000, sep="\t") for x in train_set]
v_batch_sets = [FileItrDataset(x, names=cols, engine='csv', batch_size=10000, sep="\t") for x in valid_set]

In [None]:
%%time
# Chain the per-file datasets into a single dataset per split, so each split
# can be handed to one dataloader.
t_chain = torch.utils.data.ChainDataset(t_batch_sets)
v_chain = torch.utils.data.ChainDataset(v_batch_sets)

<h5>Use the Deep Learning Collator to create a collate function to pass to the dataloader.</h5>

In [None]:
%%time
# Collator built around the workflow `proc`; its `gdf_col` method is what the
# dataloaders below use as their collate function.
dlc = DLCollator(preproc=proc)

In [None]:
%%time
t_data = DLDataLoader(t_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)
v_data = DLDataLoader(v_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)

<h4>After creating the dataloaders, you can use the fastai framework to build machine learning models.</h4>

In [None]:
# Bundle the training and validation dataloaders into a fastai DataBunch that
# targets the "cuda" device and uses the same collate function.
databunch = DataBunch(
    t_data,
    v_data,
    collate_fn=dlc.gdf_col,
    device="cuda",
)

In [None]:
%%time
model = TabularModel(emb_szs = embeddings, n_cont=len(cont_names), out_sz=2, layers=[512,256])

learn =  Learner(databunch, model, metrics=[accuracy])
learn.loss_func = torch.nn.CrossEntropyLoss()


In [None]:
# Run the learner's learning-rate finder; the results are plotted from
# `learn.recorder` in the next cell.
learn.lr_find()

In [None]:
# Plot what the recorder captured, including momentum values and a suggested
# learning rate.
learn.recorder.plot(show_moms=True, suggestion=True)

In [None]:
# Training hyper-parameters for the fit below.
# NOTE(review): the learning rate was presumably read off the lr_find plot
# above -- confirm before reusing it on different data.
epochs = 1
learning_rate = 1.32e-2

In [None]:
# Train with fit_one_cycle for `epochs` epochs at `learning_rate`, and measure
# the elapsed wall-clock time in seconds (stored in `t_final`).
start = time()
learn.fit_one_cycle(epochs,learning_rate)
t_final = time() - start 

In [None]:
# Drop the notebook's references to the learner, model and data bunch, then
# ask PyTorch to release its cached, unused GPU memory.
del learn, model, databunch
torch.cuda.empty_cache()

#### 