In [1]:
import os

# Restrict this kernel to a single GPU. The variable is set here, ahead of the
# cell that imports torch and cudf.
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = f'{GPU_id}'

In [2]:
import sys

# Add the parent directory to the module search path (slot 1, right after the
# first entry) so packages located one level up can be imported below.
parent_dir = '../'
sys.path.insert(1, parent_dir)

In [3]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import pyarrow.parquet as pq


import cudf

from nv_tabular.preproc import Workflow
from nv_tabular.ops import Normalize, FillMissing, Categorify, Moments, Median, Encoder, LogOp, ZeroFill
from nv_tabular.dl_encoder import DLLabelEncoder
from nv_tabular.ds_iterator import GPUDatasetIterator
from nv_tabular.batchloader import FileItrDataset, DLCollator, DLDataLoader
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


In [4]:
# Display the torch and cudf version strings this notebook was run with.
torch.__version__, cudf.__version__

('1.5.0.dev20200224', '0.13.0a+2514.g446009365')

In [5]:
%load_ext snakeviz
# Optional: the snakeviz extension is only needed if you want to run profiling.

In [6]:
# Forwarded to Workflow(..., to_cpu=to_cpu) in the preprocessing section.
# It has to be defined here, otherwise that cell raises NameError on a fresh
# kernel (Restart & Run All).
to_cpu = True

<h3> Dataset Gathering: Define files in the training and validation datasets. </h3>

In [7]:
# Directory holding the raw ASHRAE CSV files read in the next cell.
data_path = '/datasets/ashrae'

# Sub-directory names for the two splits; both are empty strings here.
df_valid = df_train = ''

In [8]:
# Load the building metadata and both weather tables from data_path with cudf.
build_meta, train_w, test_w = (
    cudf.read_csv(f'{data_path}/{csv_name}')
    for csv_name in ('building_metadata.csv', 'weather_train.csv', 'weather_test.csv')
)

In [9]:
# Column index of the weather training table. `cols` is rebound to the merged
# table's columns in a later cell.
cols = train_w.columns

In [10]:
# Preview the first rows of the weather training table.
train_w.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [11]:
# Output location for the merged building + weather tables.
# NOTE: this rebinds data_path, which previously pointed at the raw CSV directory.
data_path = './ashrae_merge/'
ds_train = 'train'
ds_valid = 'valid'
path_data_train = os.path.join(data_path, ds_train)
path_data_valid = os.path.join(data_path, ds_valid)

# Neither pq.write_table nor os.listdir (both used below) creates missing
# directories, so make sure the two split folders exist. exist_ok=True makes
# this a no-op when the cell is re-run.
os.makedirs(path_data_train, exist_ok=True)
os.makedirs(path_data_valid, exist_ok=True)

In [12]:
# Left-join the training weather rows onto the building metadata by site.
build_trw = build_meta.merge(train_w, on=['site_id'], how='left')

In [14]:
# Persist the merged training table as a single parquet file.
train_parquet = os.path.join(path_data_train, 'train.parquet')
pq.write_table(build_trw.to_arrow(), train_parquet)

In [15]:
# Left-join the test weather rows onto the building metadata by site.
build_tew = build_meta.merge(test_w, on=['site_id'], how='left')

In [17]:
# Persist the merged test-weather table; it serves as the validation split.
valid_parquet = os.path.join(path_data_valid, 'valid.parquet')
pq.write_table(build_tew.to_arrow(), valid_parquet)

In [18]:
# Display the shapes of the two merged tables.
build_trw.shape, build_tew.shape

((12676166, 14), (25273986, 14))

In [19]:
# Collect the parquet files found in each split directory.
# os.listdir returns names in arbitrary order, so the lists are sorted to keep
# the file order (and therefore the batch order) reproducible across runs.
train_set = sorted(
    os.path.join(path_data_train, fname)
    for fname in os.listdir(path_data_train)
    if fname.endswith("parquet")
)
valid_set = sorted(
    os.path.join(path_data_valid, fname)
    for fname in os.listdir(path_data_valid)
    if fname.endswith("parquet")
)

In [20]:
# Number of parquet files found for each split.
len(train_set), len(valid_set)

(1, 1)

In [21]:
# Show the training file paths.
train_set

['./ashrae_merge/train/train.parquet']

<h4>Grab column information</h4>

In [22]:
# Rebind `cols` to the columns of the merged training table; the cells below
# filter the feature lists against it and pass it to the dataset iterators.
cols = build_trw.columns

In [23]:
# Show the merged table's column names.
cols

Index(['site_id', 'building_id', 'primary_use', 'square_feet', 'year_built',
       'floor_count', 'timestamp', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed'],
      dtype='object')

In [None]:
# Candidate feature columns, keeping only those actually present in `cols`.
cat_candidates = ['site_id', 'building_id', 'primary_use', 'cloud_coverage' ]
cont_candidates = ['dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_speed', 'floor_count']
cat_names = [col for col in cat_candidates if col in cols]
cont_names = [col for col in cont_candidates if col in cols]

# NOTE(review): 'meter_reading' is not among the merged columns displayed
# above -- confirm where the target column is supposed to come from.
label_names = ['meter_reading']

In [None]:
# Show the selected continuous, categorical and label column lists alongside all columns.
cont_names, cat_names, label_names, cols

<h3>Preprocessing:</h3> <p>Select the operations to perform, create the preprocessor (Workflow) object, create the dataset iterator object, and collect statistics on the training dataset.</p>

In [None]:
%%time
proc = Workflow(cat_names=cat_names, cont_names=cont_names, label_name=['label'], stat_ops=[Moments(), Median(), Encoder()], to_cpu=to_cpu)

In [None]:
%%time
proc.add_preprocess(FillMissing())
proc.add_preprocess(Normalize())
proc.add_preprocess(Categorify())

In [None]:
%%time
trains_itrs = GPUDatasetIterator(train_set,names=cols, engine='csv')

In [None]:
%%time
# Feed the training iterator to the Workflow's statistics pass.
# Presumably this computes the stat_ops configured on `proc` (Moments, Median,
# Encoder) -- verify against the nv_tabular Workflow implementation.
proc.update_stats(trains_itrs)

In [None]:
# Inspect the statistics stored on the Workflow.
proc.stats

<h5>Derive the embedding sizes from the statistics gathered in the read phase.</h5>

In [None]:
# Ask the Categorify op for embedding sizes of the base categorical columns,
# then keep the second element of each returned entry.
categorify_op = proc.df_ops['Categorify']
emb_entries = categorify_op.get_emb_sz(
    proc.stats["categories"],
    proc.columns_ctx['categorical']['base'],
)
embeddings = [entry[1] for entry in emb_entries]

<h5>Create the file iterators using the FileItrDataset class.</h5>

In [None]:
%%time
t_batch_sets = [FileItrDataset(x, names=cols, engine='csv', batch_size=400000) for x in train_set]
v_batch_sets = [FileItrDataset(x, names=cols, engine='csv', batch_size=400000) for x in valid_set]

In [None]:
%%time
t_chain = torch.utils.data.ChainDataset(t_batch_sets)
v_chain = torch.utils.data.ChainDataset(v_batch_sets)

<h5>Use the Deep Learning Collator to create a collate function to pass to the dataloader.</h5>

In [None]:
%%time
# Wrap the Workflow in a DLCollator; its gdf_col method is passed as the
# collate function to the data loaders and the DataBunch below.
dlc = DLCollator(preproc=proc)

In [None]:
%%time
t_data = DLDataLoader(t_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)
v_data = DLDataLoader(v_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)

<h4>After creating the data loaders, you can use the fastai framework to build machine learning models.</h4>

In [None]:
# Bundle the training and validation loaders into a fastai DataBunch that
# targets the CUDA device and uses the DLCollator's gdf_col as collate function.
databunch = DataBunch(t_data, v_data, collate_fn=dlc.gdf_col, device="cuda")

In [None]:
%%time
model = TabularModel(emb_szs = embeddings, n_cont=len(cont_names), out_sz=2, layers=[512,256])

learn =  Learner(databunch, model, metrics=[accuracy])
learn.loss_func = torch.nn.CrossEntropyLoss()


In [None]:
# Run the learner's learning-rate finder; its results are plotted in the next cell.
learn.lr_find()

In [None]:
# Plot the recorder's results with momentum shown and a suggested learning rate marked.
learn.recorder.plot(show_moms=True, suggestion=True)

In [None]:
# Training hyper-parameters used by the fit_one_cycle call in the next cell.
learning_rate, epochs = 1.32e-2, 1

In [None]:
# Train for `epochs` cycles at `learning_rate` and measure the elapsed
# wall-clock time in seconds. t_final is stored but not displayed by this cell.
start = time()
learn.fit_one_cycle(epochs,learning_rate)
t_final = time() - start 

In [None]:
# Drop the references to the learner, model and data bundle, then ask torch
# to release its cached CUDA memory.
del learn, model, databunch
torch.cuda.empty_cache()