In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
import os
# Select the CUDA device exposed to this process (the environment variable
# expects a string, hence the formatting).
GPU_id = 0
os.environ['CUDA_VISIBLE_DEVICES'] = f'{GPU_id}'

In [None]:
import sys
# Add the parent directory to the module search path at index 1.
sys.path[1:1] = ['../']

In [None]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf
import nv_tabular as nvt
from nv_tabular.ops import Normalize, FillMissing, Categorify, Moments, Median, Encoder, LogOp, ZeroFill
from nv_tabular.batchloader import FileItrDataset, DLCollator, DLDataLoader
import warnings

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#%load_ext snakeviz
# load snakeviz if you want to run profiling

In [None]:
# Flag forwarded to nvt.Workflow(to_cpu=...) further down. It must be defined
# here, otherwise the Workflow cell raises NameError on a fresh kernel.
# NOTE(review): True is the value the original (commented-out) line used —
# confirm it is the intended setting for this run.
to_cpu = True

# <h3> Dataset Gathering: Define files in the training and validation datasets. </h3>

In [None]:

# # data_path = '/rapids/notebooks/jperez/Documents/ds-itr/examples/'
# data_path = '/datasets/criteo/criteo_embed/'
# #df_test = 'test/'
# df_valid = ''
# df_train = ''
# split = 332
# end = 332
#train_set = [data_path + df_train + x for x in os.listdir(data_path + df_train) if  x.endswith('parquet') and int(x.split(".")[0]) in train_days] 
#valid_set = [data_path + df_valid + x for x in os.listdir(data_path + df_valid) if  x.endswith('parquet') and int(x.split(".")[0]) in valid_days]

In [None]:
# data_path = '/rapids/notebooks/jperez/Documents/ds-itr/examples/'
# data_path = '/datasets/criteo/raw_csvs/split_train_data_parquet/'
data_path = '/home/oyilmaz/Documents/recsys_data/criteo_embed/'
#df_test = 'test/'
df_valid = ''
df_train = ''

# Slice offsets into the file listing: [start, split) is used for training,
# [split, fin) for validation.
start = 0
split = 3
fin = 5

# Human-readable labels for the two ranges (only printed, not used for filtering).
train_days = ["day_" + str(x) for x in range(start, split)]
valid_days = ["day_" + str(x) for x in range(split, fin)]
print(train_days, valid_days)

# os.listdir returns entries in arbitrary order, so the listing is sorted before
# slicing to make the train/validation split reproducible across runs and machines.
# The sort is lexicographic on the file name (e.g. "day_10" sorts before "day_2").
parquet_files = sorted(
    data_path + df_train + x
    for x in os.listdir(data_path + df_train)
    if x.endswith("parquet")
)
train_set = parquet_files[start:split]
valid_set = parquet_files[split:fin]


In [None]:
# Number of files in the training and validation lists, in that order.
tuple(len(file_list) for file_list in (train_set, valid_set))

<h4>Grab column information</h4>

In [None]:
# Column names: continuous features I1..I13 and categorical features C1..C23.
# NOTE(review): the public Criteo schema has 26 categorical columns — confirm
# that 23 matches the files read by this notebook.
cont_names = [f'I{idx}' for idx in range(1, 14)]
cat_names = [f'C{idx}' for idx in range(1, 24)]
cat_names, cont_names

In [None]:
# Full column list: the label first, then the continuous and categorical names.
cols = ['label', *cont_names, *cat_names]
cols

<h3>Preprocessing:</h3> <p>Select the operations to perform, create the Preprocessor object, create the dataset iterator objects, and collect the stats on the training dataset.</p>

In [None]:
%%time
proc = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=['label'], to_cpu=to_cpu)

In [None]:
%%time
proc.add_cont_feature([ZeroFill(replace=True), LogOp(replace=True)])
proc.add_cont_preprocess(Normalize(replace=True))
proc.add_cat_preprocess(Categorify(replace=True, use_frequency=True, freq_threshold=15))

In [None]:
%%time
trains_itrs = nvt.dataset(train_set, names=cols, engine='parquet', gpu_memory_frac=0.1)
valids_itrs = nvt.dataset(valid_set, names=cols, engine='parquet', gpu_memory_frac=0.1)

In [None]:
# Output directories passed to proc.apply(output_path=...).
# Both names currently point at the same directory.
output_train = output_valid = '/home/oyilmaz/Documents/recsys_data/outputs/'

In [None]:
%%time 
proc.apply(trains_itrs, apply_offline=True, record_stats=True, shuffle=True, output_path=output_train, num_out_files=30)

In [None]:
%%time
# The validation-set apply is currently disabled, so this cell does no work.
# Uncomment the call below to run the workflow over valids_itrs and write to output_valid.
#proc.apply(valids_itrs, apply_offline=True, record_stats=False, shuffle=True, output_path=output_valid, num_out_files=40)

In [None]:
# Collect the parquet files found in each output directory.
new_train_set = [
    os.path.join(output_train, fname)
    for fname in os.listdir(output_train)
    if fname.endswith("parquet")
]
new_valid_set = [
    os.path.join(output_valid, fname)
    for fname in os.listdir(output_valid)
    if fname.endswith("parquet")
]

<h5>Gather embeddings using statistics gathered in the Read phase.</h5>

In [None]:
# Ask the Categorify op for embedding sizes, using the recorded "categories"
# stats and the base categorical columns; keep element [1] of each returned entry.
categorify_op = proc.df_ops['Categorify']
emb_sizes = categorify_op.get_emb_sz(
    proc.stats["categories"],
    proc.columns_ctx['categorical']['base'],
)
embeddings = [entry[1] for entry in emb_sizes]

<h5>Create the file iterators using the FileItrDataset Class.</h5>

In [None]:
%%time
t_batch_sets = [FileItrDataset(x, names=cols, engine='parquet', batch_size=1000, sep="\t") for x in train_set]
v_batch_sets = [FileItrDataset(x, names=cols, engine='parquet', batch_size=10000, sep="\t") for x in valid_set]

In [None]:
%%time
t_chain = torch.utils.data.ChainDataset(t_batch_sets)
v_chain = torch.utils.data.ChainDataset(v_batch_sets)

<h5>Use the Deep Learning Collator to create a collate function to pass to the dataloader.</h5>

In [None]:
%%time
# Build the collator around the workflow; its `gdf_col` method is used as the
# collate function for the data loaders.
dlc = DLCollator(preproc=proc)

In [None]:
%%time
t_data = DLDataLoader(t_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)
v_data = DLDataLoader(v_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)

<h4>After creating the dataloaders, you can leverage the fastai framework to create machine learning models.</h4>

In [None]:
# Wrap the two loaders in a fastai DataBunch on the CUDA device.
databunch = DataBunch(
    t_data,
    v_data,
    collate_fn=dlc.gdf_col,
    device="cuda",
)

In [None]:
%%time
model = TabularModel(emb_szs = embeddings, n_cont=len(cont_names), out_sz=2, layers=[512,256])

learn =  Learner(databunch, model, metrics=[accuracy])
learn.loss_func = torch.nn.CrossEntropyLoss()


In [None]:
# Run the learning-rate finder; its results are plotted in the next cell.
learn.lr_find()

In [None]:
# Plot the recorder output with momentum display and the suggestion marker enabled.
learn.recorder.plot(show_moms=True, suggestion=True)

In [None]:
# Training hyperparameters used by the fit_one_cycle call.
learning_rate, epochs = 1.32e-2, 1

In [None]:
# Time the training run; the elapsed seconds end up in t_final.
# NOTE: this rebinds `start`, which an earlier cell uses as a slice offset.
start = time()
learn.fit_one_cycle(epochs, max_lr=learning_rate)
t_final = time() - start

In [None]:
# Drop the references to the learner, model and data bunch, then ask PyTorch
# to release its cached CUDA memory.
del learn, model, databunch
torch.cuda.empty_cache()

#### 