In [None]:
import os

# Pin this kernel to a single GPU. The variable is set before the next cell
# imports cudf/torch; presumably that ordering is what makes it take effect.
GPU_id = 5
os.environ['CUDA_VISIBLE_DEVICES'] = f"{GPU_id}"

In [None]:
import pickle
import warnings

import numpy as np
import nvstrings, nvcategory
import cudf
import torch

# NOTE(review): helpers used further down (e.g. `ifnone`, `is_listy`) are never
# imported by name in this notebook; presumably they arrive through one of the
# star imports below - confirm which one before trimming any of them.
from fastai import *
from fastai.basic_data import *
from librmm_cffi import librmm
from fastai_modified.core_cudf import *
from torch.utils.dlpack import from_dlpack
# NOTE(review): the name `LabelEncoder` is rebound by the class of the same name
# defined in a later cell, so the cuml implementation is not the one used below.
from cuml.preprocessing import LabelEncoder
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
%matplotlib inline
%reload_ext snakeviz

In [None]:
def _enforce_str(y: cudf.Series) -> cudf.Series:
    """
    Return `y` in a form nvcategory can consume, i.e. as strings.

    A series whose dtype is already "object" is handed back untouched;
    any other dtype is cast with `astype("str")`.
    """
    needs_cast = y.dtype != "object"
    return y.astype("str") if needs_cast else y

class LabelEncoder(object):
    """
    Label encoder backed by nvcategory.

    `fit` learns the category keys of a cudf series, `transform` maps a series
    onto the learned keys, and `fit_transform` does both for the same series.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used.
        self._cats: nvcategory.nvcategory = None
        self._dtype = None
        self._fitted: bool = False

    def _check_is_fitted(self):
        """Raise TypeError unless the encoder has been fitted."""
        if self._fitted:
            return
        raise TypeError("Model must first be .fit()")

    def _learn_categories(self, y: cudf.Series) -> cudf.Series:
        """
        Record the dtype of `y`, build the nvcategory from its string form and
        mark the encoder as fitted. Returns the string-typed series.
        """
        self._dtype = y.dtype

        # Convert y to a string series, if it isn't one
        as_str = _enforce_str(y)

        # Noted by the original author as the bottleneck, despite everything
        # being done on the device
        self._cats = nvcategory.from_strings(as_str.data)

        self._fitted = True
        return as_str

    def fit(self, y: cudf.Series) -> "LabelEncoder":
        """Learn the categories present in `y` and return the encoder itself."""
        self._learn_categories(y)
        return self

    def transform(self, y: cudf.Series) -> cudf.Series:
        """
        Encode `y` against the keys learned during fitting.

        Raises TypeError when called before `fit` / `fit_transform`.
        """
        self._check_is_fitted()
        as_str = _enforce_str(y)
        incoming = nvcategory.from_strings(as_str.data)
        remapped = incoming.set_keys(self._cats.keys())
        codes = cudf.Series(remapped.values())
        # Any code equal to -1 is rewritten to 0 (presumably -1 marks values
        # that are absent from the fitted keys - confirm against nvcategory).
        return codes.replace(-1, 0)

    def fit_transform(self, y: cudf.Series) -> cudf.Series:
        """Fit on `y` and return its integer codes as an int32 series."""
        as_str = self._learn_categories(y)
        n_rows = as_str.data.size()
        # The codes are written straight into a preallocated device buffer.
        codes = librmm.device_array(n_rows, dtype=np.int32)
        self._cats.values(devptr=codes.device_ctypes_pointer.value)
        return cudf.Series(codes)

    def inverse_transform(self, y: cudf.Series) -> cudf.Series:
        """Not implemented."""
        raise NotImplementedError

In [None]:
class TabularProc:
    """Base class for a preprocessing step applied to a cudf DataFrame."""

    def process_df(self, gdf: cudf.DataFrame, train: bool, cat_names: list, cont_names: list):
        """
        Apply this step to `gdf`; every subclass has to override it.

        gdf: frame to process.
        train: flag the subclasses in this notebook use to decide whether to
            (re)learn their statistics from `gdf`.
        cat_names / cont_names: names of the categorical / continuous columns.
        """
        raise NotImplementedError

class Normalize(TabularProc):
    "Normalize the continuous variables."

    def __init__(self, eps: float = 1e-7):
        """
        eps: small constant added to the standard deviation so that constant
            columns do not cause a division by zero.
        """
        self.eps = eps
        # Statistics live on the instance. As class-level dicts they were
        # shared (and mutated) by every Normalize object in the process.
        self.means, self.stds = {}, {}

    def process_df(self, gdf: cudf.DataFrame, train: bool, cat_names: list, cont_names: list):
        """
        Standardize each column in `cont_names` in place and cast it to float32.

        With `train=True` the mean and standard deviation of every listed
        column are (re)computed from `gdf` and stored; otherwise the stored
        values are reused.

        Raises KeyError if a column has no stored statistics.
        """
        if train:
            # Column-by-column statistics: works for an empty `cont_names` and
            # does not depend on being able to iterate a reduction result.
            for name in cont_names:
                self.means[name] = gdf[name].mean()
                self.stds[name] = gdf[name].std()
        for name in cont_names:
            if name not in self.means or name not in self.stds:
                raise KeyError(
                    f"No training statistics for column {name!r}; "
                    "call process_df with train=True first"
                )
            scaled = (gdf[name] - self.means[name]) / (self.eps + self.stds[name])
            gdf[name] = scaled.astype('float32')

class FillMissing(TabularProc):
    """
    Fill missing values in the continuous columns.

    fill_strategy: `FillMissing.MEDIAN`, `FillMissing.CONSTANT`, or anything
        else to use the most frequent value of the column.
    fill_val: value used by the CONSTANT strategy (and as the median of a
        column that has no non-missing value at all).
    add_col: when true, a boolean `<name>_na` column is added for every filled
        column and registered in `cat_names`.
    """
    MEDIAN = "median"
    CONSTANT = "constant"

    def __init__(self, fill_strategy=MEDIAN, fill_val=0, add_col=True):
        self.fill_strategy = fill_strategy
        self.fill_val = fill_val
        self.add_col = add_col
        self.train_cont_names_na = []   # columns that had missing values in training
        self.filler = {}                # column name -> value learned in training

    def process_df(self, gdf: cudf.DataFrame, train: bool, cat_names: list, cont_names: list):
        """
        Fill missing values of `gdf` in place.

        train=True: learn a fill value for every continuous column that has
            missing values in `gdf`, then fill those columns.
        train=False: fill every continuous column of `gdf` for which a value
            was learned, using that stored value. Indicator columns are added
            for all of them, even when this frame happens to have no missing
            value there, so the categorical schema matches the training one.

        Raises Exception if, with train=False, a column has missing values
        although it had none during training.
        """
        na_names = [name for name in cont_names if gdf[name].isna().sum()]
        if train:
            target_names = na_names
            for name in na_names:
                if name not in self.train_cont_names_na:
                    self.train_cont_names_na.append(name)
                self.filler[name] = self._fill_value_for(gdf[name])
        else:
            unseen = [name for name in na_names if name not in self.filler]
            if unseen:
                raise Exception(
                    f"There are nan values in field {unseen} but there were none "
                    "in the training set. Please fix those manually."
                )
            target_names = [name for name in cont_names if name in self.filler]
        if not target_names: return
        # Indicators must be computed before the missing values are overwritten.
        if self.add_col: self.add_na_indicators(gdf, target_names, cat_names)
        # Assign each filled column back: `gdf[names].fillna(..., inplace=True)`
        # only modified a temporary frame and left `gdf` untouched.
        for name in target_names:
            gdf[name] = gdf[name].fillna(self.filler[name])

    def _fill_value_for(self, col: cudf.Series):
        "Compute the fill value of one column according to `fill_strategy`."
        if self.fill_strategy == self.MEDIAN:
            return self.get_median(col)
        if self.fill_strategy == self.CONSTANT:
            return self.fill_val
        # Any other strategy: first entry of value_counts().
        return col.value_counts().index[0]

    def add_na_indicators(self, gdf: cudf.DataFrame, na_names, cat_names):
        "Add a `<name>_na` missing-value flag per column and register it in `cat_names`."
        for name in na_names:
            name_na = name + "_na"
            gdf[name_na] = gdf[name].isna()
            if name_na not in cat_names: cat_names.append(name_na)

    def get_median(self, col: cudf.Series):
        """
        Return the middle element (upper one for an even count) of the
        non-missing values of `col`, or `fill_val` if there are none.
        """
        # Sort first, then renumber: element len//2 is then the median whether
        # integer indexing is positional or label based.
        col = col.dropna().sort_values().reset_index(drop=True)
        if len(col) == 0: return self.fill_val
        return col[len(col)//2]

class Categorify(TabularProc):
    "Transform the categorical variables to that type."

    def __init__(self):
        # Per-instance state. As class-level containers these were shared by
        # every Categorify object, so one pipeline could overwrite another's.
        self.category_encoders = {}   # column name -> fitted LabelEncoder
        self.embed_sz = {}            # column name -> chosen embedding width
        self.cat_names = []           # columns seen so far, in first-seen order

    def process_df(self, gdf: cudf.DataFrame, train: bool, cat_names: list, cont_names: list):
        """
        Replace every column in `cat_names` by its int64 category codes, in place.

        With `train=True` a new encoder is fitted per column and stored;
        otherwise the stored encoder is applied (KeyError if there is none).
        """
        for name in cat_names:
            # Register each column once. Extending unconditionally produced
            # duplicates, and hence duplicated entries in get_emb_sz(), as soon
            # as the same columns were processed twice (e.g. train, then valid).
            if name not in self.cat_names: self.cat_names.append(name)
            # A trailing None is appended before encoding and dropped again
            # afterwards (presumably so a null category always exists - confirm).
            padded = gdf[name].append([None])
            if train:
                le = LabelEncoder()
                self.category_encoders[name] = le
                codes = le.fit_transform(padded)
            else:
                codes = self.category_encoders[name].transform(padded)
            gdf[name] = codes[:-1].astype('int64')

    def get_emb_sz(self):
        "Return a `(n_categories, embedding_size)` pair per column in `cat_names`."
        # One more than the number of learned keys per encoder.
        classes = {name: len(enc._cats.keys()) + 1
                   for name, enc in self.category_encoders.items()}
        return [self.def_emb_sz(classes, name) for name in self.cat_names]

    def emb_sz_rule(self, n_cat: int) -> int:
        "Rule of thumb for the embedding size of `n_cat` categories, capped at 600."
        return min(600, round(1.6 * n_cat ** 0.56))

    def def_emb_sz(self, classes, n, sz_dict=None):
        "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
        sz_dict = ifnone(sz_dict, {})
        n_cat = classes[n]
        sz = sz_dict.get(n, int(self.emb_sz_rule(n_cat)))  # rule of thumb
        self.embed_sz[n] = sz
        return n_cat, sz

In [None]:
class Preprocess():
    """
    Read parquet files on the GPU, run a list of TabularProc steps over them
    and collect the result as torch tensors.
    """

    def __init__(self, cat_names, cont_names, label_name, preprocessors: list, to_cpu=True):
        """
        cat_names / cont_names: names of the categorical / continuous columns.
        label_name: one column name or a list/tuple of names.
        preprocessors: TabularProc instances, applied in list order.
        to_cpu: move every produced tensor to the CPU when true.
        """
        self.cat_names, self.cont_names, self.label_name = cat_names, cont_names, label_name
        self.to_cpu = to_cpu
        self.preprocessors = preprocessors
        self.cpu = torch.device("cpu")
        self.cats, self.conts, self.label = [], [], []

    def preproc_files(self, files, train=True):
        """
        Process every file in `files` and return `((cats, conts), label)`.

        The per-file tensors of each group are concatenated along dim=1; a
        group for which no tensor was produced is returned as None.

        Raises Exception if an entry of `preprocessors` is not a TabularProc.
        """
        # Validate once, before any file is read or modified.
        for proc in self.preprocessors:
            if not isinstance(proc, TabularProc):
                raise Exception(f"{proc} is not a valid tabular processor")
        # Start from empty lists so a second call (e.g. a validation set after
        # the training set) does not also concatenate the previous call's tensors.
        self.cats, self.conts, self.label = [], [], []
        for file in files:
            gdf = cudf.read_parquet(file)
            # Only the columns actually present in this file, in file order.
            gdf_cat_names = [n for n in gdf.columns if n in self.cat_names]
            gdf_cont_names = [n for n in gdf.columns if n in self.cont_names]
            if is_listy(self.label_name):
                gdf_label_names = [n for n in gdf.columns if n in self.label_name]
            elif self.label_name in gdf.columns:
                gdf_label_names = [self.label_name]
            else:
                gdf_label_names = []
            # Processors work in place and may extend the name lists.
            for proc in self.preprocessors:
                proc.process_df(gdf, train, gdf_cat_names, gdf_cont_names)
            for n in gdf_label_names:
                gdf[n] = gdf[n].astype('float32')
            gdf_cats, gdf_conts, gdf_label = gdf[gdf_cat_names], gdf[gdf_cont_names], gdf[gdf_label_names]
            del gdf
            self.to_tensor(gdf_cats, torch.long, self.cats)
            self.to_tensor(gdf_conts, torch.float32, self.conts)
            self.to_tensor(gdf_label, torch.float32, self.label)
            del gdf_cats, gdf_conts, gdf_label
        # A conditional expression needs an `else`; without it these three
        # lines were a SyntaxError.
        cats = torch.cat(self.cats, dim=1) if self.cats else None
        conts = torch.cat(self.conts, dim=1) if self.conts else None
        label = torch.cat(self.label, dim=1) if self.label else None
        return (cats, conts), label

    def to_tensor(self, gdf: cudf.DataFrame, dtype, tensor_list):
        """
        Convert `gdf` to a tensor of `dtype` through DLPack and append it to
        `tensor_list`. Frames without rows or without columns are skipped.
        """
        if gdf.shape[0] == 0 or gdf.shape[1] == 0: return
        t = from_dlpack(gdf.to_dlpack()).type(dtype)
        # Give a single-column frame an explicit column axis (assumes its
        # DLPack export is one-dimensional - TODO confirm).
        if gdf.shape[1] == 1: t = t.unsqueeze(1)
        if self.to_cpu: t = t.to(self.cpu)
        tensor_list.append(t)

<h2> <center> Preprocessing </center> </h2>

In [None]:
# test = cudf.read_parquet('test.parquet')
# idxs = torch.randperm(len(test.columns))
# j = 0
# files = []
# stride = 20
# for i in range(0, test.shape[1], stride):
#     sub_columns = test.columns[idxs[i:i+stride]]
#     file = f'cache/train_sub_{j}.parquet'
#     test[sub_columns].to_pandas().to_parquet(file)
#     files.append(file)
#     j += 1

In [None]:
# Column roles (categorical vs. continuous) are read from a pickled dict.
# NOTE(review): unpickling executes arbitrary code contained in the file, so
# only load a cache you produced yourself.
with open('cache/col_names.pkl', 'rb') as f:
    col_names = pickle.load(f)
cat_names, cont_names = col_names['cat_names'], col_names['cont_names']

In [None]:
# The processors are applied in list order to every file.
proc_list = [FillMissing(), Normalize(), Categorify()]
preprocessor = Preprocess(
    cat_names=cat_names,
    cont_names=cont_names,
    label_name='target',
    preprocessors=proc_list,
)

In [None]:
%%time
files = ['train.parquet']
(cats, conts), label = preprocessor.preproc_files(files)