# Load Libraries

In [1]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

print('RAPIDS version',cudf.__version__)

RAPIDS version 21.10.01


In [2]:
# VERSION NAME FOR SAVED MODEL FILES
VER = 2

# TRAIN RANDOM SEED
SEED = 42

# FILL NAN VALUE
# NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5

In [3]:
def read_file(text_embedding_path, image_embedding_path, path = '', usecols = None):
    # LOAD DATAFRAME
    if usecols is not None: df = cudf.read_csv(path, columns=usecols)
    else: df = cudf.read_csv(path)
    text_embeddings = np.load(text_embedding_path)
    image_embeddings = np.load(image_embedding_path) 
    img_emb, text_emb = pd.DataFrame(image_embeddings), pd.DataFrame(text_embeddings)
    net_emb = pd.concat([img_emb, text_emb], axis = 1)
    net_emb.columns = range(len(net_emb.columns))
    net_emb = cudf.from_pandas(net_emb)
    concat_pd = cudf.concat([df, net_emb], axis = 1)
    features_to_drop = ['id','date', 'media', 'content_processed', 'Link','image_path', 'username', 'inferred company','Media Type', 'Year']
    df = concat_pd.drop(columns = features_to_drop, inplace=False)
    print('shape of data:', df.shape)
    return df

print('Reading train data...')
TRAIN_PATH = '/kaggle/input/valid-dataset-adobe/valid_paths_data.csv'
IMG_EMB = '/kaggle/input/clipx32-batch-mpnetx8-batch/CLIP_Embeds.npy'
TEXT_EMB = '/kaggle/input/clipx32-batch-mpnetx8-batch/MPNET_Embeds.npy'
train = read_file(TEXT_EMB, IMG_EMB, path = TRAIN_PATH)

Reading train data...
shape of data: (295502, 1796)


In [4]:
# train = train.iloc[:295502:20]

In [5]:
train['likes'] = cupy.log(train['likes'] + 1e-10)
train

Unnamed: 0,likes,hour,month,day of week,0,1,2,3,4,5,...,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791
0,1.000000e-10,0,12,12,-0.246861,-0.199877,0.156696,0.243754,-0.125172,0.529467,...,0.001865,-0.016018,0.072187,0.005552,0.000850,0.042931,-0.003657,0.011115,-0.034018,-0.007277
1,7.919356e+00,10,6,30,-0.398804,-0.299322,-0.262163,0.024715,0.118844,0.304803,...,0.012844,0.045818,0.003134,0.006323,0.015455,0.047594,-0.005021,0.002013,-0.035605,0.008801
2,4.043051e+00,19,9,29,-0.091366,-0.176465,-0.028901,0.074262,0.259798,0.105130,...,0.025636,0.016661,-0.018063,0.026398,0.003947,0.027403,-0.004428,0.035896,-0.074444,-0.020568
3,5.023881e+00,11,10,1,0.536553,-1.086032,-0.033425,0.178087,-0.201594,0.012495,...,-0.027295,-0.022234,0.037092,-0.018010,-0.008328,-0.060657,0.036714,-0.029977,0.001185,-0.008736
4,3.713572e+00,14,10,19,-0.383302,-0.929385,-0.222161,-0.152471,0.040520,-0.104436,...,-0.073161,0.031506,0.002639,0.050263,-0.040711,-0.022616,-0.011846,0.014554,0.034135,-0.011808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295497,-2.302585e+01,16,9,7,0.327817,-0.784044,-0.088061,-0.219255,-0.257949,-0.323234,...,0.017004,0.002473,0.009592,0.011323,-0.039135,-0.038760,0.016228,-0.038038,-0.011835,-0.001271
295498,3.828641e+00,11,2,23,0.116493,-0.262871,0.106194,0.574954,-0.135060,-0.067277,...,-0.019538,0.014360,0.017379,-0.019374,-0.011795,-0.027759,0.004184,0.019091,0.053744,0.003318
295499,5.564520e+00,20,11,11,-0.302513,-0.420806,0.207896,-0.177037,0.027661,0.334599,...,0.006432,0.011452,0.040904,-0.006894,-0.039816,0.009070,-0.002368,0.045965,-0.030693,0.000339
295500,4.779123e+00,10,10,29,-0.180378,-0.354582,-0.260322,-0.182210,-0.304582,0.390902,...,-0.025157,-0.000011,-0.043119,0.020535,-0.016441,0.071141,-0.003668,0.053085,0.012343,0.007963


In [6]:
# # ADD TARGETS
# targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
# targets = targets.set_index('customer_ID')
# train = train.merge(targets, left_index=True, right_index=True, how='left')
# train.target = train.target.astype('int8')
# del targets

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
# train = train.sort_index().reset_index()

# FEATURES
FEATURES = train.columns[1:]
targets = train.columns[0]
print(f'There are {len(FEATURES)} features!')
print(f'This is {targets} Target!')

There are 1795 features!
This is likes Target!


In [7]:
FEATURES

Index([       'hour',       'month', 'day of week',             0,
                   1,             2,             3,             4,
                   5,             6,
       ...
                1782,          1783,          1784,          1785,
                1786,          1787,          1788,          1789,
                1790,          1791],
      dtype='object', length=1795)

# Train XGB
We will train using `DeviceQuantileDMatrix`. This has a very small GPU memory footprint.

In [8]:
# LOAD XGB LIBRARY
from sklearn.model_selection import KFold
import xgboost as xgb

# XGB MODEL PARAMETERS
xgb_parms = { 
    'max_depth': 8, 
    'learning_rate':0.01, 
    'subsample':0.8,
    'colsample_bytree':0.6,
    'tree_method':'gpu_hist',
    'predictor':'gpu_predictor',
    'random_state':SEED,
    'lambda': 0.01,
}

In [9]:
# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.features = features
        self.target = target
        self.df = df
        self.it = 0 # set iterator to 0
        self.batch_size = batch_size
        self.batches = int( np.ceil( len(df) / self.batch_size ) )
        super().__init__()

    def reset(self):
        '''Reset the iterator'''
        self.it = 0

    def next(self, input_data):
        '''Yield next batch of data.'''
        if self.it == self.batches:
            return 0 # Return 0 when there's no more batch.
        
        a = self.it * self.batch_size
        b = min( (self.it + 1) * self.batch_size, len(self.df) )
        dt = cudf.DataFrame(self.df.iloc[a:b])
        input_data(data=dt[self.features], label=dt[self.target]) #, weight=dt['weight'])
        self.it += 1
        return 1

In [10]:

if isinstance(train, pd.DataFrame):
    print("It's a pandas DataFrame.")
else:
    print("It's not a pandas DataFrame.")

# Check if it's a cuDF DataFrame
if isinstance(train, cudf.DataFrame):
    print("It's a cuDF DataFrame.")
else:
    print("It's not a cuDF DataFrame.")


It's not a pandas DataFrame.
It's a cuDF DataFrame.


In [11]:
train.target = train.likes

In [12]:
import pandas
import cudf
from sklearn.metrics import mean_squared_error
from math import sqrt
importances = []
oof = []

train = train.to_pandas() # free GPU memory
TRAIN_SUBSAMPLE = 0.7
gc.collect()

skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.likes )):
    
    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE<1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, 
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)
        np.random.seed(None)
    
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    
    # TRAIN, VALID, TEST FOR FOLD K
    Xy_train = IterLoadForDMatrix(train.loc[train_idx], FEATURES, 'likes')
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, 'likes']
    
    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)
    
    # TRAIN MODEL FOLD K
    model = xgb.train(xgb_parms, 
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dvalid,'valid')],
                num_boost_round=1999,
                early_stopping_rounds=100,
                verbose_eval=100) 
    
    model.save_model(f'XGB_v{VER}_fold{fold}.xgb')
    
#     # GET FEATURE IMPORTANCE FOR FOLD K
#     dd = model.get_score(importance_type='weight')
#     df = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
#     importances.append(df)
            
    # INFER OOF FOLD K
    oof_preds = model.predict(dvalid)
#     y_valid.values = np.power(2,y_valid.values)
#     oof_preds = np.power(2, oof_preds)
    acc = sqrt(mean_squared_error(y_valid.values, oof_preds))
    print('RMSE =',acc,'\n')
    
    # SAVE OOF
    df = train.loc[valid_idx, ['likes'] ].copy()
    df['predictions'] = oof_preds
    oof.append( df )
    
    del dtrain, Xy_train, df
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()
    
print('#'*25)
# oof = pd.concat(oof,axis=0,ignore_index=True).set_index('likes')
# acc = sqrt(mean_squared_error(oof.likes.values, oof.oof_pred.values))
# print('OVERALL CV Kaggle Metric =',acc)

#########################
### Fold 1
### Train size 165480 Valid size 59101
### Training with 70% fold data...
#########################
[0]	train-rmse:9.58322	valid-rmse:9.48694
[100]	train-rmse:6.71918	valid-rmse:6.89055
[200]	train-rmse:5.68581	valid-rmse:6.09412
[300]	train-rmse:5.18736	valid-rmse:5.80108
[400]	train-rmse:4.87693	valid-rmse:5.67084
[500]	train-rmse:4.64771	valid-rmse:5.59952
[600]	train-rmse:4.45746	valid-rmse:5.55421
[700]	train-rmse:4.28933	valid-rmse:5.52222
[800]	train-rmse:4.13454	valid-rmse:5.49692
[900]	train-rmse:3.99332	valid-rmse:5.47833
[1000]	train-rmse:3.86346	valid-rmse:5.46316
[1100]	train-rmse:3.74545	valid-rmse:5.45084
[1200]	train-rmse:3.63383	valid-rmse:5.44060
[1300]	train-rmse:3.52945	valid-rmse:5.43175
[1400]	train-rmse:3.42767	valid-rmse:5.42415
[1500]	train-rmse:3.33518	valid-rmse:5.41765
[1600]	train-rmse:3.24852	valid-rmse:5.41157
[1700]	train-rmse:3.16479	valid-rmse:5.40563
[1800]	train-rmse:3.08754	valid-rmse:5.40060
[1900]	train-rmse:3.

In [13]:
# CLEAN RAM
del train
_ = gc.collect()