# Load Libraries

In [1]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

print('RAPIDS version',cudf.__version__)

RAPIDS version 21.10.01


In [2]:
# VERSION NAME FOR SAVED MODEL FILES
VER = 2

# TRAIN RANDOM SEED
SEED = 42

# FILL NAN VALUE
# NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5

In [3]:
def read_file(text_embedding_path, image_embedding_path, path = '', usecols = None):
    # LOAD DATAFRAME
    if usecols is not None: df = cudf.read_csv(path, columns=usecols)
    else: df = cudf.read_csv(path)
    text_embeddings = np.load(text_embedding_path)
    image_embeddings = np.load(image_embedding_path) 
    img_emb, text_emb = pd.DataFrame(image_embeddings), pd.DataFrame(text_embeddings)
    net_emb = pd.concat([img_emb, text_emb], axis = 1)
    net_emb.columns = range(len(net_emb.columns))
    net_emb = cudf.from_pandas(net_emb)
    concat_pd = cudf.concat([df, net_emb], axis = 1)
    features_to_drop = ['date', 'media', 'content_processed', 'Link','image_path', 'username', 'inferred company','Media Type']
    df = concat_pd.drop(columns = features_to_drop, inplace=False)
    print('shape of data:', df.shape)
    return df

print('Reading train data...')
TRAIN_PATH = '/kaggle/input/valid-dataset-adobe/valid_paths_data.csv'
IMG_EMB = '/kaggle/input/clipx32-batch-mpnetx8-batch/CLIP_Embeds.npy'
TEXT_EMB = '/kaggle/input/clipx32-batch-mpnetx8-batch/MPNET_Embeds.npy'
train = read_file(TEXT_EMB, IMG_EMB, path = TRAIN_PATH)

Reading train data...
shape of data: (295502, 1798)


In [4]:
train['likes'] = cupy.log(train['likes'] + 1e-10)
train

Unnamed: 0,id,likes,hour,month,day of week,Year,0,1,2,3,...,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791
0,1,1.000000e-10,0,12,12,2020,-0.246861,-0.199877,0.156696,0.243754,...,0.001865,-0.016018,0.072187,0.005552,0.000850,0.042931,-0.003657,0.011115,-0.034018,-0.007277
1,2,7.919356e+00,10,6,30,2018,-0.398804,-0.299322,-0.262163,0.024715,...,0.012844,0.045818,0.003134,0.006323,0.015455,0.047594,-0.005021,0.002013,-0.035605,0.008801
2,3,4.043051e+00,19,9,29,2020,-0.091366,-0.176465,-0.028901,0.074262,...,0.025636,0.016661,-0.018063,0.026398,0.003947,0.027403,-0.004428,0.035896,-0.074444,-0.020568
3,4,5.023881e+00,11,10,1,2020,0.536553,-1.086032,-0.033425,0.178087,...,-0.027295,-0.022234,0.037092,-0.018010,-0.008328,-0.060657,0.036714,-0.029977,0.001185,-0.008736
4,5,3.713572e+00,14,10,19,2018,-0.383302,-0.929385,-0.222161,-0.152471,...,-0.073161,0.031506,0.002639,0.050263,-0.040711,-0.022616,-0.011846,0.014554,0.034135,-0.011808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295497,299996,-2.302585e+01,16,9,7,2019,0.327817,-0.784044,-0.088061,-0.219255,...,0.017004,0.002473,0.009592,0.011323,-0.039135,-0.038760,0.016228,-0.038038,-0.011835,-0.001271
295498,299997,3.828641e+00,11,2,23,2018,0.116493,-0.262871,0.106194,0.574954,...,-0.019538,0.014360,0.017379,-0.019374,-0.011795,-0.027759,0.004184,0.019091,0.053744,0.003318
295499,299998,5.564520e+00,20,11,11,2020,-0.302513,-0.420806,0.207896,-0.177037,...,0.006432,0.011452,0.040904,-0.006894,-0.039816,0.009070,-0.002368,0.045965,-0.030693,0.000339
295500,299999,4.779123e+00,10,10,29,2019,-0.180378,-0.354582,-0.260322,-0.182210,...,-0.025157,-0.000011,-0.043119,0.020535,-0.016441,0.071141,-0.003668,0.053085,0.012343,0.007963


In [5]:
# FEATURES
FEATURES = train.columns[1:]
targets = train.columns[1]
print(f'There are {len(FEATURES)} features!')
print(f'This is {targets} Target!')

There are 1797 features!
This is likes Target!


# Train XGB
We will train using `DeviceQuantileDMatrix`. This has a very small GPU memory footprint.

In [6]:
# LOAD XGB LIBRARY
from sklearn.model_selection import KFold
import xgboost as xgb

# XGB MODEL PARAMETERS
xgb_parms = { 
    'max_depth': 3, 
    'learning_rate':0.01, 
    'subsample':0.8,
    'colsample_bytree':0.6,
    'tree_method':'gpu_hist',
    'predictor':'gpu_predictor',
    'random_state':SEED
}

In [7]:
# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        """
        Custom iterator for loading data into a DeviceQuantileDMatrix in XGBoost.

        Args:
        - df (pd.DataFrame): DataFrame containing the data.
        - features (list): List of feature column names.
        - target (str): Name of the target column.
        - batch_size (int): Size of each batch (default is 256 * 1024).
        """
        self.features = features
        self.target = target
        self.df = df
        self.it = 0  # set iterator to 0
        self.batch_size = batch_size
        self.batches = int(np.ceil(len(df) / self.batch_size))
        super().__init__()

    def reset(self):
        '''Reset the iterator.'''
        self.it = 0

    def next(self, input_data):
        '''Yield next batch of data.'''
        if self.it == self.batches:
            return 0  # Return 0 when there's no more batch.

        a = self.it * self.batch_size
        b = min((self.it + 1) * self.batch_size, len(self.df))
        dt = cudf.DataFrame(self.df.iloc[a:b])
        
        # Provide the next batch of data to the input_data callback
        input_data(data=dt[self.features], label=dt[self.target])  #, weight=dt['weight'])
        
        self.it += 1
        return 1


In [8]:

if isinstance(train, pd.DataFrame):
    print("It's a pandas DataFrame.")
else:
    print("It's not a pandas DataFrame.")

# Check if it's a cuDF DataFrame
if isinstance(train, cudf.DataFrame):
    print("It's a cuDF DataFrame.")
else:
    print("It's not a cuDF DataFrame.")


It's not a pandas DataFrame.
It's a cuDF DataFrame.


In [9]:
train.target = train.likes

In [10]:
import pandas
import cudf
from sklearn.metrics import mean_squared_error
from math import sqrt
importances = []
oof = []

# Convert cudf DataFrame to pandas DataFrame
train = train.to_pandas()  # free GPU memory

# Define the subsample ratio for training
TRAIN_SUBSAMPLE = 1.0

# Perform garbage collection to free up GPU memory
gc.collect()

# Initialize KFold with the specified number of folds
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# Loop through each fold in the KFold cross-validation
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.likes)):

    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False)
        np.random.seed(None)

    print('#' * 25)
    print('### Fold', fold + 1)
    print('### Train size', len(train_idx), 'Valid size', len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE * 100)}% fold data...')
    print('#' * 25)

    # TRAIN, VALID, TEST FOR FOLD K
    Xy_train = IterLoadForDMatrix(train.loc[train_idx], FEATURES, 'likes')
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, 'likes']

    # Create a DeviceQuantileDMatrix for training data
    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
    
    # Create a DMatrix for validation data
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

    # TRAIN MODEL FOLD K
    model = xgb.train(xgb_parms,
                      dtrain=dtrain,
                      


#########################
### Fold 1
### Train size 236401 Valid size 59101
### Training with 100% fold data...
#########################
[0]	train-rmse:9.51717	valid-rmse:9.43799
[100]	train-rmse:4.85754	valid-rmse:4.82223
[200]	train-rmse:2.57161	valid-rmse:2.55757
[300]	train-rmse:1.59334	valid-rmse:1.58748
[400]	train-rmse:1.04133	valid-rmse:1.03997
[500]	train-rmse:0.78613	valid-rmse:0.78681
[600]	train-rmse:0.62458	valid-rmse:0.62656
[700]	train-rmse:0.51216	valid-rmse:0.51497
[800]	train-rmse:0.44851	valid-rmse:0.45183
[900]	train-rmse:0.39707	valid-rmse:0.40053
[1000]	train-rmse:0.36377	valid-rmse:0.36747
[1100]	train-rmse:0.33849	valid-rmse:0.34240
[1200]	train-rmse:0.31780	valid-rmse:0.32189
[1300]	train-rmse:0.30216	valid-rmse:0.30622
[1400]	train-rmse:0.28908	valid-rmse:0.29316
[1500]	train-rmse:0.27833	valid-rmse:0.28235
[1600]	train-rmse:0.26895	valid-rmse:0.27290
[1700]	train-rmse:0.26085	valid-rmse:0.26471
[1800]	train-rmse:0.25334	valid-rmse:0.25721
[1900]	train-rmse:0

In [11]:
train.id

0              1
1              2
2              3
3              4
4              5
           ...  
295497    299996
295498    299997
295499    299998
295500    299999
295501    300000
Name: id, Length: 295502, dtype: int64

In [12]:
# CLEAN RAM
del train
_ = gc.collect()