In [1]:
# LOAD LIBRARIES
# CPU libraries
import pandas as pd
import numpy as np
# GPU libraries
import cupy
import cudf
# Plotting and housekeeping utilities
import matplotlib.pyplot as plt
import gc
import os

# Report which cuDF (RAPIDS) release this kernel is running.
print('RAPIDS version', cudf.__version__)



RAPIDS version 23.08.00


In [2]:
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas
import cudf
from sklearn.metrics import mean_squared_error
from math import sqrt

In [3]:
def read_file(text_embedding_path, image_embedding_path, path = '', usecols = None):
    """Load the tabular CSV on the GPU and append image/text embedding columns.

    Parameters
    ----------
    text_embedding_path : str
        Path of the ``.npy`` file holding the text embeddings.
    image_embedding_path : str
        Path of the ``.npy`` file holding the image embeddings.
    path : str
        Path of the CSV file to read.
    usecols : list, optional
        Subset of CSV columns to load; ``None`` loads every column. The
        selection must still contain ``'day of week'`` and every column in
        ``features_to_drop`` below, because both are accessed by name.

    Returns
    -------
    cudf.DataFrame
        The CSV columns (minus the dropped ones, with ``'day of week'``
        mapped to 1-7) followed by the image-embedding columns and then the
        text-embedding columns, which are named with consecutive integers.

    Raises
    ------
    ValueError
        If the CSV and the two embedding arrays do not all have the same
        number of rows.
    """
    # LOAD DATAFRAME
    # cudf.read_csv selects columns with `usecols`; `columns` is the
    # read_parquet keyword and is rejected by the CSV reader.
    if usecols is not None:
        df = cudf.read_csv(path, usecols=usecols)
    else:
        df = cudf.read_csv(path)

    # Dictionary mapping days of the week to numbers
    day_to_number = {
        'Monday': 1,
        'Tuesday': 2,
        'Wednesday': 3,
        'Thursday': 4,
        'Friday': 5,
        'Saturday': 6,
        'Sunday': 7
    }

    # Convert the 'day of week' column to numbers
    df['day of week'] = df['day of week'].map(day_to_number)

    text_embeddings = np.load(text_embedding_path)
    image_embeddings = np.load(image_embedding_path)

    # The embeddings are attached column-wise by row position. A length
    # mismatch would be padded with nulls by the concat instead of failing,
    # so reject it explicitly.
    if not (len(df) == len(image_embeddings) == len(text_embeddings)):
        raise ValueError(
            f'Row count mismatch: csv={len(df)}, '
            f'image embeddings={len(image_embeddings)}, '
            f'text embeddings={len(text_embeddings)}'
        )

    # Image embedding columns come first, then text embedding columns.
    img_emb, text_emb = pd.DataFrame(image_embeddings), pd.DataFrame(text_embeddings)
    net_emb = pd.concat([img_emb, text_emb], axis = 1)
    # Renumber so the two embedding blocks do not share column labels.
    net_emb.columns = range(len(net_emb.columns))
    net_emb = cudf.from_pandas(net_emb)
    concat_pd = cudf.concat([df, net_emb], axis = 1)

    # Columns removed before the frame is returned.
    features_to_drop = ['date', 'media', 'content_processed', 'Link','image_path', 'username', 'inferred company','Media Type']
    df = concat_pd.drop(columns = features_to_drop)
    print('shape of data:', df.shape)
    return df

In [4]:
import warnings

# Kaggle input locations: the tabular test CSV and the two embedding arrays.
TEST_PATH = '/kaggle/input/behav-company-img/dataframe/Behav_Company_img.csv'
IMG_EMB = '/kaggle/input/enet-embeds-test-company/ENet_Embeds_test_company.npy'
TEXT_EMB = '/kaggle/input/mpnet-company-dataset/MPNET_Embeds_Company.npy'

print('Reading test data...')
with warnings.catch_warnings():
    # Ignore every warning category while loading; the previous filters are
    # restored when the context manager exits.
    warnings.filterwarnings(action="ignore", category=Warning)
    test = read_file(
        text_embedding_path=TEXT_EMB,
        image_embedding_path=IMG_EMB,
        path=TEST_PATH,
    )

# Distinct ids, ordered by row index, as a flat array.
customers = (
    test[['id']]
    .drop_duplicates()
    .sort_index()
    .values
    .flatten()
)

Reading test data...
shape of data: (10000, 1773)


In [5]:
# FEATURES
# Every column of the loaded frame is used as a model input.
# NOTE(review): this includes 'id' — confirm the saved models were trained with it.
FEATURES = test.columns[:]
feature_count = len(FEATURES)
print(f'There are {feature_count} features!')

There are 1773 features!


In [6]:
# Number of fold models loaded and averaged at inference time.
FOLDS = 5
# Version tag embedded in the saved-model and submission file names.
VER = 1

In [7]:
test_preds = []

# TEST DATA FOR XGB
dtest = xgb.DMatrix(data=test[FEATURES])
gc.collect()

# INFER XGB MODELS ON TEST DATA
# One Booster object is reused; each load_model call replaces its contents.
model = xgb.Booster()
preds = None
for fold in range(FOLDS):
    model.load_model(f'/kaggle/input/xgboost-saved-models/XGB_v{VER}_fold{fold}.xgb')
    # exp() is applied to each fold's raw output before averaging
    # (presumably the models predict a log-scale target — TODO confirm).
    fold_preds = np.exp(model.predict(dtest))
    if preds is None:
        preds = fold_preds
    else:
        preds += fold_preds
# Arithmetic mean of the exponentiated fold predictions.
preds /= FOLDS
test_preds.append(preds)

# CLEAN MEMORY
del dtest, model
_ = gc.collect()

In [8]:
# Sanity check: number of prediction arrays collected in `test_preds`.
len(test_preds)

1

In [9]:
# WRITE SUBMISSION FILE
# Join the list of prediction arrays into one flat vector.
test_preds = np.concatenate(test_preds)
# NOTE: this rebinds `test`; from here on it is the prediction frame indexed by id.
test = cudf.DataFrame(data={'prediction': test_preds}, index=customers)

# NOTE(review): index=False leaves the id index out of the CSV, so only the
# 'prediction' column is written — confirm this matches the required format.
submission_path = f'task01_submission_xgb_v{VER}_Company_ENet.csv'
test.to_csv(submission_path, index=False)

# DISPLAY PREDICTIONS
print('Submission file shape is', test.shape )
test.head(10000)

Submission file shape is (10000, 1)


Unnamed: 0,prediction
1,1.983737
2,6.404223
3,16.602894
4,57.068153
5,139.834198
...,...
9996,115365.000000
9997,121460.148438
9998,151062.234375
9999,148683.406250
