# Train XGBoost on 256 PCA components

In [1]:
from os.path import join

import xgboost as xgb
import numpy as np
import dask.dataframe as dd

In [2]:
# Root directory of the dataset; the PCA feature arrays, the parquet splits and
# the class-weight file used in this notebook are all resolved relative to it.
DATA_PATH = '/mnt/dssmcmlfs01/merlin_cxg_2023_05_15_sf-log1p'

In [3]:
# Features: pre-computed PCA projections (files named *_256.npy) for the
# training and validation splits.
x_train = np.load(join(DATA_PATH, 'pca/x_pca_training_train_split_256.npy'))
x_val = np.load(join(DATA_PATH, 'pca/x_pca_training_val_split_256.npy'))

# Labels: only the 'cell_type' column is read from each parquet split, then
# materialised from dask into an in-memory NumPy array.
y_train = (
    dd.read_parquet(join(DATA_PATH, 'train'), columns='cell_type')
    .compute()
    .to_numpy()
)
y_val = (
    dd.read_parquet(join(DATA_PATH, 'val'), columns='cell_type')
    .compute()
    .to_numpy()
)

# NOTE(review): the next cell reloads this file and rebinds `class_weights`
# to a dict, so this load looks redundant -- confirm before removing.
class_weights = np.load(join(DATA_PATH, 'class_weights.npy'))

In [4]:
# Per-class weights as stored on disk; position i is used as the weight of
# class label i.
class_weight_values = np.load(join(DATA_PATH, 'class_weights.npy'))
# Label -> weight mapping, kept under its original name and type (dict).
class_weights = dict(enumerate(class_weight_values))

# Fail loudly on labels that have no weight: plain array indexing would
# silently wrap negative labels around to the end of the weight array.
# Assumes y_train holds integer class labels -- TODO confirm.
if y_train.size and not (
    0 <= y_train.min() and y_train.max() < len(class_weight_values)
):
    raise KeyError('y_train contains labels without an entry in class_weights.npy')

# One weight per training sample, gathered with a single vectorized lookup
# instead of a Python-level loop doing one dict access per sample.
weights = class_weight_values[y_train]

In [None]:
# Configure the gradient-boosted tree classifier.
# NOTE(review): `tree_method='gpu_hist'` and `gpu_id` are deprecated in
# XGBoost >= 2.0 in favour of `tree_method='hist', device='cuda'` -- confirm
# the installed version before migrating.
clf = xgb.XGBClassifier(
    # Model capacity and boosting schedule.
    n_estimators=1000,
    max_depth=10,
    eta=0.075,
    subsample=0.75,
    # Stopping criterion; evaluated against the `eval_set` passed to fit().
    early_stopping_rounds=10,
    # Hardware / parallelism settings.
    tree_method='gpu_hist',
    gpu_id=0,
    n_jobs=20,
)

# Train with per-sample weights; the validation split is supplied as the
# evaluation set.
clf = clf.fit(
    x_train,
    y_train,
    sample_weight=weights,
    eval_set=[(x_val, y_val)],
)

In [6]:
# Persist the fitted classifier; the path is relative, so the file is written
# to the notebook's current working directory.
clf.save_model('model.json')