# Imports

In [7]:
import os
import zipfile
import joblib
import pandas as pd

import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler

from src.config import PATH, DATA_DIR, MODELS_DIR
from src.data.create_dataset import Dataset, DatasetComplete
from src.data.process_data import (save_processed_datasets_to_feather,
                                   load_processed_datasets_from_feather)

# archive holding the raw data; extracted into DATA_DIR/'raw' by the unzip cell
ZIP_PATH = DATA_DIR / 'purchaseprediction_fulldata.zip'
# directory checked for the processed .feather files
INTERIM_DATA_DIR = DATA_DIR / 'interim'
# seed passed to Dataset and RandomOverSampler
SEED = 42

# selects the GPU or CPU values of tree_method / sampling_method when training
GPU_AVAILABLE = False
# True: skip resampling and training, and load the saved model from MODELS_DIR instead
LOAD_PRETRAINED = True

# Unzip the Data

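This step expects the zipped data "purchaseprediction_fulldata.zip" to be in the ProductPrediction/data directory. The archive is extracted into the `raw` subdirectory unless the raw files are already there.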

In [None]:
# base names (without extension) of the raw tables expected after extraction
raw_fnames = ['customers', 'labels_predict', 'labels_training',
              'products', 'purchases', 'views']

raw_dir = DATA_DIR/'raw'

# raw tables whose .txt file is not on disk yet
missing_raw = [fname for fname in raw_fnames
               if not os.path.exists(raw_dir/f'{fname}.txt')]

if missing_raw:
    # at least one file is absent: extract the whole archive into 'raw'
    assert ZIP_PATH.exists()
    if not os.path.exists(raw_dir):
        os.mkdir(raw_dir)
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(raw_dir)
else:
    print('already unzipped')

# Create and Save Training Dataset

Preprocess the raw datasets, imputing missing values and reducing data usage. Then save all to feather format for faster loading.

In [6]:
# the processed feathers are only rebuilt when at least one of them is missing
feathers_present = all(
    os.path.exists(INTERIM_DATA_DIR/f'{fname}.feather') for fname in raw_fnames)

if not feathers_present:
    # create and save the processed datasets
    save_processed_datasets_to_feather()
else:
    print('Processed feathers already exist')

To build the final model we train on the full dataset without cross validation.

In [None]:
# initialise dataset object
data = Dataset(random_seed=SEED)

# cached locations of the fully processed train / valid tables
train_fpath = DATA_DIR/'processed'/'train.feather'
valid_fpath = DATA_DIR/'processed'/'valid.feather'

# if data has already been processed, load from file
# (logical `and` short-circuits; bitwise `&` is meant for element-wise/bit operations)
if train_fpath.exists() and valid_fpath.exists():
    data.load_datasets_from_file(train_fpath, valid_fpath)

# otherwise create the datasets:
# load the interim feather tables, then let the Dataset object build the
# train and valid sets from them
else:
    (customer_df,
     product_df,
     purchase_df,
     views_df,
     labels_training_df,
     labels_testing_df) = load_processed_datasets_from_feather()

    data.create_train_valid_datasets(
        labels_training_df, labels_testing_df,
        customer_df, purchase_df, product_df, views_df)

    # save the resulting datasets so the next run can take the fast path above
    data.save_datasets(train_fpath, valid_fpath)

    # drop the notebook's references to the intermediate tables
    del (labels_training_df, labels_testing_df, customer_df, purchase_df, product_df, views_df)

# Train Best Model

To combat class imbalance we use Random Over Sampling on the training data. We do not need to standardise our data as XGBoost is tree based, so is not affected by feature scale.

In [None]:
if not LOAD_PRETRAINED:
    sampler = RandomOverSampler(random_state=SEED)

    # share of the positive class before resampling (about 0.02);
    # needed later to calibrate the predicted probabilities
    true_ratio = data.train.purchased.mean()

    # NOTE: data.train is overwritten with the resampled data, so running this
    # cell a second time would measure true_ratio on already-resampled data
    data.train, _ = sampler.fit_resample(data.train, data.train.purchased)

    # share of the positive class after resampling (0.5); also needed for calibration
    wrong_ratio = data.train.purchased.mean()

Create and train a model using the best hyperparameters found.

In [None]:
if not LOAD_PRETRAINED:
    # GPU and CPU runs use different tree_method / sampling_method values
    if GPU_AVAILABLE:
        tree_method, sampling_method = 'gpu_hist', 'gradient_based'
    else:
        tree_method, sampling_method = 'hist', 'uniform'

    # best hyperparameters found
    params = {
        'learning_rate': 0.01,
        'n_estimators': 2000,
        'sampling_method': sampling_method,
        'eval_metric': 'logloss',
        'subsample': 0.8,
        'tree_method': tree_method,
        'colsample_bynode': 0.8,
        'colsample_bytree': 0.8,
        'use_label_encoder': False,
        'reg_alpha': 100,
        'max_depth': 6,
    }

    model = xgb.XGBClassifier(**params)

    # fit on every column except the target, which is passed separately
    model.fit(data.train.drop(columns=['purchased']), data.train.purchased)

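Optionally save this trained model by uncommenting the cell below. This only applies when `LOAD_PRETRAINED` is `False`; otherwise no model has been trained at this point.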

In [None]:
# save_path = MODELS_DIR/'trained_xgboost.joblib'
# joblib.dump(model, save_path)

# Load a Pretrained Model

Alternatively load the already trained model.

In [None]:
if LOAD_PRETRAINED:
    # joblib files are pickles: only load model files from a trusted source
    pretrained_path = MODELS_DIR/'final_xgboost.joblib'
    model = joblib.load(pretrained_path)

# Make Predictions

Use the fitted model to make predictions on the testing set.

In [None]:
# class probabilities for the test set, using every column except the target placeholder
class_probs = model.predict_proba(data.valid.drop(columns=['purchase_probability']))

# keep the second column of the probability matrix as the prediction
y_pred = class_probs[:, 1]

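Since we are predicting probabilities and not classes, we need to calibrate our model outputs: the model was trained on oversampled data with a positive-class ratio of 0.5 instead of the original 0.02, so its raw probabilities have to be adjusted back to the original class ratio. [Derivation of class adjustment formula](/ProductRecommendation/reports/Adjustment%20formula%20derivation.ipynb)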

In [None]:
def classadjust(condprobs, wrong_ratio, true_ratio):
    """Adjust predicted probabilities for a changed class ratio (calibration).

    Re-weights the positive-class and negative-class probabilities by the
    ratio of the true class prior to the prior the model was trained on,
    then renormalises so the two sum to one.

    Parameters
    ----------
    condprobs : float or array-like supporting arithmetic
        Positive-class probabilities predicted by the model.
    wrong_ratio : float
        Positive-class ratio of the data the model was trained on.
        Must lie strictly between 0 and 1.
    true_ratio : float
        Positive-class ratio of the original data.

    Returns
    -------
    Same type as ``condprobs``
        The adjusted positive-class probabilities.

    Raises
    ------
    ZeroDivisionError
        For plain-float inputs when ``wrong_ratio`` is 0 or 1, or when both
        re-weighted terms are zero.
    """
    # Prior ratios are formed as true/wrong so that a true_ratio of exactly
    # 0 or 1 gives a weight of 0 rather than a division by zero.
    pos_weight = true_ratio / wrong_ratio
    neg_weight = (1 - true_ratio) / (1 - wrong_ratio)

    a = condprobs * pos_weight
    b = (1 - condprobs) * neg_weight
    return a / (a + b)

# with a pretrained model the resampling cell is skipped, so the two class
# ratios it would have computed are supplied as fixed values instead
if LOAD_PRETRAINED:
    true_ratio, wrong_ratio = 0.020212026484729476, 0.5

y_pred_adj = classadjust(y_pred, wrong_ratio=wrong_ratio, true_ratio=true_ratio)

Now that we have our calibrated predictions, we can write the results to a file.

In [None]:
# start from the interim labels_predict table and fill in the predictions
submission = pd.read_feather(INTERIM_DATA_DIR/'labels_predict.feather')

# Bracket assignment always writes a column. Attribute-style assignment would
# only set a Python attribute (absent from the CSV) if the column were missing.
# NOTE(review): assumes data.valid rows are in the same order as labels_predict - confirm
submission['purchase_probability'] = y_pred_adj

submission.to_csv('SUBMISSION.csv', index=False)

# Cross Validation (optional)

If doing cross validation for model comparison the data in each fold needs to be processed separately.

```python
# number of cross-validation folds; also part of the fold-assignment file name below
n_folds = 3

# create dataset - load preprocessed from files
data = DatasetComplete(random_seed=SEED)
data.load_datasets()
# presumably reads a saved fold assignment for this n_folds - verify against DatasetComplete
data.assign_folds(load_from_path=DATA_DIR/'interim'/f'cv_folds_{n_folds}.npy')
# NOTE(review): despite its name, save_filepath looks like the directory the folds are read from - confirm
data.load_nfolds_from_files(n_folds=n_folds, save_filepath=DATA_DIR/'processed'/'Extra features')

# resample all the folds individually - using sampler
# (one sampler instance is reused; each fold's train set is replaced by its resampled version)
sampler = RandomOverSampler(random_state=SEED)
for i in range(n_folds):
    data.folds_data[i].train, _ = sampler.fit_resample(
        data.folds_data[i].train, data.folds_data[i].train.purchased)
```

We then train a model on each fold. The training data for fold <code>i</code> can be accessed through <code>data.folds_data[i].train</code>