In [1]:
import json
import numpy as np
import os
import pandas as pd
import sklearn
import sys

# Record the interpreter and library versions this notebook was executed with,
# one per line (Python, NumPy, scikit-learn).
print(sys.version, np.__version__, sklearn.__version__, sep='\n')

3.7.10 (default, Feb 26 2021, 18:47:35) 
[GCC 7.3.0]
1.18.5
0.24.1


## LinTS GoodReads Recommendations

This notebook explores how GoodReads recommendations differ across NumPy environments. It uses the preprocessed data generated in [Goodreads Preprocessing](Goodreads%20Preprocessing.ipynb), subsampled as in [Goodreads Samples](Goodreads%20Samples.ipynb) to keep the run time manageable. All scenarios use Linear Thompson Sampling (LinTS) to generate recommendations.

In [2]:
from datetime import datetime
import json
import pandas as pd
import numpy as np
import os
import platform
import pickle
from sklearn.preprocessing import StandardScaler
import sys

from mabwiser.mab import MAB
from mabwiser.linear import _RidgeRegression, _Linear

class LinTSExample(_RidgeRegression):
    """Ridge-regression arm model that scores contexts with a sampled coefficient vector.

    Positional arguments are forwarded unchanged to ``_RidgeRegression.__init__``.

    Keyword-only arguments (stored on the instance as given):
        random_option: when equal to ``'cholesky'``, ``predict`` passes
            ``method='cholesky'`` to the sampler; any other value uses the
            sampler's default call.
        rng2: object whose ``multivariate_normal`` method draws the sample.
        arm: stored as ``self.arm``; not read anywhere in this class.
    """

    def __init__(self, *args, random_option=None, rng2=None, arm=None):
        # The sampling configuration is stored before delegating to the parent constructor.
        self.random_option = random_option
        self.rng2 = rng2
        self.arm = arm
        super().__init__(*args)

    def predict(self, x):
        """Return ``np.dot(x, beta_sampled)`` for a freshly drawn ``beta_sampled``.

        ``beta_sampled`` is drawn from ``self.rng2.multivariate_normal`` with
        mean ``self.beta`` and covariance ``self.A_inv``.  If ``self.scaler``
        is set, ``x`` is first passed through ``self._scale_predict_context``.
        """
        if self.scaler is not None:
            x = self._scale_predict_context(x)

        # Only the 'cholesky' option overrides the sampler's default call.
        sampler_kwargs = {'method': 'cholesky'} if self.random_option == 'cholesky' else {}
        beta_sampled = self.rng2.multivariate_normal(self.beta, self.A_inv, **sampler_kwargs)
        return np.dot(x, beta_sampled)
    
class LinearExample(_Linear):
    """``_Linear`` subclass whose per-arm models are ``LinTSExample`` instances."""

    # Maps the ``regression`` argument to the per-arm model class.
    factory = {"ts": LinTSExample}

    def __init__(self, rng, arms, n_jobs=1, backend=None, l2_lambda=1, alpha=1, regression='ts', arm_to_scaler = None,
                 random_option = None, rng2 = None):
        """Initialise the parent learner, then assign one model per arm.

        ``rng``, ``arms``, ``n_jobs``, ``backend``, ``l2_lambda``, ``alpha`` and
        ``regression`` are passed on to ``_Linear.__init__``.

        ``arm_to_scaler`` maps each arm to the scaler handed to its model; when
        omitted every arm gets ``None``.  ``random_option`` and ``rng2`` are
        forwarded to every per-arm model together with the arm itself.
        """
        super().__init__(rng, arms, n_jobs, backend, l2_lambda, alpha, regression)

        self.l2_lambda = l2_lambda
        self.alpha = alpha
        self.regression = regression

        self.num_features = None

        if arm_to_scaler is None:
            arm_to_scaler = dict.fromkeys(arms)

        # Create one model per arm using the class registered for ``regression``.
        model_cls = LinearExample.factory.get(regression)
        self.arm_to_model = {
            arm: model_cls(rng, l2_lambda, alpha, arm_to_scaler[arm],
                           random_option=random_option, rng2=rng2, arm=arm)
            for arm in arms
        }


def all_positive_definite(model):
    """Return True when every arm's ``A_inv`` is symmetric with strictly positive eigenvalues.

    Args:
        model: object exposing ``arm_to_model``, a dict whose values each carry
            a square ``A_inv`` array.

    Returns:
        bool: ``True`` if all ``A_inv`` matrices are symmetric (within
        ``np.allclose`` tolerance) and positive definite, ``False`` otherwise.
        A model without arms yields ``True``.
    """
    for arm_model in model.arm_to_model.values():
        a_inv = arm_model.A_inv
        # Symmetry is checked first: eigvalsh reads only one triangle of the
        # matrix, so it is meaningful only for symmetric input.
        if not np.allclose(a_inv, a_inv.T):
            return False
        # eigvalsh is the symmetric eigen-solver: it always returns real
        # eigenvalues, unlike the general eig routine, which may return complex
        # values with round-off imaginary parts.
        if not (np.linalg.eigvalsh(a_inv) > 0).all():
            return False
    return True


# Directory the sampled input files are read from.
base_path = 'output'

# Dataset 1: user features (the 'set' column marks train/test rows) and responses.
users, responses = (
    pd.read_csv(os.path.join(base_path, file_name))
    for file_name in ('sample_user_features.csv.gz', 'sample_responses.csv.gz')
)
train, test = (users.loc[users['set'] == split] for split in ('train', 'test'))

# Left join so that every training user row is kept.
train = pd.merge(train, responses, how='left', on='user_id')

# Every user column except the identifier and the split marker is a context feature.
context_features = [col for col in users.columns if col not in ('user_id', 'set')]

decisions, rewards = (MAB._convert_array(train[col]) for col in ('book_id', 'response'))
contexts, test_contexts = (
    MAB._convert_matrix(frame[context_features]).astype('float')
    for frame in (train, test)
)

# The scaler is fitted on the training contexts and then applied to the test contexts.
scaler = StandardScaler()
contexts = scaler.fit_transform(contexts)
test_contexts = scaler.transform(test_contexts)

# Arms are the distinct book ids; list() keeps the NumPy scalar elements as-is.
item_ids = list(responses['book_id'].unique())

def run_goodreads(random_option):
    """Fit ``LinearExample`` on the training data and recommend one item per test context.

    Args:
        random_option: one of
            ``'randomstate'`` - a single ``np.random.RandomState(11)`` is used
                both as ``rng`` and as the sampler ``rng2``;
            ``'svd'`` - ``rng2`` is ``np.random.default_rng(11)`` and is called
                with its default arguments;
            ``'cholesky'`` - ``rng2`` is ``np.random.default_rng(11)`` and
                ``LinTSExample.predict`` passes ``method='cholesky'``.

    Returns:
        list: for every element returned by ``predict_expectations`` the key
        with the largest value, converted to a Python scalar via ``.item()``.

    Raises:
        ValueError: if ``random_option`` is not one of the values above.
    """
    valid_options = ('randomstate', 'svd', 'cholesky')
    if random_option not in valid_options:
        # Previously an unknown option left rng/rng2 unbound and failed later
        # with an opaque UnboundLocalError.
        raise ValueError(
            f"random_option must be one of {valid_options}, got {random_option!r}"
        )

    np.random.seed(42)
    rng = np.random.RandomState(seed=11)
    # 'svd' and 'cholesky' share the same Generator setup; they differ only in
    # how LinTSExample.predict calls multivariate_normal.
    rng2 = rng if random_option == 'randomstate' else np.random.default_rng(11)
    mab = LinearExample(rng=rng, arms=item_ids, l2_lambda=1, alpha=1, regression='ts', n_jobs=1, backend=None,
                        rng2=rng2, random_option=random_option)

    # Re-seed the global state right before fitting, as the original run did.
    np.random.seed(42)
    mab.fit(decisions, rewards, contexts)
    print(f"All covariances positive definite: {all_positive_definite(mab)}")
    exps = mab.predict_expectations(test_contexts)

    # Each element of exps is used as a mapping; pick the key with the highest value.
    recs = [max(user_exps, key=user_exps.get).item() for user_exps in exps]
    return recs

### Option 1
We use the default implementation, which uses `np.random.multivariate_normal`, and set the global seed to ensure reproducibility in a single environment. Note that this is the same as using `np.random.RandomState`, as the global seed sets the random state.

In [3]:
# Option 1: one RandomState instance serves as both rng and the sampler.
randomstate_recs = run_goodreads('randomstate')
# Show how many recommendations were produced and the first ten of them.
print(len(randomstate_recs), randomstate_recs[:10], sep='\n')

All covariances positive definite: True
2691
[51738, 2914097, 2914097, 16068905, 12578077, 5996153, 10576365, 30183, 17878931, 7896527]


### Option 2
We use the new `Generator` class with default parameters, which internally uses SVD for decomposition:

In [4]:
# Option 2: the sampler is a Generator called with its default arguments.
svd_recs = run_goodreads('svd')
# Show how many recommendations were produced and the first ten of them.
print(len(svd_recs), svd_recs[:10], sep='\n')

All covariances positive definite: True
2691
[51738, 18635016, 7631105, 12000020, 4502507, 14290364, 7631105, 39988, 18460392, 7631105]


### Option 3
We use Cholesky decomposition with the new Generator class. Our hypothesis is that this will produce reproducible results across different environments.

In [5]:
# Option 3: the sampler is a Generator called with method='cholesky'.
cholesky_recs = run_goodreads('cholesky')
# Show how many recommendations were produced and the first ten of them.
print(len(cholesky_recs), cholesky_recs[:10], sep='\n')

All covariances positive definite: True
2691
[22232, 13047090, 693208, 20613470, 693208, 9593911, 23395680, 30183, 11387515, 20613470]


We save all the results for analysis.

In [6]:
# One column per sampling option; row i holds the recommendation for test context i.
recs = pd.DataFrame(dict(
    randomstate=randomstate_recs,
    svd=svd_recs,
    cholesky=cholesky_recs,
))
recs

Unnamed: 0,randomstate,svd,cholesky
0,51738,51738,22232
1,2914097,18635016,13047090
2,2914097,7631105,693208
3,16068905,12000020,20613470
4,12578077,4502507,693208
...,...,...,...
2686,22232,6442769,693208
2687,8492825,13414446,693208
2688,9520360,12000020,23395680
2689,693208,14290364,15749186


In [7]:
# Write the recommendations into base_path, the same 'output' directory the
# inputs were read from, so the read and write locations cannot drift apart.
os.makedirs(base_path, exist_ok=True)
# The file name identifies the environment (OS / BLAS build) this run came from.
recs.to_csv(os.path.join(base_path, 'linuxubuntu_openblas_recs.csv'), index=False)