In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import pdist, squareform, cdist
from scipy.ndimage import gaussian_filter1d
from scipy.optimize import nnls
from collections import defaultdict

#from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

from lenskit.datasets import ML100K, MovieLens
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als
from lenskit import topn
from lenskit.metrics.predict import rmse, mae

import trecs
from trecs.validate import validate_user_item_inputs
from trecs.models import ContentFiltering, PopularityRecommender, ImplicitMF, SocialFiltering
from trecs.components import Users, Items, Creators
from trecs.metrics import HomogeneityMeasurement, InteractionSimilarity, Measurement
from trecs.matrix_ops import normalize_matrix

In [9]:
# Chaney ContentFiltering - uses NNLS solver
class ChaneyContent(ContentFiltering):
    """Content filtering model whose user representation is refit with NNLS.

    The model keeps a running count of user-item interactions in
    ``cumulative_interactions`` and, on every call to :meth:`train`, solves a
    non-negative least squares problem per user to obtain ``users_hat`` from
    the item attributes in ``items_hat``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _update_internal_state(self, interactions):
        """Increment the interaction count of each (user, chosen item) pair.

        ``interactions`` is used as the column index paired with
        ``self.users.user_vector`` as the row index.
        """
        self.cumulative_interactions[self.users.user_vector, interactions] += 1

    def train(self):
        """Refit every user's representation, then defer to the parent ``train``.

        If no interactions have been recorded yet, the count matrix is
        (re)initialised to zeros and the user representation is left alone.
        Otherwise each user row of ``users_hat`` is replaced by the NNLS
        solution against that user's cumulative interaction counts.
        """
        has_history = (
            hasattr(self, 'cumulative_interactions')
            and self.cumulative_interactions.sum() > 0
        )
        if has_history:
            # Only items that already have a column in the count matrix can be
            # trained on: newly created items have no interactions yet.
            items_to_train = self.cumulative_interactions.shape[1]
            item_attr = self.items_hat[:, :items_to_train].T
            for i in range(self.num_users):
                # solve for the content filtering representation using NNLS
                self.users_hat[i, :] = nnls(item_attr, self.cumulative_interactions[i, :])[0]
            # Grow the count matrix once, AFTER all users are fit. Doing this
            # inside the loop made the first user train on the old item set
            # while every later user trained on the expanded one.
            num_new_items = self.items_hat.shape[1] - items_to_train
            if num_new_items > 0:
                self.cumulative_interactions = np.hstack(
                    [self.cumulative_interactions, np.zeros((self.num_users, num_new_items))]
                )
        else:
            self.cumulative_interactions = np.zeros((self.users_hat.shape[0], self.items_hat.shape[1]))
        super().train()
        
# random recommender - randomly update users at every step
class RandomRecommender(ContentFiltering):
    """Recommender whose user and item representations are redrawn uniformly
    at random after every step."""

    def _update_internal_state(self, interactions):
        """Overwrite ``users_hat`` and then ``items_hat`` in place with fresh
        uniform random values; ``interactions`` is ignored."""
        user_rows, user_cols = self.users_hat.shape
        self.users_hat[:, :] = np.random.rand(user_rows, user_cols)
        item_rows, item_cols = self.items_hat.shape
        self.items_hat[:, :] = np.random.rand(item_rows, item_cols)

    def process_new_items(self, new_items):
        """
        Append uniformly random attribute columns to ``items_hat``, one
        column per column of ``new_items``.
        """
        added_count = new_items.shape[1]
        attr_count = self.items_hat.shape[0]
        random_attrs = np.random.rand(attr_count, added_count)
        self.items_hat = np.hstack([self.items_hat, random_attrs])

# basically just recommends items based on the estimates of user preferences!
# this will form the basis of our "ideal" recommender
class IdealRecommender(ContentFiltering):
    """Recommender that never updates its user representation after
    interactions."""

    def _update_internal_state(self, interactions):
        """Intentionally a no-op: ``users_hat`` must stay untouched."""
        return None

    def process_new_items(self, new_items):
        """
        Append uniformly random attribute columns to ``items_hat``, one per
        column of ``new_items``.

        NOTE(review): the previous docstring said zero attributes are
        generated, but the code draws random ones. It also stated that the
        values do not matter because this recommender relies on its perfect
        score function -- confirm which behaviour is intended.
        """
        added_count = new_items.shape[1]
        attr_count = self.items_hat.shape[0]
        placeholder_attrs = np.random.rand(attr_count, added_count)
        self.items_hat = np.hstack([self.items_hat, placeholder_attrs])
    

    
class SimilarUserInteractionSimilarity(Measurement):
    """
    Keeps track of the average Jaccard similarity between the items
    interacted with by pairs of users at each timestep. Each pair consists of
    a user and that user's nearest neighbour (by cosine similarity) in the
    recommender's predicted user representation, so no pairs need to be
    supplied by the caller.

    Parameters
    -----------
        name: str (optional, default: "similar_user_jaccard")
            Name of the measurement component.

        verbose: bool (optional, default: False)
            If True, enables verbose mode. Disabled by default.

    Attributes
    -----------
        Inherited by Measurement: :class:`.Measurement`

        interaction_hist: :obj:`numpy.ndarray` or None
            One row per user and one column per measured timestep, holding
            the value passed in ``interactions`` at that timestep.
    """

    def __init__(self, name="similar_user_jaccard", verbose=False):
        self.interaction_hist = None
        Measurement.__init__(self, name, verbose, init_value=None)

    def measure(self, recommender, **kwargs):
        """
        Measures the average Jaccard index of the interaction histories of
        pairs of users, where each pair consists of a user and the user most
        similar to that user according to the system's internal
        representation (``recommender.users_hat``).

        Parameters
        ------------
            recommender: :class:`~models.recommender.BaseRecommender`
                Model that inherits from
                :class:`~models.recommender.BaseRecommender`.

            **kwargs
                Keyword arguments, one of which must be `interactions`, an
                array with one entry per user; it is reshaped into a column
                and appended to the interaction history.

        Raises
        ------------
            ValueError
                If `interactions` is not supplied.
        """
        interactions = kwargs.pop("interactions", None)
        if interactions is None:
            raise ValueError(
                "interactions must be passed in to SimilarUserInteractionSimilarity's "
                "`measure` method as a keyword argument"
            )

        # np.array copies, so the history never aliases the caller's array
        latest = np.array(interactions).reshape((-1, 1))
        if self.interaction_hist is None:
            self.interaction_hist = latest
        else:
            self.interaction_hist = np.hstack([self.interaction_hist, latest])

        # generate cosine similarity matrix for all users
        sim_matrix = cosine_similarity(recommender.users_hat, recommender.users_hat)
        num_users = sim_matrix.shape[0]
        # Exclude self-matches. Cosine similarity can be negative, so the
        # diagonal must be -inf (not 0); otherwise a user whose similarity to
        # everyone else is negative would be paired with themselves.
        np.fill_diagonal(sim_matrix, -np.inf)
        # array where element x at index i represents the "most similar" user to user i
        closest_users = np.argmax(sim_matrix, axis=1)

        # build each user's item set once instead of once per pair
        item_sets = [set(user_history) for user_history in self.interaction_hist]

        # calculate average jaccard similarity
        similarity = 0
        for user1, user2 in enumerate(closest_users):
            common = len(item_sets[user1] & item_sets[user2])
            union = len(item_sets[user1] | item_sets[user2])
            similarity += common / union / num_users
        self.observe(similarity)
        

def gen_social_network(user_prefs):
    """ Generates a |U|x|U| social network of connections
        as specified in Chaney et al.

        The covariance between every pair of rows of `user_prefs` is
        computed, and candidate thresholds are tried from largest to
        smallest until every user has at least one entry in their row that
        meets the threshold.

        Parameters
        -----------
            user_prefs: array with one row per user.

        Returns
        -----------
            Integer 0/1 adjacency matrix of shape (|U|, |U|).

        Raises
        -----------
            RuntimeError: if no candidate threshold connects every user.
    """
    user_cov = np.atleast_2d(np.cov(user_prefs))
    # derive the user count from the input rather than from a notebook global
    num_users = user_cov.shape[0]
    possible_thresholds = np.sort(user_cov.flatten())[::-1]
    # NOTE(review): the |U| largest values are skipped, presumably to step
    # over the diagonal (self-variance) entries -- confirm; they are not
    # guaranteed to be the largest values in the matrix.
    for thresh in possible_thresholds[num_users:]:
        connected = user_cov >= thresh
        if connected.any(axis=1).sum() == num_users:
            return connected.astype(int) # final adjacency matrix
    raise RuntimeError("Could not find a suitable threshold.")

    
def mu_sigma_to_alpha_beta(mu, sigma):
    """ Convert the (mu, sigma) parameterisation of Chaney's custom Beta'
        function into the alpha and beta parameters of a Beta
        distribution. See footnote 3 page 3 of Chaney et al. for details.

        `sigma` is squared before use, so it acts as a standard deviation.
        Both arguments may be scalars or numpy arrays.

        Returns the tuple (alpha, beta).
    """
    variance = sigma**2
    inverse_mu = 1/mu
    alpha = ((1-mu) / variance - inverse_mu) * mu**2
    beta = alpha * (inverse_mu - 1)
    return alpha, beta

def perfect_scores_first_items(max_items, true_scores):
    """ Build a custom scoring function that returns the "true scores" for
        the first `max_items` items and negative infinity for every item
        created after that point, ensuring those later items land at the
        very bottom of any recommendation list.

        Parameters
        -----------
            max_items: number of leading item columns that keep their score.
            true_scores: 2D array of scores, one column per item.

        Returns
        -----------
            A function accepting any positional arguments (all ignored) and
            returning a fresh array of masked scores on every call.

        The caller's `true_scores` array is never modified.
    """
    # Work on a private copy: writing -inf through an alias of `true_scores`
    # would silently corrupt the caller's utilities.
    masked_scores = np.array(true_scores)
    if not np.issubdtype(masked_scores.dtype, np.floating):
        # -inf cannot be stored in a non-floating array
        masked_scores = masked_scores.astype(float)
    # all predicted scores for these "new" items will be negative infinity,
    # ensuring they never get recommended
    masked_scores[:, max_items:] = float('-inf')

    def score_fn(*args):
        # hand out a copy so downstream mutation cannot leak into later calls
        return masked_scores.copy()
    return score_fn

# utility function to extract measurement
def process_measurement(model, metric_string):
    """Return the recorded values of the metric named `metric_string` from
    `model.get_measurements()`, dropping the first entry."""
    recorded = model.get_measurements()
    history = recorded[metric_string]
    return history[1:]



In [56]:
# NOTE(review): scratch arithmetic with undocumented magic numbers -- TODO
# explain what 9400 and .2288 represent, or delete this cell.
9400*.2288

2150.7200000000003

In [11]:
# generate synthetic users, items and utilities
# All draws go through one seeded generator so that Restart & Run All
# reproduces the same synthetic data (it was previously created but unused).
SEED = 42
generator = np.random.default_rng(SEED)
# parameters from section 5 of paper
num_users = 100
num_items = 10000
num_attrs = 20
num_sims = 1

# one row of Dirichlet concentration parameters per simulation
user_params = generator.dirichlet(np.ones(num_attrs), size=num_sims) * 10
item_params = generator.dirichlet(np.ones(num_attrs) * 100, size=num_sims) * 0.1
# do conversion from paper to figure out the utility percentage visible to users
mu_n = 0.98
sigma = 1e-5

# each element in these lists holds the data for one simulation
users = []
items = []
true_utilities = []
known_util_perc = []
known_utils = []
social_networks = []
for sim_index in range(num_sims):
    # generate user preferences and item attributes
    user_prefs = generator.dirichlet(user_params[sim_index, :], size=num_users) # (num_users, num_attrs)
    item_attrs = generator.dirichlet(item_params[sim_index, :], size=num_items) # (num_items, num_attrs)

    # mean of the utility distribution
    true_utils_mu = user_prefs @ item_attrs.T
    true_utils_mu = np.clip(true_utils_mu, 1e-9, None) # avoid numerical stability issues
    # sample total utility from a beta distribution
    alphas, betas = mu_sigma_to_alpha_beta(true_utils_mu, sigma)
    true_utils = generator.beta(alphas, betas) # elementwise draw, shape (num_users, num_items)
    # assert support of true utilities; should be within 0 and 1
    assert true_utils.min() >= 0 and true_utils.max() <= 1

    # calculate known utility for each user
    alpha, beta = mu_sigma_to_alpha_beta(mu_n, sigma) # parameters for beta function governing percentage of utility known to users
    perc_known = generator.beta(alpha, beta, size=(num_users, num_items))
    known_util = true_utils * perc_known

    # add all synthetic data to list
    users.append(user_prefs)
    social_networks.append(gen_social_network(user_prefs))
    items.append(item_attrs)
    true_utilities.append(true_utils)
    known_util_perc.append(perc_known)
    known_utils.append(known_util)


# print shape
print(users[0].shape) # we should see num_users users with num_attrs attributes
print(items[0].shape) # we should see num_items items with num_attrs attributes
print(true_utilities[0].shape)
print(known_utils[0].shape)

(100, 20)
(10000, 20)
(100, 10000)
(100, 10000)


In [24]:
model_keys = ["ideal", "content_chaney", "mf", "sf", "popularity", "random"]
# stores results for each type of model for each type of user pairing (random or cosine similarity)
rep_train_results = {"sim_users": defaultdict(list), "random_users": defaultdict(list)}

startup_iters = 10
sim_iters = 190
# simpler way to pass common arguments
init_params = {
    "num_items_per_iter": 100
}

run_params = {
    "train_between_steps": False,
    "random_items_per_iter": 10,
    "vary_random_items_per_iter": False,
    "timesteps": sim_iters,
    "repeated_items": False
}

for i in range(num_sims):
    true_prefs = users[i] # underlying true preferences
    true_scores = true_utilities[i]
    # use this simulation's known utilities (was hard-coded to index 0, which
    # paired every simulation with the first simulation's noisy scores)
    noisy_scores = known_utils[i]
    item_representation = items[i].T

    # generate random pairs for evaluating jaccard similarity
    pairs = [np.random.choice(num_users, 2, replace=False) for _ in range(800)]

    # each user interacts with items based on their (noisy) knowledge of their own scores
    # user choices also depend on the order of items they are recommended
    u = Users(actual_user_scores = noisy_scores, size=(num_users, num_attrs), num_users=num_users, attention_exp=-0.8) 
    # recommender has the ideal item representation and ideal user representation
#     ideal_rec = IdealRecommender(user_representation=true_prefs, item_representation=item_representation, actual_user_representation=u, score_fn=perfect_score_fn(true_scores), **init_params)
#     ideal_rec.add_metrics(InteractionSimilarity(pairs), SimilarUserInteractionSimilarity())
#     ideal_rec.startup_and_train(timesteps=startup_iters)
#     ideal_rec.run(**run_params)
    
#     # chaney content filtering
#     chaney = ChaneyContent(item_representation=item_representation, actual_user_representation=u, **init_params)
#     chaney.add_metrics(InteractionSimilarity(pairs), SimilarUserInteractionSimilarity())
#     chaney.startup_and_train(timesteps=startup_iters)
    
#     chaney.run(**run_params)
    
    # matrix factorization
    mf = ImplicitMF(actual_item_representation=item_representation, actual_user_representation=u, **init_params, num_latent_factors=20)
    mf.add_metrics(InteractionSimilarity(pairs), SimilarUserInteractionSimilarity())
    mf.startup_and_train(timesteps=startup_iters)
    
    mf.run(reset_interactions=False, **run_params)   
    # social filtering
    # translate covariance matrix into social network (enforces homophily)            
#     sf = SocialFiltering(user_representation=social_networks[i], actual_item_representation=item_representation, actual_user_representation=u, **init_params)
#     sf.add_metrics(InteractionSimilarity(pairs), SimilarUserInteractionSimilarity())
#     sf.startup_and_train(timesteps=startup_iters)
#     sf.run(**run_params)

#     # popularity model
#     p = PopularityRecommender(actual_item_representation=item_representation, actual_user_representation=u, num_items=num_items, num_users=num_users, **init_params)
#     p.add_metrics(InteractionSimilarity(pairs), SimilarUserInteractionSimilarity())
#     p.startup_and_train(timesteps=startup_iters)
#     p.run(**run_params)
    
#     # random recommender
#     r = RandomRecommender(item_representation=item_representation, actual_user_representation=u, num_items=num_items, num_users=num_users, **init_params)
#     r.add_metrics(InteractionSimilarity(pairs), InteractionSimilarity(pairs, name="similar_user_jaccard")) # random pairing of users
#     r.startup_and_train(timesteps=startup_iters)
#     r.run(**run_params)

100%|██████████| 10/10 [00:00<00:00, 51.57it/s]
100%|██████████| 190/190 [01:21<00:00,  2.33it/s]


In [37]:
# Tabulate the user representation learned by the T-RecS ImplicitMF model
# and preview up to the first 30 rows.
user_features_trecs = pd.DataFrame(data=np.array(mf.users_hat))
user_features_trecs.head(n=30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.233727,0.586008,0.531369,0.459478,0.369985,0.282958,0.227706,0.488946,0.415574,0.570607,0.207428,0.239605,0.436981,0.307556,0.433864,0.943647,0.495035,0.23762,0.248883,0.132991
1,0.134276,0.121072,0.21905,0.258779,0.445026,0.691336,0.799857,0.350968,0.357942,0.28534,0.591776,0.411416,0.291154,0.379394,0.497213,0.52906,0.159543,0.258023,0.38814,0.392442
2,0.433166,0.419481,0.460441,0.372561,0.533204,0.572539,0.491059,0.441533,0.127092,0.589199,0.166948,-0.011296,0.377082,0.410812,0.672253,0.296302,0.410625,0.345755,-0.016455,0.586131
3,0.239801,0.62874,0.356082,0.464571,0.581694,0.164922,0.448232,0.380023,0.254949,0.52299,0.642544,0.503843,0.211244,0.49255,0.214883,0.305297,0.075799,0.355816,0.233291,0.185995
4,0.143583,0.645941,0.471519,0.378099,0.349032,0.426958,0.150784,0.286366,0.426371,0.138303,0.657126,0.450981,0.496337,0.679094,0.412909,0.368568,0.769501,0.534139,0.534653,0.349806
5,0.291666,0.482328,0.349315,0.695899,0.606811,0.473556,0.1208,0.04533,0.182233,0.286032,0.331812,0.528089,0.388246,0.386401,0.237589,0.570662,0.604405,0.418381,0.285274,0.52709
6,0.72773,0.442358,0.250828,0.683509,0.545659,0.411957,-0.025606,0.206802,0.612035,0.272257,0.411016,0.373218,0.247021,0.410287,0.220707,0.546108,0.472165,0.423542,0.355456,0.636027
7,0.142664,0.588627,0.330691,0.21377,0.355483,0.723647,0.183263,0.618859,0.226155,0.557694,0.057165,0.48996,0.001614,0.453015,0.317279,0.281921,0.585083,0.312423,0.253266,0.596669
8,0.076145,0.285646,0.154074,0.343275,0.579681,0.182282,0.254645,0.566886,0.140611,0.683262,0.456119,0.479497,0.430818,0.523143,0.228613,0.331441,0.244901,0.388928,0.298478,0.135962
9,0.485481,0.424224,0.277404,0.696979,0.535777,0.242014,0.257241,0.537106,0.791696,0.27442,0.1468,0.399406,0.146353,0.574217,-0.00135,0.45224,0.384829,0.507351,0.638051,0.556212


In [55]:
MIN_INTERACTIONS = 30  # users with fewer ratings than this are dropped

mlsmall = MovieLens('../data/ml-latest-small')
# Derive the per-user columns on a new frame (`assign` returns a copy) so the
# DataFrame handed out by `mlsmall.ratings` is not modified in place.
ratings = mlsmall.ratings.assign(
    # chronological rank of each rating within its user (ties share the lowest rank)
    timerank=lambda df: df.groupby('user')['timestamp'].transform('rank', method='min'),
    # number of ratings made by the user
    interaction_count=lambda df: df.groupby('user')['user'].transform('count'),
    # highest timerank reached by the user
    max_timerank=lambda df: df.groupby('user')['timerank'].transform('max'),
)
# .copy() gives later cells an independent frame they can add columns to
ratings = ratings[ratings['interaction_count'] >= MIN_INTERACTIONS].copy()

# TODO: time-based train/test split (unfinished)
#train_ratings = ratings[ratings['timerank']<=20]
#test_ratings = ratings[ratings['timerank']]


ratings.sort_values(['user', 'timerank'], ascending=False).head(30)

Unnamed: 0,user,item,rating,timestamp,timerank,interaction_count,max_timerank
99775,610,3917,4.0,1495959411,1302.0,1302,1302.0
99681,610,2459,3.5,1495959405,1301.0,1302,1302.0
99556,610,328,3.5,1495959299,1300.0,1302,1302.0
99540,610,70,4.0,1495959282,1299.0,1302,1302.0
100612,610,101739,3.5,1495959269,1298.0,1302,1302.0
99751,610,3727,4.0,1495959265,1297.0,1302,1302.0
99693,610,2901,4.0,1495959241,1296.0,1302,1302.0
99745,610,3696,3.5,1495959233,1295.0,1302,1302.0
100355,610,63826,4.0,1495959194,1294.0,1302,1302.0
99703,610,3016,4.5,1495959190,1293.0,1302,1302.0


In [46]:
# Use high ratings (4 and above) as a proxy for interactions and keep only
# the user and item columns.
ratings = ratings.loc[ratings['rating'] >= 4, ['user', 'item']]
ratings.shape

(48580, 2)

In [47]:
# Count each user's remaining interactions after the rating filter.
# `assign` returns a new frame, avoiding a write into a filtered slice.
ratings = ratings.assign(
    interaction_count=ratings.groupby('user')['user'].transform('count')
)
# Preview the frame. The previous `ratings = ratings['interaction_count']`
# replaced the DataFrame with a single Series, which dropped the user/item
# columns that the ALS fit below needs.
ratings.head()

Unnamed: 0,user,item,interaction_count
0,1,1,200
1,1,3,200
2,1,6,200
3,1,47,200
4,1,50,200


In [35]:
# Fit LensKit's ImplicitMF (from lenskit.algorithms.als) on the filtered ratings.
# NOTE(review): the positional 10 is presumably the number of latent features
# (the user-feature table shown further down has 10 columns), whereas the T-RecS
# ImplicitMF above uses num_latent_factors=20 -- confirm the mismatch is intended.
algo_als = als.ImplicitMF(10)
algo_als.fit(ratings)


<lenskit.algorithms.als.ImplicitMF at 0x7ffc845e0190>

In [36]:
# Tabulate the user feature matrix learned by the LensKit ALS model and
# preview its first five rows.
user_features_ml = pd.DataFrame(data=algo_als.user_features_)
user_features_ml.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,25.232538,28.284647,23.977615,32.352316,28.031643,26.001589,29.736585,29.064304,29.523152,31.679507
1,0.690613,-0.572217,1.166222,4.744884,1.296119,4.937452,5.920523,1.981871,2.255876,1.072728
2,-0.355157,3.580889,2.649295,6.14707,1.138704,-0.901334,1.558611,2.536019,2.541575,1.975843
3,17.935001,15.317469,15.259318,17.767906,20.638412,16.343103,18.122447,22.990882,14.94685,19.183812
4,1.056877,-0.557862,1.55299,1.510469,2.67291,3.69553,4.770503,5.113976,4.065124,2.834514


In [40]:
# Report the number of users and items known to the fitted ALS model,
# one value per line.
print(algo_als.n_users, algo_als.n_items, sep="\n")

609
6298
