In [44]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict

In [74]:
# Load the three CSV inputs; paths are relative to the notebook's working directory.
# NOTE: the name `data` is rebound later in the notebook to the output of file_convert().
data = pd.read_csv('data/sample_df.csv')
# Encoded sequences; file_convert() reads its 'state', 'next_state' and 'action_reward' columns.
sequence_df = pd.read_csv('data/preprocessed_df.csv')
# Item embeddings; read_embeddings() reads the 'embedding' column and create_item_mappings() the 'item' column.
embeddings_df = pd.read_csv('data/embeddings.csv')

In [82]:
def file_convert(sequence_data):
    """Separate items and ratings in the encoded sequence columns.

    Every cell of 'state', 'next_state' and 'action_reward' is a string of
    '|'-separated entries, each entry being '&'-separated fields with the item
    first and the rating/reward second.

    Args:
        sequence_data: DataFrame with string columns 'state', 'next_state' and
            'action_reward'. The frame is left untouched; a converted copy is
            returned.

    Returns:
        A new DataFrame in which
          * 'state' / 'next_state' hold a 1-D numpy array of the items,
          * 'action' holds the list of items taken from 'action_reward',
          * 'reward' holds the tuple of second fields from 'action_reward',
        and 'action_reward' itself is dropped. All values remain strings.
    """
    def split_entries(cell):
        # "a&1|b&0" -> [['a', '1'], ['b', '0']]
        return [entry.split('&') for entry in cell.split('|')]

    # Work on a copy so the caller's frame keeps its raw encoded columns.
    converted = sequence_data.copy()

    for col in ['state', 'next_state']:
        # Only the item (first field) of each entry is kept for states.
        converted[col] = converted[col].map(
            lambda cell: np.array([fields[0] for fields in split_entries(cell)]))

    pairs = converted['action_reward'].map(split_entries)
    converted['action'] = pairs.map(lambda entries: [fields[0] for fields in entries])
    converted['reward'] = pairs.map(lambda entries: tuple(fields[1] for fields in entries))
    return converted.drop(columns=['action_reward'])
def read_embeddings(embeddings_file):
    '''Parse the 'embedding' column into a numeric matrix.

    Each cell is a '|'-separated string of numbers; one row of the returned
    array is produced per cell, with every component converted to float64.
    '''
    vectors = []
    for encoded in embeddings_file['embedding']:
        components = encoded.split('|')
        vectors.append([np.float64(value) for value in components])
    return np.array(vectors)
def create_item_mappings(embeddings_df):
    """Map every item to the integer row position of its embedding.

    Items are strings, so they are mapped to an integer index that can be used
    to look up a row of the embedding matrix. The position in the 'item'
    column is used rather than the DataFrame index label, so the mapping stays
    valid for frames whose index is not 0..n-1 (e.g. after filtering).

    Args:
        embeddings_df: DataFrame with an 'item' column.

    Returns:
        dict of item -> 0-based row position. If an item occurs more than once
        the last occurrence wins.
    """
    return {item: position for position, item in enumerate(embeddings_df['item'])}

In [91]:
"""helper class for reading embeddings"""
class Embeddings:
    """Couples an embedding matrix with an item -> row-index lookup table."""

    def __init__(self, item_embeddings, item_mapping_dict):
        # item_mapping_dict[item] is the row of item_embeddings holding that item's vector.
        self.item_mapping_dict = item_mapping_dict
        self.item_embeddings = item_embeddings

    def size(self):
        """Return the length of the second axis of the embedding matrix."""
        dimensions = self.item_embeddings.shape
        return dimensions[1]

    def get_embedding_vector(self):
        """Return the stored embedding matrix itself (not a copy)."""
        return self.item_embeddings

    def get_embedding(self, item):
        """Return the embedding row for `item`; unknown items raise KeyError."""
        return self.item_embeddings[self.item_mapping_dict[item]]

    def embed(self, item_list):
        """Return an array with one embedding row per entry of `item_list`."""
        rows = []
        for entry in item_list:
            rows.append(self.get_embedding(entry))
        return np.array(rows)

### Environment/Simulator

In [60]:
import gym
from gym import Env, spaces
from gym.utils import seeding
from gym.envs.registration import register

In [61]:
# Show the module object this code is running as. Its name is the module part
# of the "module:class" entry_point string handed to gym's register().
import sys
sys.modules[__name__]

<module '__main__'>

In [62]:
def register_env():
    """Register the recsys environment with gym under the id "recsys-v0"."""
    # change module (__main__) when converting to python script
    spec = {
        "id": "recsys-v0",
        "entry_point": "__main__:recsys",
    }
    register(**spec)

In [117]:
class recsys(Env):
    """Recommendation simulator built from logged (state, action, reward) rows.

    The logged rows are embedded and grouped by their reward tuple. A new
    (state, action) pair is scored by its cosine similarity to the average
    state and average action of every group; those scores are normalised and
    used as reward probabilities.

    Args:
        data: DataFrame with columns 'state' and 'action' (iterables of item
            ids known to `embeddings`) and 'reward' (iterable of numbers or
            numeric strings).
        embeddings: object exposing get_embedding(item_id).
        alpha: weight of the state similarity; (1 - alpha) weights the action
            similarity.
        gamma: discount; the k-th reward of a tuple is weighted by gamma**k.
        fixed_length: if truthy, the oldest item is removed every time an
            action is appended to the state, keeping its length constant.
    """

    def __init__(self, data, embeddings, alpha, gamma, fixed_length):
        self.embeddings = embeddings
        self.embedded_data = pd.DataFrame()
        self.embedded_data['state'] = [
            np.array([embeddings.get_embedding(item_id) for item_id in items])
            for items in data['state']]
        self.embedded_data['action'] = [
            np.array([embeddings.get_embedding(item_id) for item_id in items])
            for items in data['action']]
        # Rewards are compared with 0 and multiplied by gamma**k later on, so
        # they must be numeric even when they were parsed out of text. A list
        # (not the Series) is assigned so rows line up by position regardless
        # of the index carried by `data`.
        self.embedded_data['reward'] = [
            tuple(float(reward) for reward in rewards) for rewards in data['reward']]
        self.alpha = alpha
        self.gamma = gamma
        self.fixed_length = fixed_length
        self.current_state = self.reset()
        self.groups = self.get_groups()

    def reset(self):
        """Pick a random logged state, make it the current state and return it."""
        self.init_state = self.embedded_data['state'].sample(1).values[0]
        self.current_state = self.init_state
        return self.current_state

    def step(self, actions):
        '''
        Compute reward and update state.
        Args:
          actions: embedded chosen items, one row per item.
        Returns:
          cumulated_reward: overall reward.
          current_state: updated state.
        '''
        # compute overall reward according to equation 4 in RL Listwise recommender paper
        simulated_rewards, cumulative_rewards = self.simulate_rewards(
            self.current_state.reshape((1, -1)), actions.reshape((1, -1)))
        # Simulator memory from algorithm 1
        for k in range(len(simulated_rewards)):
            if simulated_rewards[k] > 0:
                # Positive reward: append the action to the end of the state.
                # The action row is wrapped in a list so it is 2-D like the
                # state; np.append with an axis needs matching dimensions.
                self.current_state = np.append(self.current_state, [actions[k]], axis=0)
                if self.fixed_length:
                    # remove the first item to keep the simulator memory constant
                    self.current_state = np.delete(self.current_state, 0, axis=0)
        return cumulative_rewards, self.current_state

    def get_groups(self):
        """Calculate average state/action value for each reward group (eqn 3).

        Returns:
            list of dicts with keys 'size', 'rewards', 'average state' and
            'average action'; the averages are reshaped to a single row.
        """
        groups = []
        # Group by the column name (not a one-element list) so the group key is
        # the reward tuple itself rather than a 1-tuple wrapping it.
        for rewards, group in self.embedded_data.groupby('reward'):
            size = group.shape[0]
            states = np.array(list(group['state'].values))
            actions = np.array(list(group['action'].values))
            groups.append({
                'size': size,  # Nx
                'rewards': rewards,  # U_x (combination of rewards)
                'average state': (np.sum(states / np.linalg.norm(states, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1)),  # s_x^-
                'average action': (np.sum(actions / np.linalg.norm(actions, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1))  # a_x^-
            })
        return groups

    def simulate_rewards(self, current_state, action):
        # we'll assume only one reward type which is grouped cosine according to the RL paper
        '''
        Calculate simulated rewards.
        Args:
          current_state: history, embedded items flattened to a single row.
          action: embedded chosen items flattened to a single row.
        Returns:
          returned_rewards: reward tuple of the most probable group.
          cumulated_reward: probability weighted rewards.
        '''
        def cosine_state_action(s_t, a_t, s_i, a_i):
            # Weighted cosine similarity between two (state, action) pairs.
            cosine_state = np.dot(s_t, s_i.T) / (np.linalg.norm(s_t, 2) * np.linalg.norm(s_i, 2))
            cosine_action = np.dot(a_t, a_i.T) / (np.linalg.norm(a_t, 2) * np.linalg.norm(a_i, 2))
            return (self.alpha * cosine_state + (1 - self.alpha) * cosine_action).reshape((1,))

        def overall_reward(rewards, gamma):
            # Discounted sum of one reward tuple.
            return np.sum([gamma**k * reward for k, reward in enumerate(rewards)])

        # Calculate simulated rewards by grouped cosine (equation 1 and 3)
        probabilities = [cosine_state_action(current_state, action, g['average state'], g['average action'])
                         for g in self.groups]
        # normalize probabilities to 1
        probabilities = np.array(probabilities) / sum(probabilities)
        returned_rewards = self.groups[np.argmax(probabilities)]['rewards']
        # Get probability weighted cumulated reward
        cumulated_reward = np.sum([p * overall_reward(g['rewards'], self.gamma)
                                   for p, g in zip(probabilities, self.groups)])

        return returned_rewards, cumulated_reward
        
    
    

In [118]:
# Test code: build the item lookup and embedding matrix, decode the sequence
# data, register the environment and instantiate it through gym.
item_map = create_item_mappings(embeddings_df)
embeddings = Embeddings(read_embeddings(embeddings_df),item_map)
# A copy is passed so sequence_df keeps its raw encoded columns. This rebinds
# `data`, replacing the frame read from data/sample_df.csv.
data = file_convert(sequence_df.copy())
register_env()
# fixed_length is only truth-tested by recsys.step, so 10 behaves like True here.
env = gym.make('recsys-v0',data=data,embeddings=embeddings,alpha=0.5,gamma=1,fixed_length=10) #random parameters

In [119]:
# Next steps: implement the actor and critic networks, the replay memory, the training functions and the evaluation.