# Initialization

In [444]:
import os, sys
import time
import numpy as np
import pandas as pd
import random
from scipy import stats as st
import itertools
import operator

import torch

from tqdm.notebook import trange
from tqdm import tqdm

random_state = np.random.RandomState(2020)

In [445]:
# get currently working directory
base_dir = os.getcwd()

# load functions from other notebooks
helpers_file = os.path.join(base_dir, 'helpers.ipynb')
%run $helpers_file

# Load spotlight module
for p in ['../spotlight_ext']:
    module_path = os.path.abspath(os.path.join(base_dir, p))
    if module_path not in sys.path:
        sys.path.append(module_path)

# Load Dataset

## Models

In [446]:
lstm_model = load_model(model_type='entire')
pooling_model = load_model('pooling')

pretrained_models = {
    'lstm': lstm_model,
    'pooling': pooling_model,
}

## Dataset

In [447]:
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset

# get dataset
dataset = get_movielens_dataset(variant='1M')
train, test = random_train_test_split(dataset, random_state=random_state)

max_sequence_length = 20
train = train.to_sequence(max_sequence_length=max_sequence_length)
test = test.to_sequence(max_sequence_length=max_sequence_length)

# Genetic Search

## Initialization

target item is 149 in this test case, top k is 10

In [448]:
test_interaction = test.sequences[test.user_ids == 3][0].copy()
test_interaction = test_interaction[test_interaction != 0]
test_interaction.sort()
test_interaction

array([ 59, 114, 124, 125, 177, 186, 190, 191, 196, 197, 200], dtype=int32)

In [449]:
len_test_interaction = len(test_interaction)
len_test_interaction

11

In [450]:
def get_position_item(model, test_interaction, position=1):
    prediction = model.predict(test_interaction)
    prediction[test_interaction] = -StaticVars.FLOAT_MAX
    rk_data = st.rankdata(-prediction, method='ordinal')
    index = np.where(rk_data == position)
    return index[0][0]

In [451]:
get_position_item(pooling_model, test_interaction, 2)

167

## Random CF candidate selection

In [452]:
import numpy as np

def generate_random_sublists(original_list, sublists_info):
    result_sublists = []
    rng = np.random.default_rng(seed=2020)  # Seed for reproducibility

    for length, count in sublists_info.items():
        generated_sublists_for_length = set()

        while len(generated_sublists_for_length) < count:
            sublist = tuple(rng.choice(original_list, length, replace=False))
            generated_sublists_for_length.add(sublist)

        result_sublists.extend(np.array(list(sublist)) for sublist in generated_sublists_for_length)

    return result_sublists


## Crossover and Mutation

In [453]:
import numpy as np

def crossover(first_list, second_list, p_cross):
    # Find the shorter length among the two lists
    length_first = len(first_list)
    length_second = len(second_list)
    shorter_length = min(length_first, length_second)
    
    # Compute the number of crossover points
    num_crossovers = int(shorter_length * p_cross)
    
    # Choose random indices for crossover within the range of shorter length
    rng = np.random.default_rng(seed=2020)
    crossover_indices_first = rng.choice(shorter_length, num_crossovers, replace=False)
    crossover_indices_second = rng.choice(shorter_length, num_crossovers, replace=False)
    
    # Sort the crossover indices
    crossover_indices_first.sort()
    crossover_indices_second.sort()
    
    # Swap the elements at the crossover indices
    for i in range(num_crossovers):
        index_first = crossover_indices_first[i]
        index_second = crossover_indices_second[i]
        first_list[index_first], second_list[index_second] = second_list[index_second], first_list[index_first]
    
    return first_list, second_list

def mutate_array(org_arr, arr_to_mutate, mutation_probability):
    # Calculate the number of elements to mutate
    num_mutations = int(mutation_probability * len(arr_to_mutate))
    rng = np.random.default_rng(seed=2020)
    # Select the indices to mutate
    indices_to_mutate = rng.choice(range(len(arr_to_mutate)), size=num_mutations, replace=False)
    
    # Mutate the selected elements
    for idx in indices_to_mutate:
        arr_to_mutate[idx] = rng.choice(org_arr)

    return arr_to_mutate


def remove_duplicates(arr):
    _, idx = np.unique(arr, return_index=True)
    return arr[np.sort(idx)]

## Loss Functions

In [454]:
from itertools import combinations, chain
import numpy as np
import scipy.stats as st

class StaticVars:
    FLOAT_MAX = float('inf')

def supersets_of_new_subsets_of_old(new_cf, old_cf):
    diff = np.setdiff1d(old_cf, new_cf)  # Elements that are in old_cf but not in new_cf
    for r in range(1, len(diff) + 1):
        for subset in combinations(diff, r):
            yield np.union1d(new_cf, subset)

def compute_yloss(target_score, kth_score):
    yloss = max(0, target_score / kth_score - 1.0)
    return yloss

def compute_distance(x, y):
    diff = np.setdiff1d(x, y)
    return len(diff)

def compute_loss(old_cf, new_cf, model, target_item, top_k, yloss_cache):
    cache_key = frozenset(new_cf)
    if cache_key in yloss_cache:
        yloss = yloss_cache[cache_key]
    else:
        new_prediction = model.predict(new_cf)
        new_prediction[new_cf] = -StaticVars.FLOAT_MAX
        new_rk_data = st.rankdata(-new_prediction, method='ordinal')

        top_k_index = np.where(new_rk_data == top_k)[0][0]
        yloss = compute_yloss(new_prediction[target_item], new_prediction[top_k_index])
        yloss_cache[cache_key] = yloss
    dis = compute_distance(old_cf, new_cf)

    subset_yloss = 0
    for superset in supersets_of_new_subsets_of_old(new_cf, old_cf):
        cache_key = frozenset(superset)
        if cache_key in yloss_cache:
            subset_yloss += yloss_cache[cache_key]
        else:
            subset_prediction = model.predict(superset)
            subset_prediction[superset] = -StaticVars.FLOAT_MAX
            sub_rk_data = st.rankdata(-subset_prediction, method='ordinal')
            sub_top_k_index = np.where(sub_rk_data == top_k)[0][0]
            subset_yloss += compute_yloss(subset_prediction[target_item], subset_prediction[sub_top_k_index])
            yloss_cache[cache_key] = subset_yloss

    return list([yloss, dis, subset_yloss])


# NSGA-II
Apply NSGA-II to the problem of finding the optimal candicates in multi-objective optimization problem.
Based on:
- Non-domination Rank
- Crowding Distance

In [455]:
def dominates(row, candidateRow):
    """Determine if one solution dominates another"""
    return all(r <= cr for r, cr in zip(row, candidateRow)) and any(r < cr for r, cr in zip(row, candidateRow))

def crowding_distance_assignment(front, values):
    distances = [0] * len(values)  # Initialize the distance for every solution as 0
    num_objs = len(values[0])
    
    for m in range(num_objs):
        sorted_front = sorted(front, key=lambda x: values[x][m])

        # Assign infinite distance at boundaries.
        distances[sorted_front[0]] = distances[sorted_front[-1]] = float('inf')

        # Normalize the objective values for distance computation.
        obj_min = values[sorted_front[0]][m]
        obj_max = values[sorted_front[-1]][m]
        denom = obj_max - obj_min if obj_max != obj_min else 1

        for i in range(1, len(sorted_front) - 1):
            distances[sorted_front[i]] += (values[sorted_front[i + 1]][m] - values[sorted_front[i - 1]][m]) / denom

    return distances



def fast_nondominated_sort(values):
    """NSGA-II's fast non-dominated sort"""
    S = [[] for _ in range(len(values))]
    front = [[]]
    n = [0 for _ in range(len(values))]
    rank = [-1 for _ in range(len(values))]
    
    for p in range(len(values)):
        S[p] = []
        n[p] = 0
        for q in range(len(values)):
            if dominates(values[p], values[q]):
                S[p].append(q)
            elif dominates(values[q], values[p]):
                n[p] += 1
        if n[p] == 0:
            rank[p] = 0
            front[0].append(p)
            
    i = 0
    while front[i]:
        nextFront = []
        for p in front[i]:
            for q in S[p]:
                n[q] = n[q] - 1
                if n[q] == 0:
                    rank[q] = i + 1
                    nextFront.append(q)
        i = i + 1
        front.append(nextFront)

    del front[len(front) - 1]
    
    # Initialize crowding distances as zeros
    crowding_distances = [0] * len(values)
    
    for front_solutions in front:
        current_front_distances = crowding_distance_assignment(front_solutions, values)
        for j, solution in enumerate(front_solutions):
            crowding_distances[solution] = current_front_distances[solution]
    
    return rank, crowding_distances


# Pipeline

In [456]:
def generate_random_pairs(list_of_arrays, n):
    # Generate all possible pairs
    random.seed(2020)
    all_pairs = list(itertools.combinations(list_of_arrays, 2))

    # Randomly select n pairs
    random_pairs = random.sample(all_pairs, n)

    return random_pairs

In [457]:
def generation(interaction, candidates, model, target, k, yloss_cache, crossover_p, mutation_p):
    print(len(candidates))
    pairs = generate_random_pairs(candidates, len(candidates)//2)
    for first, second in pairs:
        first, second = crossover(first, second, crossover_p)
        first = mutate_array(interaction, first, mutation_p)
        second = mutate_array(interaction, second, mutation_p)
        first = remove_duplicates(first)
        second = remove_duplicates(second)
        candidates.append(first)
        candidates.append(second)
    print(len(candidates))
    losses = [compute_loss(interaction, arr, model, target, k, yloss_cache) for arr in candidates]
    print(losses)
    solved = False
    solved_list = []
    for i in range(len(losses)):
        if losses[i][0] == 0:
            solved = True
            solved_list.append(candidates[i])
    if solved:
        return solved_list, solved
    ranks, crowding_distances = fast_nondominated_sort(losses)
    print(ranks)
    candidates_with_metrics = list(zip(candidates, ranks, crowding_distances))

    # Sort based on ranks (ascending) and then crowding distances (descending)
    candidates_with_metrics.sort(key=lambda x: (x[1], -x[2]))

    # Extract candidates after sorting
    sorted_candidates = [pair[0] for pair in candidates_with_metrics]

    # Extract the top third of candidates
    least_loss_arrays = sorted_candidates[:len(sorted_candidates)//3]

    return least_loss_arrays, solved

In [458]:
def main(model, test_interaction, rank, sublists_info, top_k, crossover_p, mutation_p):
    target = get_position_item(model, test_interaction, rank)
    new_gen = generate_random_sublists(test_interaction, sublists_info)
    solved = False
    yloss_cache = {}
    while solved is not True:
        new_gen, solved = generation(test_interaction, new_gen, model, target, top_k, yloss_cache, crossover_p, mutation_p)
    return new_gen

In [459]:
# yloss_cache = {}
sublists_info ={
    5:5,
    6:5,
    7:5,
    8:5,
    9:5,
    10:5
}
# target = get_position_item(pooling_model, test_interaction, 1)
# first_gen = generate_random_sublists(test_interaction, sublists_info)
# generation(test_interaction, first_gen, lstm_model, target, 10, yloss_cache, 0.3, 0.2)
main(pooling_model, test_interaction, 1, sublists_info, 10, 0.3, 0.2)

30


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (435, 2) + inhomogeneous part.