# Initialization

In [1]:
import os, sys
import time
import numpy as np
import pandas as pd
import random
from scipy import stats as st
import itertools
import operator

import torch

from tqdm.notebook import trange
from tqdm import tqdm

random_state = np.random.RandomState(2020)

In [2]:
# get currently working directory
base_dir = os.getcwd()

# load functions from other notebooks
helpers_file = os.path.join(base_dir, 'helpers.ipynb')
%run $helpers_file

# Load spotlight module
for p in ['../spotlight_ext']:
    module_path = os.path.abspath(os.path.join(base_dir, p))
    if module_path not in sys.path:
        sys.path.append(module_path)

# Load Dataset

## Models

In [3]:
lstm_model = load_model(model_type='entire')
pooling_model = load_model('pooling')

pretrained_models = {
    'lstm': lstm_model,
    'pooling': pooling_model,
}

## Dataset

In [4]:
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset

# get dataset
dataset = get_movielens_dataset(variant='1M')
train, test = random_train_test_split(dataset, random_state=random_state)

max_sequence_length = 20
train = train.to_sequence(max_sequence_length=max_sequence_length)
test = test.to_sequence(max_sequence_length=max_sequence_length)

# Genetic Search

## Initialization

target item is 149 in this test case, top k is 10

In [5]:
test_interaction = test.sequences[test.user_ids == 3][0].copy()
test_interaction = test_interaction[test_interaction != 0]
test_interaction.sort()
test_interaction

array([ 59, 114, 124, 125, 177, 186, 190, 191, 196, 197, 200], dtype=int32)

In [6]:
len_test_interaction = len(test_interaction)
len_test_interaction

11

In [7]:
def get_position_item(model, test_interaction, position=1):
    prediction = model.predict(test_interaction)
    prediction[test_interaction] = -StaticVars.FLOAT_MAX
    rk_data = st.rankdata(-prediction, method='ordinal')
    index = np.where(rk_data == position)
    return index[0][0]

In [8]:
get_position_item(pooling_model, test_interaction, 2)

167

## Random CF candidate selection

In [9]:
import numpy as np

def generate_random_sublists(original_list, sublists_info):
    result_sublists = []
    rng = np.random.default_rng(seed=2020)  # Seed for reproducibility

    for length, count in sublists_info.items():
        generated_sublists_for_length = set()

        while len(generated_sublists_for_length) < count:
            sublist = tuple(rng.choice(original_list, length, replace=False))
            generated_sublists_for_length.add(sublist)

        result_sublists.extend(np.array(list(sublist)) for sublist in generated_sublists_for_length)

    return result_sublists


## Crossover and Mutation

In [10]:
import numpy as np

def crossover(first_list, second_list, p_cross):
    # Find the shorter length among the two lists
    length_first = len(first_list)
    length_second = len(second_list)
    shorter_length = min(length_first, length_second)
    
    # Compute the number of crossover points
    num_crossovers = int(shorter_length * p_cross)
    
    # Choose random indices for crossover within the range of shorter length
    rng = np.random.default_rng(seed=2020)
    crossover_indices_first = rng.choice(shorter_length, num_crossovers, replace=False)
    crossover_indices_second = rng.choice(shorter_length, num_crossovers, replace=False)
    
    # Sort the crossover indices
    crossover_indices_first.sort()
    crossover_indices_second.sort()
    
    # Swap the elements at the crossover indices
    for i in range(num_crossovers):
        index_first = crossover_indices_first[i]
        index_second = crossover_indices_second[i]
        first_list[index_first], second_list[index_second] = second_list[index_second], first_list[index_first]
    
    return first_list, second_list

def mutate_array(org_arr, arr_to_mutate, mutation_probability):
    # Calculate the number of elements to mutate
    num_mutations = int(mutation_probability * len(arr_to_mutate))
    rng = np.random.default_rng(seed=2020)
    # Select the indices to mutate
    indices_to_mutate = rng.choice(range(len(arr_to_mutate)), size=num_mutations, replace=False)
    
    # Mutate the selected elements
    for idx in indices_to_mutate:
        arr_to_mutate[idx] = rng.choice(org_arr)

    return arr_to_mutate


def remove_duplicates(arr):
    _, idx = np.unique(arr, return_index=True)
    return arr[np.sort(idx)]

## Loss Functions

In [11]:
from itertools import combinations, chain
import numpy as np
import scipy.stats as st

class StaticVars:
    FLOAT_MAX = float('inf')

def supersets_of_new_subsets_of_old(new_cf, old_cf):
    diff = np.setdiff1d(old_cf, new_cf)  # Elements that are in old_cf but not in new_cf
    for r in range(1, len(diff) + 1):
        for subset in combinations(diff, r):
            yield np.union1d(new_cf, subset)

def compute_yloss(target_score, kth_score):
    yloss = max(0, target_score / kth_score - 1.0)
    return yloss

def compute_distance(x, y):
    diff = np.setdiff1d(x, y)
    return len(diff)

def compute_loss(old_cf, new_cf, model, target_item, top_k, yloss_cache):
    cache_key = frozenset(new_cf)
    if cache_key in yloss_cache:
        yloss = yloss_cache[cache_key]
    else:
        new_prediction = model.predict(new_cf)
        new_prediction[new_cf] = -StaticVars.FLOAT_MAX
        new_rk_data = st.rankdata(-new_prediction, method='ordinal')

        top_k_index = np.where(new_rk_data == top_k)[0][0]
        yloss = compute_yloss(new_prediction[target_item], new_prediction[top_k_index])
        yloss_cache[cache_key] = yloss
    dis = compute_distance(old_cf, new_cf)

    subset_yloss = 0
    for superset in supersets_of_new_subsets_of_old(new_cf, old_cf):
        cache_key = frozenset(superset)
        if cache_key in yloss_cache:
            subset_yloss += yloss_cache[cache_key]
        else:
            subset_prediction = model.predict(superset)
            subset_prediction[superset] = -StaticVars.FLOAT_MAX
            sub_rk_data = st.rankdata(-subset_prediction, method='ordinal')
            sub_top_k_index = np.where(sub_rk_data == top_k)[0][0]
            subset_yloss += compute_yloss(subset_prediction[target_item], subset_prediction[sub_top_k_index])
            yloss_cache[cache_key] = subset_yloss

    return list([yloss, dis, subset_yloss])


# NSGA-II
Apply NSGA-II to the problem of finding the optimal candicates in multi-objective optimization problem.
Based on:
- Non-domination Rank
- Crowding Distance

In [12]:
def dominates(row, candidateRow):
    """Determine if one solution dominates another"""
    return all(r <= cr for r, cr in zip(row, candidateRow)) and any(r < cr for r, cr in zip(row, candidateRow))

def crowding_distance_assignment(front, values):
    distances = [0] * len(values)  # Initialize the distance for every solution as 0
    num_objs = len(values[0])
    
    for m in range(num_objs):
        sorted_front = sorted(front, key=lambda x: values[x][m])

        # Assign infinite distance at boundaries.
        distances[sorted_front[0]] = distances[sorted_front[-1]] = float('inf')

        # Normalize the objective values for distance computation.
        obj_min = values[sorted_front[0]][m]
        obj_max = values[sorted_front[-1]][m]
        denom = obj_max - obj_min if obj_max != obj_min else 1

        for i in range(1, len(sorted_front) - 1):
            distances[sorted_front[i]] += (values[sorted_front[i + 1]][m] - values[sorted_front[i - 1]][m]) / denom

    return distances



def fast_nondominated_sort(values):
    """NSGA-II's fast non-dominated sort"""
    S = [[] for _ in range(len(values))]
    front = [[]]
    n = [0 for _ in range(len(values))]
    rank = [-1 for _ in range(len(values))]
    
    for p in range(len(values)):
        S[p] = []
        n[p] = 0
        for q in range(len(values)):
            if dominates(values[p], values[q]):
                S[p].append(q)
            elif dominates(values[q], values[p]):
                n[p] += 1
        if n[p] == 0:
            rank[p] = 0
            front[0].append(p)
            
    i = 0
    while front[i]:
        nextFront = []
        for p in front[i]:
            for q in S[p]:
                n[q] = n[q] - 1
                if n[q] == 0:
                    rank[q] = i + 1
                    nextFront.append(q)
        i = i + 1
        front.append(nextFront)

    del front[len(front) - 1]
    
    # Initialize crowding distances as zeros
    crowding_distances = [0] * len(values)
    
    for front_solutions in front:
        current_front_distances = crowding_distance_assignment(front_solutions, values)
        for j, solution in enumerate(front_solutions):
            crowding_distances[solution] = current_front_distances[solution]
    
    return rank, crowding_distances


# Pipeline

In [13]:
def generate_random_pairs(list_of_arrays, n):
    # Generate all possible pairs
    random.seed(2020)
    all_pairs = list(itertools.combinations(list_of_arrays, 2))

    # Randomly select n pairs
    random_pairs = random.sample(all_pairs, n)

    return random_pairs

In [14]:
def generation(interaction, candidates, model, target, k, yloss_cache, crossover_p, mutation_p, budget):
    # print(len(candidates))
    pairs = generate_random_pairs(candidates, len(candidates)//2)
    for first, second in pairs:
        first, second = crossover(first, second, crossover_p)
        first = mutate_array(interaction, first, mutation_p)
        second = mutate_array(interaction, second, mutation_p)
        first = remove_duplicates(first)
        second = remove_duplicates(second)
        candidates.append(first)
        candidates.append(second)
    # print(len(candidates))
    losses = [compute_loss(interaction, arr, model, target, k, yloss_cache) for arr in candidates]
    budget -= len(candidates)
    # print(losses)
    solved = False
    solved_list = []
    for i in range(len(losses)):
        if losses[i][0] == 0:
            solved = True
            solved_list.append(candidates[i])
    if solved:
        return solved_list, solved, budget
    ranks, crowding_distances = fast_nondominated_sort(losses)
    # print(ranks)
    candidates_with_metrics = list(zip(candidates, ranks, crowding_distances))

    # Sort based on ranks (ascending) and then crowding distances (descending)
    candidates_with_metrics.sort(key=lambda x: (x[1], -x[2]))

    # Extract candidates after sorting
    sorted_candidates = [pair[0] for pair in candidates_with_metrics]

    # Extract the top third of candidates
    least_loss_arrays = sorted_candidates[:len(sorted_candidates)//2]

    return least_loss_arrays, solved, budget

In [15]:
def main(model, test_interaction, rank, sublists_info, top_k, crossover_p, mutation_p, budget):
    target = get_position_item(model, test_interaction, rank)
    new_gen = generate_random_sublists(test_interaction, sublists_info)
    solved = False
    yloss_cache = {}
    while solved is not True:
        new_gen, solved, budget = generation(test_interaction, new_gen, model, target, top_k, yloss_cache, crossover_p, mutation_p, budget)
        if budget <= 0:
            break
    return new_gen, budget

In [16]:
sublists_info ={
    8:5,
    9:5,
    10:5
}
# main(pooling_model, test_interaction, 1, sublists_info, 10, 0.3, 0.2, 1000)

# Brute Force Search
Find hard cases

In [17]:
import numpy as np
import itertools

def subsets_of_array(arr, k):
    if not isinstance(arr, np.ndarray):
        raise ValueError("Input must be a numpy array")

    n = len(arr)
    subsets = []

    for size in range(n-1, n-k-1, -1):
        combinations = itertools.combinations(arr, size)
        for combo in combinations:
            subsets.append(np.array(combo))

    return subsets

In [18]:
def compute_loss_brute(new_cf, model, target_item, top_k, yloss_cache):
    cache_key = frozenset(new_cf)
    if cache_key in yloss_cache:
        yloss = yloss_cache[cache_key]
    else:
        new_prediction = model.predict(new_cf)
        new_prediction[new_cf] = -StaticVars.FLOAT_MAX
        new_rk_data = st.rankdata(-new_prediction, method='ordinal')

        top_k_index = np.where(new_rk_data == top_k)[0][0]
        yloss = compute_yloss(new_prediction[target_item], new_prediction[top_k_index])
        yloss_cache[cache_key] = yloss
    return yloss

In [19]:
def brute(candidates, model, target, top_k, yloss_cache):
    losses = [compute_loss_brute(arr, model, target, top_k, yloss_cache) for arr in candidates]
    solved = False
    for i in range(len(losses)):
        if losses[i] == 0:
            solved = True
    return solved

In [20]:
def brute_main(model, test_interaction, rank, top_k):
    target = get_position_item(model, test_interaction, rank)
    new_gen = subsets_of_array(test_interaction, 3)
    solved = False
    yloss_cache = {}
    solved = brute(new_gen, model, target, top_k, yloss_cache)
    return solved

In [21]:
# final_list = []
# for i in range(1000):
    # test_interaction = test.sequences[i].copy()
    # est_interaction = test_interaction[test_interaction != 0]
    # test_interaction.sort()
#     for j in range(1, 11):
#         solved = brute_main(pooling_model, test_interaction, j, 10)
#         if not solved:
#             print(i, j)
#             final_list.append((i, j))

# with open('final_list.txt', 'w') as file:
#     for item in final_list:
#         file.write(f"{item[0]}, {item[1]}\n")

# Testing hard case

In [22]:
with open("final_list.txt", 'r') as file:
    for line in file:
        i, j = map(int, line.split(','))
        test_interaction = test.sequences[i].copy()
        test_interaction = test_interaction[test_interaction != 0]
        test_interaction.sort()
        length_interaction = len(test_interaction)
        if length_interaction <= 1:
            continue
        elif length_interaction < 5:
            sublists_info = {
                length_interaction - 1: length_interaction
            }
        else:
            sublists_info ={
                length_interaction - 1: length_interaction//2,
                length_interaction - 2: length_interaction * (length_interaction - 1) // 8,
                length_interaction - 3: length_interaction//3,
                length_interaction - 4: length_interaction//4
            }
        output, budget = main(pooling_model, test_interaction, j, sublists_info, 10, 0.3, 0.2, 1000)
        print(budget)


887
887
592
592
864
864
728
864
864
728
831
839
679
728
728
864
-1
864
864
864
864
887
887
728
864
0
0
799
0
864
864
320
456
728
728
864
943
864
864
728
864
864
728
864
639
728
864
864
728
728
728
0
879
864
864
864
864
728
887
864
592
728
815
815
907
919
864
864
864
864
831
864
-9
864
0
864
728
327
663
-9
887
879
959
728
0
907
723
907
864
831
320
728
592
728
728
864
864
775
943
864
864
728
728
864
864
728
864
864
864
864
864
799
919
959
-88
728
728
728
864
728
728
864
456
864
592
864
864
864
864
887
887
831
864
864
864
864
864
728
864
864
864
592
-13
779
-13
864
728
864
864
864
864
964
728
864
864
887
887
887
775
663
887
759
839
599
979
979
864
-1
959
959
-1
919
728
728
592
864
728
728
592
-8
928
-8
320
48
456
735
911
955
955
499
728
728
456
592
864
915
971
943
971
928
928
799
899
728
864
0
0
-1
-1
864
728
728
864
864
864
864
728
640
728
864
864
456
728
864
327
887
887
864
728
864
955
0
0
775
887
864
864
864
864
592
864
864
551
327
831
959
959
728
864
864
864
879
919
839
839
919
919
69