# Notebook Objective and Setup

BGG06 is where synthetic ratings are validated. The actual synthetic ratings are produced in an external file - similarity_process.py

This notebook was used to refine the code for that external file, and it also contains the validation checks for the data that file produces.

## Package Imports

In [None]:
import pandas as pd
import numpy as np
import numba as nb
import requests
import regex as re
import time
import gc
import copy
import json
from statistics import mean

from numba import jit, cuda, prange, typeof, typed, types
from numpy.linalg import norm

from multiprocessing import Pool, Manager

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# from scipy import sparse
# from scipy.sparse import csr_matrix
# from scipy import spatial

# from sklearn.metrics.pairwise import cosine_similarity
# import sklearn.preprocessing as pp
# from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, normalize

import tensorflow as tf
from tensorflow.compat.v1.losses import cosine_distance
from tensorflow.keras.losses import CosineSimilarity

In [None]:
import multiprocessing as mp

ncpus = mp.cpu_count()
print("We have {} cores to work on!".format(ncpus))

## Notebook Functions

In [None]:
global_start = time.time()

In [None]:
# the basic file required for this work - the full matrix

# larger_matrix = pd.read_pickle('synthetic_ratings/users_synthetic_1000_fullmatrix.pkl')

In [None]:
# the basic file required for this work - the full matrix

larger_matrix = pd.read_pickle("real_ratings/users_real_unscaled_fullmatrix.pkl")

In [None]:
# convert full matrix to numpy and delete matrix

matrix_array = larger_matrix.to_numpy()

In [None]:
gameids_columnorder = list(larger_matrix.columns)
gameids_columnorder[:10]

In [None]:
del larger_matrix
gc.collect()

# Similarity Calculations - Jit Parallelization

In [None]:
# This cell is appending all results to a list in order
# As long as the inner function itself is not asynchronous, this should be fine


@jit(nopython=True, parallel=True, fastmath=True)
def math_function(game, matrix_array, all_games):
    """
    Cosine similarity between one column of `matrix_array` and each column in `all_games`.

    For every pair of columns, only the rows where BOTH columns are nonzero are
    used. Pairs that share fewer than 4 such rows get a similarity of 0.0.

    Inputs:
    game: column position of the base item in matrix_array
    matrix_array: 2-D numeric array; items are columns (rows are presumably users - confirm)
    all_games: iterable of column positions to compare against (may include `game` itself)

    Outputs:
    results: list of floats, one per entry of all_games, in the same order

    NOTE(review): parallel=True is requested but the loop uses no prange, so the
    loop itself is not parallelised - confirm whether that is intended.
    """

    # nopython-mode lists must hold a single type, so every entry is a float64
    results = []

    # column for the base item and the rows where it is nonzero
    single_item = matrix_array[:, game].copy()
    indices = np.nonzero(single_item)[0]

    for game2 in all_games:

        next_item = matrix_array[:, game2].copy()
        indices2 = np.nonzero(next_item)[0]

        # rows that are nonzero in both columns
        common_indices = np.intersect1d(indices, indices2)

        if len(common_indices) < 4:
            results.append(0.0)
            continue

        a = single_item[common_indices].astype(np.float32)
        b = next_item[common_indices].astype(np.float32)

        try:
            item_similarity = a @ b.T / (norm(a) * norm(b))
            results.append(float(item_similarity))
        except:
            # best-effort fallback kept from the original: any failure scores 0.0
            results.append(0.0)

    return results

In [None]:
all_games = np.arange(0, matrix_array.shape[1], 1)

games_range = len(all_games[:100])
games_range

In [None]:
all_games = np.arange(0, matrix_array.shape[1], 1)


# Start from an empty storage dictionary for this run.
# A previously saved dictionary could be loaded instead:
# with open('item_similarities/similarity_storage_real_scaled_temp_1.json') as json_file:
#    base_items_storage = json.load(json_file)
base_items_storage_1 = {}

# for each of the first 10 game column positions, compute its similarity to every game
for game in all_games[:10]:

    print("\nStarting game: " + str(game))
    start = time.time()

    # translate the column position into the game id used as the dictionary key
    gameid_1 = gameids_columnorder[game]

    # one similarity per entry of all_games, in column order
    results = math_function(game, matrix_array, all_games)

    base_items_storage_1[gameid_1] = results

    end = time.time()

    # seconds spent on this game
    print(end - start)

# seconds since global_start was set
print(time.time() - global_start)

Time check no multiprocessing, 50 entries: 903


Time check first 300 in 3 notebooks:
2511+511+3171= 6193

2535+538+=3108

In [None]:
gameids_columnorder[258]

Time check first 300 in 3 Pool processes:
3219

In [None]:
# save dictionary
with open(
    "item_similarities/similarity_storage_real_scaled_test_0100.json", "w"
) as convert_file:
    convert_file.write(json.dumps(base_items_storage_1))

In [None]:
del matrix_array
gc.collect()

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably it was
# left here on purpose to stop "Run All" before the cells below - confirm.
break

Time total: 50587 for 10000 entries

Time total: 582220 for all entries 250synth

In [None]:
# Use a dedicated name: assigning to `time` would shadow the imported `time`
# module and break every later `time.time()` call in this kernel.
elapsed_seconds = 82220

# seconds -> hours
elapsed_seconds / 60 / 60

In [None]:
base_items_storage["174430"]

# Data Validation

## Load the multiprocessed file

In [None]:
# Load the storage dictionary for this block
with open("item_similarities/item_similarity_storage_real.json") as json_file:
    item_similarity_storage_real = json.load(json_file)

In [None]:
len(item_similarity_storage_real)

In [None]:
len(item_similarity_storage_real[0])

In [None]:
item_similarity_storage_real[0]["822"]

In [None]:
item_similarity_storage_real[0]["174430"]

## Load the manual files

In [None]:
# Load the storage dictionary for this block
with open(
    "item_similarities/similarity_storage_real_scaled_test_0100.json"
) as json_file:
    similarity_storage_real_scaled_test_0100 = json.load(json_file)

In [None]:
similarity_storage_real_scaled_test_0100["174430"]

In [None]:
# Load the storage dictionary for this block
with open(
    "item_similarities/similarity_storage_real_scaled_test_200300.json"
) as json_file:
    similarity_storage_real_scaled_test_200300 = json.load(json_file)

In [None]:
similarity_storage_real_scaled_test_200300["822"]

# Deprecated

## Similarity Calculations - Tensorflow (GPU only)

### Code Work - Item to Item tensors

In [None]:
larger_matrix.head()

In [None]:
matrix_array

In [None]:
matrix_array.shape

In [None]:
item1 = 63
item2 = 68

In [None]:
# Step one
# make the single item matrix for the one item
%time single_item = matrix_array[:, item1]
single_item.shape

In [None]:
# get the indices where the item is nonzero
%time indices = list(np.nonzero(single_item)[0])
len(indices)

In [None]:
# Step 2
# make the single item matrix for the next item
%time next_item = matrix_array[:, item2]
# get the indices where the item is nonzero
%time indices2 = list(np.nonzero(next_item)[0])

len(indices2)

In [None]:
%time common_indices = list(set.intersection(set(indices), set(indices2)))
len(common_indices)

In [None]:
start = time.time()

# Step one
# make the single item matrix for the one item
%time single_item = matrix_array[:, item1]
# get the indices where the item is nonzero
%time indices = list(np.nonzero(single_item)[0])
# %time indices = np.nonzero(single_item)[0]

# Step 2
# make the single item matrix for the next item
%time next_item = matrix_array[:, item2]
# get the indices where the item is nonzero
%time indices2 = list(np.nonzero(next_item)[0])
# %time indices2 = np.nonzero(next_item)[0]

# step 3
# get the indices in common between the two
# %time common_indices = list(set.intersection(set(indices), set(indices2)))
%time common_indices = list(set.intersection(set(indices), set(indices2)))

end = time.time()

print(end - start)

#### Method - TF Matmul

In [None]:
# step 4
%time reduced_item1 = single_item[common_indices].reshape(1,-1)
%time reduced_item2 = next_item[common_indices].reshape(-1,1)

In [None]:
reduced_item1.shape, reduced_item2.shape

In [None]:
# step 5
%time a = tf.constant(reduced_item1, dtype=tf.float32)
%time b = tf.constant(reduced_item2, dtype=tf.float32)
%time a = tf.nn.l2_normalize(a)
%time b = tf.nn.l2_normalize(b)

In [None]:
# step 6
item_similarity = round(float(tf.matmul(a, b)), 2)
item_similarity

In [None]:
start = time.time()

# Step one
# make the single item matrix for the one item
%time single_item = matrix_array[:, item1]
# get the indices where the item is nonzero
%time indices = np.nonzero(single_item)[0]
# %time indices = np.nonzero(single_item)[0]

# Step 2
# make the single item matrix for the next item
%time next_item = matrix_array[:, item2]
# get the indices where the item is nonzero
%time indices2 = np.nonzero(next_item)[0]
# %time indices2 = np.nonzero(next_item)[0]

# step 3
# get the indices in common between the two
%time common_indices = list(set.intersection(set(indices), set(indices2)))

# step 4
%time reduced_item1 = single_item[common_indices].reshape(1,-1)
%time reduced_item2 = next_item[common_indices].reshape(-1,1)

# step 5
%time a = tf.constant(reduced_item1, dtype=tf.float32)
%time b = tf.constant(reduced_item2, dtype=tf.float32)
%time a = tf.nn.l2_normalize(a)
%time b = tf.nn.l2_normalize(b)

# step 6
%time item_similarity = float(tf.matmul(a, b))

end = time.time()

print(end - start)

print(item_similarity)

#### Method- TF cosine_distance

In [None]:
start = time.time()

# Step one
# make the single item matrix for the one item
%time single_item = matrix_array[:, item1]
# get the indices where the item is nonzero
%time indices = np.nonzero(single_item)[0]

# Step 2
# make the single item matrix for the next item
%time next_item = matrix_array[:, item2]
# get the indices where the item is nonzero
%time indices2 = np.nonzero(next_item)[0]

# step 3
# get the indices in common between the two
%time common_indices = list(set.intersection(set(indices), set(indices2)))

# step 4
%time reduced_item1 = single_item[common_indices]
%time reduced_item2 = next_item[common_indices]

# step 5
%time a = tf.constant(reduced_item1, dtype=tf.float32)
%time b = tf.constant(reduced_item2, dtype=tf.float32)
%time a = tf.nn.l2_normalize(a)
%time b = tf.nn.l2_normalize(b)

# step 6
%time item_similarity = 1-cosine_distance(a, b, axis=0).numpy()

end = time.time()

print(end - start)

print(item_similarity)

#### Method - TF Cosine similarity

In [None]:
cos_sim = CosineSimilarity()

In [None]:
start = time.time()

# Step one
# make the single item matrix for the one item
%time single_item = matrix_array[:, item1]
# get the indices where the item is nonzero
%time indices = np.nonzero(single_item)[0]

# Step 2
# make the single item matrix for the next item
%time next_item = matrix_array[:, item2]
# get the indices where the item is nonzero
%time indices2 = np.nonzero(next_item)[0]

# step 3
# get the indices in common between the two
%time common_indices = list(set.intersection(set(indices), set(indices2)))

# step 4
%time reduced_item1 = single_item[common_indices]
%time reduced_item2 = next_item[common_indices]

# step 5
%time a = tf.constant(reduced_item1, dtype=tf.float32)
%time b = tf.constant(reduced_item2, dtype=tf.float32)
%time a = tf.nn.l2_normalize(a)
%time b = tf.nn.l2_normalize(b)

# step 6
%time item_similarity = cos_sim(a, b).numpy()

end = time.time()

print(end - start)

print(item_similarity)

## Make item-item calculations

In [None]:
number_of_games = np.arange(0, matrix_array.shape[1], 1)

time_test = []


# Load the similarity dictionary accumulated so far
with open("item_similarities/similarity_storage_real_scaled.json") as json_file:
    base_items_storage = json.load(json_file)

# for each of the first 500 game columns, compare against every other game column
for game in number_of_games[:500]:

    print("\nStarting game: " + str(game))
    start = time.time()

    gameid_1 = gameids_columnorder[game]

    # column for the base item and the rows where it is nonzero
    single_item = matrix_array[:, game]
    indices = np.nonzero(single_item)[0]

    for game2 in number_of_games:

        if game == game2:
            continue

        gameid_2 = gameids_columnorder[game2]

        # this pair was already stored (similarities are written symmetrically below)
        if gameid_2 in base_items_storage[gameid_1]:
            continue

        next_item = matrix_array[:, game2]
        indices2 = np.nonzero(next_item)[0]

        # rows that are nonzero in both columns
        common_indices = list(set.intersection(set(indices), set(indices2)))

        if len(common_indices) < 3:
            item_similarity = 0

        else:

            reduced_item1 = single_item[common_indices]
            reduced_item2 = next_item[common_indices]

            a = tf.constant(reduced_item1, dtype=tf.float32)
            b = tf.constant(reduced_item2, dtype=tf.float32)

            a = tf.nn.l2_normalize(a)
            b = tf.nn.l2_normalize(b)

            # Convert to a built-in float: the numpy float32 that comes back from
            # .numpy() cannot be serialized by json.dumps when the dict is saved.
            item_similarity = float(1 - cosine_distance(a, b, axis=0).numpy())

        base_items_storage[gameid_1][gameid_2] = item_similarity
        base_items_storage[gameid_2][gameid_1] = item_similarity

    end = time.time()

    print("Time for this game: " + str(end - start) + "\n")

# save dictionary
with open("item_similarities/similarity_storage_real_scaled.json", "w") as convert_file:
    convert_file.write(json.dumps(base_items_storage))

In [None]:
# save dictionary
with open("item_similarities/similarity_storage_real_scaled.json", "w") as convert_file:
    convert_file.write(json.dumps(base_items_storage))

In [None]:
base_items_storage["84776"]

## Code Work - User to User Tensors

### Preparing the user blocks and user storage dictionaries

In [None]:
"""users_list = list(larger_matrix.index)

user_blocks_lookup = {}

chunk_size = int(np.ceil(matrix_array.shape[0]/20))

start = 0
incrementer = 0

while start < matrix_array.shape[0]:
    
    end = start + chunk_size
    incrementer += 1
    
    user_blocks_lookup[incrementer] = users_list[start:end]

    start += chunk_size

print("\nLookup dictionary complete")"""

In [None]:
"""user_id_lookup = {}

increment=0
for user in users_list:
    
    increment+=1
    user_id_lookup[increment] = user"""

In [None]:
"""block_indices_lookup = {}

start = 0
incrementer = 0

while start < matrix_array.shape[0]:
    
    end = start + chunk_size
    incrementer += 1
    
    block_indices_lookup[incrementer] = {}
    block_indices_lookup[incrementer]['Start'] = start
    block_indices_lookup[incrementer]['End'] = end
    
    start += chunk_size

print("\nLookup dictionary complete")"""

In [None]:
del larger_matrix
gc.collect()

ONLY RUN THIS AGAIN IF THE USER LIST CHANGES !!!!!  THIS WILL RESET ALL STORAGE DICTIONARIES ON DISK

In [None]:
"""max_range = len(user_blocks_lookup)+1

for item in np.arange(1,max_range,1):
    
    storage_dict = {}
    
    for user in user_blocks_lookup[item]:
        storage_dict[user] = {}
    
    # save dictionary
    with open('synthetic_ratings/similarity_storage_synth_items2k'+str(item)+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(storage_dict))
    
    del storage_dict"""

In [None]:
precompute_matrix = matrix_array[:134000].T
precompute_matrix2 = matrix_array[134000:].T

In [None]:
# TEMP STUFF
# Builds a lookup containing only block 1, spanning [0, chunk_size).
# NOTE(review): in the visible notebook `chunk_size` is only assigned inside a
# quoted-out (string) cell, so this cell needs it defined some other way - confirm.

block_indices_lookup = {}

start = 0
incrementer = 0  # not used below


end = start + chunk_size

block_indices_lookup[1] = {}
block_indices_lookup[1]["Start"] = start
block_indices_lookup[1]["End"] = end

print("\nLookup dictionary complete")

In [None]:
def process_user_block(array_chunk_a, matrix, indices):
    """
    Cosine similarity of one row vector against every column of a row-subset of `matrix`.

    Inputs:
    array_chunk_a: 2-D array; the notebook passes a (1, k) slice holding one
        user's nonzero ratings
    matrix: 2-D array whose rows are selected with `indices`; the notebook
        passes a transposed slice of the ratings matrix
    indices: row positions of `matrix` to keep (k of them, matching array_chunk_a)

    Outputs:
    user_similarities: numpy array of shape (n, 1) with the flattened similarity scores
    """

    # keep only the rows that line up with the entries of array_chunk_a
    array_chunk_b = matrix[indices, :]

    a = tf.constant(array_chunk_a, dtype=tf.float32)
    b = tf.constant(array_chunk_b, dtype=tf.float32)

    # unit-normalise a along its row and b along each column, so that the
    # matrix product below is the cosine similarity of every (row, column) pair
    a = tf.nn.l2_normalize(a, 1)
    b = tf.nn.l2_normalize(b, 0)

    similarities = tf.matmul(a, b)

    return similarities.numpy().reshape(-1, 1)

In [None]:
# per-user elapsed times, used for the running average printed after each block
time_test = []

# for each user block in block_indices_lookup (keys are the block numbers)
for user_block in block_indices_lookup:

    print("Starting block " + str(user_block))

    # Get the start and end indexes for the block
    starting_block_indexes = block_indices_lookup[user_block]
    base_start = starting_block_indexes["Start"]  # starting user
    base_end = starting_block_indexes["End"]  # ending user (not used by the loop below)

    # Load the storage dictionary for this block
    with open(
        "real_ratings/similarity_storage_real_" + str(user_block) + ".json"
    ) as json_file:
        base_users_storage = json.load(json_file)

    # NOTE(review): the upper bound is hard-coded to 1001 rather than base_end,
    # so only row positions base_start..1000 are processed - confirm this is intended.
    for user_id in np.arange(base_start, 1001, 1):  # base_end, 1):
        print(user_id)

        # NOTE(review): in the visible notebook `user_id_lookup` is only built inside
        # a quoted-out (string) cell, with keys starting at 1 - confirm it is defined.
        user_name = user_id_lookup[user_id + 1]
        # print(user_name)

        # log start time
        # print("Making matrices")
        start = time.time()

        # make the single user matrix for the one user
        single_user = matrix_array[user_id].reshape(1, -1)
        # get the indices where the user is nonzero
        indices = list(np.nonzero(single_user)[1])
        # make the user with only the nonzero indices
        array_chunk_a = single_user[:, indices]  # .astype('float32')
        # normalize_a = normalize(array_chunk_a, axis=1)
        checkpoint = time.time()
        # print(str(checkpoint-start)+" Processed single user")

        # score this user against both halves of the transposed matrix
        user_similarities_1 = process_user_block(
            array_chunk_a, precompute_matrix, indices
        )
        user_similarities_2 = process_user_block(
            array_chunk_a, precompute_matrix2, indices
        )
        # user_similarities = process_user_block(array_chunk_a, precompute_matrix, indices)

        checkpoint3 = time.time()
        # join the two halves into one flat array
        user_similarities = np.append(user_similarities_1, user_similarities_2)
        # overwrite the single largest score with the median of all scores
        # (presumably the largest is the user's similarity to themself - confirm);
        # note that `mean_spot` holds the median, not the mean
        max_spot = np.argmax(user_similarities)
        mean_spot = np.median(user_similarities)
        user_similarities[max_spot] = mean_spot
        # rescale all scores into the range [-1, 1]
        scaler = MinMaxScaler(feature_range=(-1, 1))
        user_similarities = scaler.fit_transform(
            user_similarities.reshape(-1, 1)
        ).ravel()
        # user_similarities = list(np.round(user_similarities, 2).ravel())

        checkpoint4 = time.time()
        # print(str(checkpoint4-checkpoint3)+" Processed/Scaled Similarity scores")

        # positions scoring >= 0.6 or <= -0.6 (the thresholds are 0.6 despite the "75" names)
        over75 = list((user_similarities >= 0.6).nonzero()[0])
        under75 = list((user_similarities <= -0.6).nonzero()[0])
        all_comps = over75 + under75

        # store each retained score, rounded to 2 places, under its integer position
        for item in all_comps:
            item = int(item)
            base_users_storage[user_name][item] = round(
                float(user_similarities[item]), 2
            )

        checkpoint5 = time.time()
        # print(str(checkpoint5-checkpoint4)+" Stored scores in dictionary\n")

        end = time.time()
        elapsed = end - start
        # print(str(elapsed)+' seconds elapsed for this user\n\n')
        time_test.append(elapsed)

    print("Saving dictionary for this set of users")
    # save dictionary
    with open(
        "real_ratings/similarity_storage_real_" + str(user_block) + ".json", "w"
    ) as convert_file:
        convert_file.write(json.dumps(base_users_storage))

    avg_time = mean(time_test)
    print("Average time per user: " + str(avg_time))

    # free the block's dictionary; base_users_storage no longer exists after this loop
    del base_users_storage
    gc.collect()

In [None]:
base_users_storage["cfarrell"]

In [None]:
len(base_users_storage["Torsten"])

## Make smaller ratings blocks

In [None]:
# Opening JSON file
with open("real_ratings/user_ratings_unscaled.json") as json_file:
    user_ratings = json.load(json_file)

In [None]:
all_users = list(user_ratings.keys())

In [None]:
len(all_users)

In [None]:
user_block_1 = all_users[:40000]
user_block_2 = all_users[40000:80000]
user_block_3 = all_users[80000:120000]
user_block_4 = all_users[120000:160000]
user_block_5 = all_users[160000:200000]
user_block_6 = all_users[200000:240000]
user_block_7 = all_users[240000:]

user_blocks = [
    user_block_1,
    user_block_2,
    user_block_3,
    user_block_4,
    user_block_5,
    user_block_6,
    user_block_7,
]

In [None]:
iteration = 0

# write each block of users to its own JSON file, numbered from 1
for iteration, block in enumerate(user_blocks, start=1):

    print("Starting block " + str(iteration))

    # Look each username up directly. Testing `key in block` against the block
    # list for every user in user_ratings rescans the whole list each time.
    block_of_users = {key: user_ratings[key] for key in block if key in user_ratings}

    # Ratings are written unscaled. For a scaled export, subtract each user's
    # mean rating from every one of their ratings (rounded to 2 places) here.

    # save dictionary
    with open(
        "real_ratings/user_ratings_block_unscaled_" + str(iteration) + ".json", "w"
    ) as convert_file:
        convert_file.write(json.dumps(block_of_users))

    del block_of_users
    gc.collect()

In [None]:
del user_blocks
del user_ratings
gc.collect()

In [None]:
# make dataframe from synthetic sort and melt to longform
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary)
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={"index": "BGGId", user: "Rating"}, inplace=True)
synthetic_user_ratings["Rating"] = synthetic_user_ratings["Rating"] + user_mean


synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary).T
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={"index": "UserID"}, inplace=True)
synthetic_user_ratings_long = synthetic_user_ratings.melt(
    id_vars="UserID", var_name="BGGId", value_name="Rating"
).dropna()
synthetic_user_ratings_long.sort_values("UserID", inplace=True)
synthetic_user_ratings_long

# save longform
synthetic_user_ratings_long.to_pickle(
    "synthetic_ratings_new_scraper/synthetic_ratings_" + path + "_" + number + ".pkl"
)

In [None]:
def produce_synthetic_ratings(
    user, temp_users_dictionary, num_ratings_create, max_comps=21923
):
    """
    Synthesizes ratings for one user until num_ratings_create items are rated.

    In each pass, every item currently in temp_users_dictionary contributes its
    best-ranked comparable item that the user does not have yet. The new item's
    rating is the source item's rating multiplied by their similarity.

    Comparable items come from the module-level game_comps_byid_lookup, read as
    game_comps_byid_lookup[item][0] (comp ids, best first) and
    game_comps_byid_lookup[item][1] (matching similarities).

    Inputs:
    user: the user id to create ratings for (not used in the computation)
    temp_users_dictionary: dictionary of the user's ratings, item -> rating.
        MUTATED in place: every synthesized rating is added to it.
    num_ratings_create: stop once the user has at least this many items
    max_comps: highest number of comps examined per item. The effective bound is
        the smaller of this and the length of the item's comp list, so a short
        comp list no longer raises IndexError.

    Outputs:
    users_comp_dict: item -> [overall confidence, similarity to source item,
        source item, iteration, degrees away, rating]
    """

    print("Producing items for user")

    # start at iteration 0
    iteration = 0

    # seed the comps with the user's real items: full confidence, zero degrees away
    users_comp_dict = {}
    for item in temp_users_dictionary:
        users_comp_dict[item] = [1, 1, item, 0, 0, temp_users_dictionary[item]]

    while len(users_comp_dict) < num_ratings_create:

        # snapshot of the items to expand; items synthesized during this pass
        # only become sources in the next pass
        users_rated_items = list(temp_users_dictionary.keys())

        iteration += 1

        # items added during this pass
        new_items = []

        for rated in users_rated_items:

            print("\nCurrent item: " + str(rated))
            rated_rating = temp_users_dictionary[rated]
            print(rated_rating)

            comp_ids = game_comps_byid_lookup[rated][0]
            comp_similarities = game_comps_byid_lookup[rated][1]
            limit = min(max_comps, len(comp_ids))

            # Advance to the first comp the user does not have yet. Everything
            # added during this pass is already in users_comp_dict, so a single
            # membership check covers both real and newly synthesized items.
            current_position = 0
            while (
                current_position < limit
                and comp_ids[current_position] in users_comp_dict
            ):
                current_position += 1

            # this item has no unused comps left: abandon the rest of this pass
            if current_position >= limit:
                break

            current_comp = comp_ids[current_position]
            comp_similarity = comp_similarities[current_position]
            print(current_position)
            print(comp_similarity)

            # synthetic rating = rating of the source item * similarity
            synthetic_rating = rated_rating * comp_similarity
            print(synthetic_rating)

            # confidence = confidence of the source item * similarity of this comp
            confidence = users_comp_dict[rated][0] * comp_similarity
            degrees = users_comp_dict[rated][4] + 1

            new_items.append(current_comp)

            users_comp_dict[current_comp] = [
                confidence,
                comp_similarity,
                rated,
                iteration,
                degrees,
                synthetic_rating,
            ]

            # make the synthetic rating available as a source for the next pass
            temp_users_dictionary[current_comp] = synthetic_rating

        # A pass that added nothing would repeat identically forever, so stop.
        if not new_items:
            break

    print("End length of rated items is " + str(len(users_comp_dict)) + "\n")

    return users_comp_dict

In [None]:
# user_matrix = pd.read_pickle('data_store/data_cleaned/ratings_matrix_cleaned_03.pkl')
# user_matrix = user_matrix.T
# user_matrix.index = user_matrix.index.astype('int32')

In [None]:
# run the data synthesizer for each of the 6 ratings matrix files
# NOTE(review): no `process_to_synthetic` is defined in the visible notebook; the
# function defined further down is `process_matrix_to_synthetic`, whose first
# parameter is `path`. `num_ratings_create`, `desired_ratings` and `game_ids` are
# also not assigned in any visible cell - confirm before running.
process_to_synthetic(item, num_ratings_create, desired_ratings, game_ids, "250")

In [None]:
def get_user(user_items, user, game_ids):
    """
    Builds a mean-centred ratings dictionary for one user.

    Each rating has the user's own mean rating subtracted from it. Only items
    whose id also appears in game_ids are kept.

    Inputs:
    user_items: pandas Series of the user's ratings, indexed by game id
    user: user to retrieve (not used in the computation)
    game_ids: the game_ids we are using in our recommender

    Outputs:
    overall_user: dictionary of game_id -> (rating - user's mean rating)
    """

    # centre every rating on the user's mean
    centred_ratings = list(user_items - user_items.mean())

    # ids of everything the user rated, in Series order
    rated_ids = list(user_items.index)

    # pair each rated id with its centred rating
    ratings_by_id = dict(zip(rated_ids, centred_ratings))

    # ids that are both rated by the user and used by the recommender
    usable_ids = set(game_ids).intersection(set(rated_ids))

    return {game_id: ratings_by_id[game_id] for game_id in usable_ids}

In [None]:
def process_matrix_to_synthetic(
    path, num_ratings_create, desired_ratings, game_ids, number
):
    """
    Process a user matrix and create synthetic data for each user in the matrix.

    Reads "data_store/data_cleaned/ratings_matrix_cleaned_<path>.pkl", synthesizes
    ratings per user, then writes two files under synthetic_ratings_new_scraper/:
    a long-form pickle (UserID, BGGId, Rating) and a JSON dump of the per-user
    dictionary.

    Inputs:
    path: string appended to the input and output file names
    num_ratings_create: The total number of minimum ratings per user
    desired_ratings: the needed number of ratings per user
    game_ids: the game ids used by the recommender, passed through to get_user
    number: string appended to the output file names

    Outputs:
    None. Results are written to disk.
    """

    # load the frame, drop repeated rows, and transpose so each column is a user
    user_matrix = pd.read_pickle(
        "data_store/data_cleaned/ratings_matrix_cleaned_" + path + ".pkl"
    )
    user_matrix.drop_duplicates(keep="first", inplace=True)
    user_matrix = user_matrix.T
    user_matrix.index = user_matrix.index.astype("int32")

    # set up a synthetic ratings dictionary to store the users and ratings
    synthetic_users_dictionary = {}

    # for each user in the matrix:
    for user in user_matrix.columns:

        # str() so a non-string column label cannot raise TypeError here
        print("Starting user " + str(user))

        # the items this user actually rated
        user_items = user_matrix[user].dropna(axis=0)

        # store the user's dictionary, and work on a deep copy so that
        # synthesizing does not alter the stored one
        synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
        temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])

        # get the original number of ratings by this user
        original_num_ratings = len(temp_users_dictionary)
        print("User starts with " + str(original_num_ratings) + " ratings")

        # call function to produce synthetic ratings
        user_comps_dict = produce_synthetic_ratings(
            user, temp_users_dictionary, num_ratings_create
        )
        # call sort function for top synthetic ratings; it writes the kept
        # entries into synthetic_users_dictionary[user]
        sort_synthetic_ratings(
            user,
            synthetic_users_dictionary,
            user_comps_dict,
            original_num_ratings,
            desired_ratings,
        )

    # make dataframe from synthetic sort and melt to longform
    synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary).T
    synthetic_user_ratings.reset_index(inplace=True)
    synthetic_user_ratings.rename(columns={"index": "UserID"}, inplace=True)
    synthetic_user_ratings_long = synthetic_user_ratings.melt(
        id_vars="UserID", var_name="BGGId", value_name="Rating"
    ).dropna()
    synthetic_user_ratings_long.sort_values("UserID", inplace=True)

    # save longform
    synthetic_user_ratings_long.to_pickle(
        "synthetic_ratings_new_scraper/synthetic_ratings_"
        + path
        + "_"
        + number
        + ".pkl"
    )

    # save dictionary
    with open(
        "synthetic_ratings_new_scraper/users_dump_syntheticratings"
        + path
        + "_"
        + number
        + ".json",
        "w",
    ) as convert_file:
        convert_file.write(json.dumps(synthetic_users_dictionary))

In [None]:
def sort_synthetic_ratings(
    user,
    synthetic_users_dictionary,
    user_comps_dict,
    original_num_ratings,
    desired_ratings,
):
    """
    Keep the highest-confidence synthetic ratings for one user and log them.

    Builds a dataframe from the user's synthesized comps dict and sorts it by
    "OverallConfidence", highest first. Only as many synthetic ratings as are
    needed to take the user from original_num_ratings up to desired_ratings are
    kept. For each item kept, the item's entry from user_comps_dict is written to
    the user's dictionary in synthetic_users_dictionary.

    Inputs:
    user: specific user to sort
    synthetic_users_dictionary: reference to the dictionary of synthesized items;
        synthetic_users_dictionary[user] is updated in place
    user_comps_dict: dictionary of synthesized ratings specifically for user,
        keyed by item, each value holding the six fields
        (OverallConfidence, SimtoLast, RecFrom, Iteration, DegreesAway, SyntheticRating)
    original_num_ratings: the number of ratings the user actually rated
    desired_ratings: the total number of ratings the user should end up with

    Returns:
    None. If the user already has desired_ratings or more, nothing is added.
    """
    print("Sorting user items")

    # showing synthetic ratings only, best confidence first
    user_comps_df = (
        pd.DataFrame(
            list(user_comps_dict.values()),
            index=list(user_comps_dict.keys()),
            columns=[
                "OverallConfidence",
                "SimtoLast",
                "RecFrom",
                "Iteration",
                "DegreesAway",
                "SyntheticRating",
            ],
        )
        .sort_values("OverallConfidence", ascending=False)
        # drops rows whose six fields are all identical to an earlier row
        .drop_duplicates(keep="first")
    )

    # number of synthetic ratings still needed on top of the real ones; clamped at
    # zero so a user who already exceeds the target does not produce a negative slice
    num_needed = max(desired_ratings - original_num_ratings, 0)

    # get a list of the ratings to keep (past the real ratings)
    keep_items = sorted(user_comps_df.index[:num_needed])

    # for each item that we keep,
    for item in keep_items:

        # add the item's full synthesized entry to the real storage dictionary
        synthetic_users_dictionary[user][item] = user_comps_dict[item]

## Deprecated Matrix Calculations

### Using Matrix

In [None]:
larger_matrix = pd.read_pickle("synthetic_ratings/users_synthetic_2193_fullmatrix.pkl")

In [None]:
larger_matrix.head()

In [None]:
users = list(larger_matrix.index)
users[:5]

In [None]:
# map each user's positional row number (0, 1, 2, ...) to that user's name
user_lookup_table = {}

for user_key, user in enumerate(users):
    user_lookup_table[user_key] = user

# save dictionary (JSON turns the integer keys into strings)
with open("user_lookup_table.json", "w") as convert_file:
    json.dump(user_lookup_table, convert_file)

In [None]:
%time values1 = larger_matrix.loc['Torsten'].values
values1[:10]

In [None]:
%time values2 = larger_matrix.loc['mitnachtKAUBO-I'].values
values2[:10]

In [None]:
%time spatial.distance.cosine(values1,values2)

In [None]:
# Drop any similarity dictionary left over from an earlier run so its memory can be
# reclaimed. A bare `del` raises NameError on a fresh kernel, where the name does not
# exist yet, so it is removed from the notebook namespace only if present.
globals().pop("similarity_dictionary", None)
gc.collect()

In [None]:
similarity_dictionary = {}

for user in users:

    similarity_dictionary[user] = {}

In [None]:
# Time how long it takes to score one user against every other user by pulling
# rows out of the DataFrame one at a time.
for user in users[:1]:

    start = time.time()
    user_values = larger_matrix.loc[user].values

    for other_user in users:

        # this pair was already filled in from the other user's side
        if user in similarity_dictionary[other_user]:
            continue

        other_user_values = larger_matrix.loc[other_user].values

        # cosine similarity is 1 minus the cosine distance
        similarity = 1 - spatial.distance.cosine(user_values, other_user_values)

        # store symmetrically so the pair is never recomputed
        similarity_dictionary[other_user][user] = similarity
        similarity_dictionary[user][other_user] = similarity

    end = time.time()
    print(str(end - start) + " seconds")

In [None]:
# NOTE: `larger_matrix` is intentionally kept alive here. The next section
# ("Using Numpy Arrays") converts it with `larger_matrix.to_numpy()` and deletes it
# right afterwards; deleting it in this cell made that conversion fail with a
# NameError when the notebook is run top to bottom.
gc.collect()

### Using Numpy Arrays

In [None]:
matrix_array = larger_matrix.to_numpy()

In [None]:
del larger_matrix
gc.collect()

In [None]:
matrix_array.shape

In [None]:
%time values1 = matrix_array[0]
values1[:10]

In [None]:
%time values2 = matrix_array[1]
values2[:10]

In [None]:
%time spatial.distance.cosine(values1,values2)

In [None]:
%time similarities = np.matmul(matrix_array[0:10000], matrix_array[0:10000].T)

In [None]:
similarities[:1]

In [None]:
del similarities
gc.collect()

In [None]:
%time similarities = np.dot(matrix_array[0:10000], matrix_array[0:10000].T)

In [None]:
similarities[:1]

In [None]:
del similarities
gc.collect()

In [None]:
%time similarities = matrix_array[0:10000]@matrix_array[0:10000].T

In [None]:
similarities[:1]

In [None]:
del similarities
gc.collect()

In [None]:
len_users = len(users)

In [None]:
del similarity_dictionary
gc.collect()

In [None]:
similarity_dictionary = {}

for user in np.arange(0, len_users, 1):

    similarity_dictionary[user] = {}

In [None]:
len(similarity_dictionary)

In [None]:
similarity_dictionary[0]

In [None]:
# Time a vectorised comparison: one user against every user that follows it.
for user in np.arange(0, len_users, 1)[:1]:

    start = time.time()

    # cosine_similarity expects 2-D inputs shaped (n_samples, n_features) whose
    # feature counts match, so the single user has to be a row vector (1, n_items);
    # reshape(-1, 1) produced a column (n_items, 1) and a feature-count mismatch
    user_values = matrix_array[user].reshape(1, -1)

    # all users positioned after the current one
    other_matrix = matrix_array[user + 1 :]

    # result has one row per other user and a single column for the current user
    similarities = cosine_similarity(other_matrix, user_values)

    end = time.time()
    print(str(end - start) + " seconds")

In [None]:
similarities.shape

In [None]:
# Same pairwise timing experiment as before, but indexing rows of the numpy array
# by position instead of looking users up in the DataFrame.
for user in np.arange(0, len_users, 1)[:3]:

    start = time.time()
    user_values = matrix_array[user]

    for other_user in np.arange(0, len_users, 1):

        # pair already stored from the other side, skip the recomputation
        if user in similarity_dictionary[other_user]:
            continue

        other_user_values = matrix_array[other_user]

        # cosine similarity is 1 minus the cosine distance
        similarity = 1 - spatial.distance.cosine(user_values, other_user_values)

        # record the score under both users
        similarity_dictionary[other_user][user] = similarity
        similarity_dictionary[user][other_user] = similarity

    end = time.time()
    print(str(end - start) + " seconds")

In [None]:
similarity_dictionary[0]

In [None]:
similarity_dictionary[5]

In [None]:
del matrix_array
gc.collect()

In [None]:
# larger_matrix_T = pd.read_pickle('synthetic_ratings/users_synthetic_2193_fullmatrixT.pkl')

## Different ways to make calculations

In [None]:
matrix_sparsed = pd.read_pickle(
    "synthetic_ratings/users_synthetic_2193_sparsematrix.pkl"
)

In [None]:
matrix_sparsed.info()

In [None]:
matrix_sparsed.head()

In [None]:
users = list(matrix_sparsed.index)
users[:5]

### Chunks, sparse non-normalized

In [None]:
%time sparse_matrix = csr_matrix(matrix_sparsed.sparse.to_coo())

In [None]:
del matrix_sparsed
gc.collect()

In [None]:
type(sparse_matrix)

In [None]:
sparse_matrix

In [None]:
sparse_matrix.shape[0]

In [None]:
%time similarities = cosine_similarity(sparse_matrix[0:10000], sparse_matrix[0:10000], dense_output=True)

In [None]:
similarities[0]

In [None]:
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster
chunk_size = 10000
matrix_len = sparse_matrix.shape[0]


def similarity_cosine_by_chunk(start, end, dense):
    """
    Cosine similarity between rows start:end of sparse_matrix and every row of it.

    Uses scikit-learn's cosine_similarity. `end` is capped at matrix_len, and
    `dense` is forwarded as scikit-learn's dense_output flag.
    """
    stop = min(end, matrix_len)
    chunk_rows = sparse_matrix[start:stop]
    return cosine_similarity(X=chunk_rows, Y=sparse_matrix, dense_output=dense)


# for chunk_start in range(0, 10, chunk_size):
# cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
%time cosine_similarity_chunk = similarity_cosine_by_chunk(0, 10000, dense=True)

- Time for size 1, dense output: 39.4s
- Time for size 1000, dense output: 8min 48s
- Time for size 1, compact output: 47.8s
- Time for size 10000, compact output: 1h 41min 6s

In [None]:
cosine_similarity_chunk[:1]

In [None]:
sparse_matrix.shape

In [None]:
sparse_matrix[0:10000].shape

In [None]:
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster
chunk_size = 10000
matrix_len = sparse_matrix.shape[0]


def similarity_cosine_by_chunk(start, end, dense):
    """
    Dot-product similarity between rows start:end of sparse_matrix and every row of it.

    This variant skips scikit-learn and multiplies the matrices directly, so the
    values are raw dot products; they equal cosine similarities only if the rows of
    sparse_matrix have been L2-normalized beforehand.

    Inputs:
    start: first row of the chunk
    end: one past the last row of the chunk (capped at matrix_len)
    dense: if True, return a dense numpy array instead of a sparse matrix

    Returns:
    matrix with one row per chunk row and one column per row of sparse_matrix
    """
    if end > matrix_len:
        end = matrix_len

    # np.matmul does not understand scipy sparse operands, and a row-by-row
    # similarity needs the right operand transposed: (chunk, items) @ (items, rows)
    similarities = sparse_matrix[start:end] @ sparse_matrix.T

    # honour the dense flag the same way the scikit-learn based variant does
    if dense and hasattr(similarities, "toarray"):
        return similarities.toarray()
    return similarities


# for chunk_start in range(0, 10, chunk_size):
# cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
# %time cosine_similarity_chunk = similarity_cosine_by_chunk(0, 10000, dense=True)

In [None]:
similarities[:1]

### Chunks, normalized

In [None]:
normed_matrix = pp.normalize(sparse_matrix.tocsc(), axis=0)
del sparse_matrix
gc.collect()

In [None]:
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster
chunk_size = 1000
matrix_len = normed_matrix.shape[0]


def similarity_cosine_by_chunk(start, end, dense=False):
    """
    Cosine similarity between rows start:end of normed_matrix and every row of it.

    Uses scikit-learn's cosine_similarity. `end` is capped at matrix_len, and
    `dense` (default False) is forwarded as scikit-learn's dense_output flag.
    """
    stop = min(end, matrix_len)
    chunk_rows = normed_matrix[start:stop]
    return cosine_similarity(X=chunk_rows, Y=normed_matrix, dense_output=dense)


# for chunk_start in range(0, 10, chunk_size):
# cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
%time cosine_similarity_chunk = similarity_cosine_by_chunk(0, 1, dense=False)

- Time for size 1, dense output: 1min 51s
- Time for size 1000, dense output: 10min 20s
- Time for size 1, compact output: 1min 51s


In [None]:
cosine_similarity_chunk.shape

In [None]:
cosine_similarity_chunk[:1]

In [None]:
user = "Torsten"

%time sparse_user =  csr_matrix(matrix_sparsed.loc[user])

In [None]:
sparse_user.T

In [None]:
sparseuser_AB = sparse_matrix.multiply(sparse_user)

In [None]:
sparseuser_AB

### Old function with comparison blocks

In [None]:
# For every block of users, compare it against itself and each later block using
# TensorFlow, rescale each base user's similarities to [-1, 1], and write the
# scores into the per-block JSON storage files.
# Relies on names defined elsewhere in the notebook: block_indices_lookup,
# user_blocks_lookup and matrix_array.
for user_block in block_indices_lookup:

    print("Starting block " + str(user_block))

    # row range of matrix_array covered by this block
    starting_block_indexes = block_indices_lookup[user_block]
    base_start = starting_block_indexes["Start"]
    base_end = starting_block_indexes["End"]

    # ratings divided by 10 and stored as float32 to halve the memory footprint
    array_chunk_a = (matrix_array[base_start:base_end] / 10).astype("float32")

    # Opening JSON file
    with open(
        "user_similarities/similarity_storage" + str(user_block) + ".json"
    ) as json_file:
        base_users_storage = json.load(json_file)

    # only compare against this block and the ones after it
    first_block_of_comparison = user_block
    end_range = len(block_indices_lookup) + 1

    # TEMPORARY END RANGE FOR TESTINGS
    end_range = 2

    for comparison_block in np.arange(first_block_of_comparison, end_range, 1):

        print(
            "User Block "
            + str(user_block)
            + " vs Comparison Block "
            + str(comparison_block)
        )

        # Opening JSON file
        with open(
            "user_similarities/similarity_storage" + str(comparison_block) + ".json"
        ) as json_file:
            comparison_users_storage = json.load(json_file)

        comparison_indexes = block_indices_lookup[comparison_block]
        compare_start = comparison_indexes["Start"]
        compare_end = comparison_indexes["End"]

        print("Making matrices")
        start = time.time()

        # comparison chunk is transposed so that a @ b gives user-by-user scores
        array_chunk_b = ((matrix_array[compare_start:compare_end].T) / 10).astype(
            "float32"
        )

        a = tf.constant(array_chunk_a)
        b = tf.constant(array_chunk_b)

        # L2-normalize each base user (rows of a) ...
        normalize_a = tf.nn.l2_normalize(a, 1)
        del a
        gc.collect()

        # ... and each comparison user (columns of the transposed b)
        normalize_b = tf.nn.l2_normalize(b, 0)
        del b
        gc.collect()

        print("Getting similarity scores")
        # dot product of unit vectors = cosine similarity
        similarities = tf.matmul(normalize_a, normalize_b)  # , adjoint_b=True)
        del normalize_a
        del normalize_b
        gc.collect()

        # store user info

        # row of `similarities` belonging to the current base user
        incrementer_base = 0

        print("Storing Similarities")
        for base_user in user_blocks_lookup[user_block][:5]:

            print(base_user)

            user_similarities = similarities[incrementer_base].numpy()

            # Replace the single highest score with the median so it does not
            # dominate the min-max scaling below. The original took
            # np.argmax(user_similarities.max()), i.e. the argmax of a scalar, which
            # is always 0 and therefore overwrote the first entry instead.
            max_spot = np.argmax(user_similarities)
            mean_spot = np.median(user_similarities)
            user_similarities[max_spot] = mean_spot

            # rescale this user's similarities to the range [-1, 1]
            scaler = MinMaxScaler(feature_range=(-1, 1))
            user_similarities = scaler.fit_transform(user_similarities.reshape(-1, 1))
            user_similarities = list(np.round(user_similarities, 2).ravel())

            # NOTE(review): both sequences are sliced from incrementer_base, which
            # skips already-covered pairs within one block; confirm this is also
            # intended when user_block != comparison_block.
            for key, value in list(
                zip(
                    user_blocks_lookup[comparison_block][incrementer_base:],
                    user_similarities[incrementer_base:],
                )
            ):
                # only scores at least 0.25 away from zero are kept for the base user
                if value >= 0.25 or value <= -0.25:
                    base_users_storage[base_user][key] = float(value)
                # NOTE(review): this check is not nested under the threshold above,
                # so the comparison storage receives every score - confirm intent.
                if user_block != comparison_block:
                    comparison_users_storage[key][base_user] = float(value)

            incrementer_base += 1

            # save dictionary
            # NOTE(review): the comparison file is rewritten once per base user;
            # presumably a checkpoint, otherwise it could move after this loop.
            with open(
                "user_similarities/similarity_storage"
                + str(comparison_block)
                + ".json",
                "w",
            ) as convert_file:
                convert_file.write(json.dumps(comparison_users_storage))

        print("Cleaning up memory for this iteration")
        del comparison_users_storage
        # del similarities
        gc.collect()

        end = time.time()
        print(str(end - start) + " seconds elapsed for this comparison section")

    # save dictionary
    with open(
        "user_similarities/similarity_storage" + str(user_block) + ".json", "w"
    ) as convert_file:
        convert_file.write(json.dumps(base_users_storage))

    # del base_users_storage
    gc.collect()

## Deprecated Tensorflow time reduction attempts

In [None]:
# the basic file required for this work - the full matrix

larger_matrix = pd.read_pickle(
    "synthetic_ratings/users_synthetic_2193_sparsematrix_nogameids.pkl"
)

In [None]:
larger_matrix.info()

In [None]:
larger_matrix.head()

In [None]:
# Make sparse dataframe into numpy array

matrix_array = np.array(larger_matrix)

Turn a single user into a column vector, i.e. shape (21921, 1)

In [None]:
user_id = 3

In [None]:
# Get single user from matrix_array

%time single_user = matrix_array[user_id]
single_user.shape

In [None]:
# Get nonzero indices for user
%time indices = list(np.nonzero(single_user)[0])
indices

In [None]:
# make reduced array for user of nonzero indices
%time array_chunk_a = (single_user[indices]).astype('float32').reshape(-1,1)
array_chunk_a.shape

In [None]:
# normalize user
%time normalize_a = normalize(array_chunk_a, axis=0)
normalize_a

Investigate methods of reducing dataframe or array

In [None]:
# make reduced on sparse dataframe
%time df_chunk_b = larger_matrix[indices]

In [None]:
df_chunk_b.shape

In [None]:
df_chunk_b.info()

In [None]:
# make reduced on array
%time array_chunk_b = matrix_array[:, indices]

In [None]:
array_chunk_b.shape

In [None]:
# turn array into sparse matrix
sparse_matrix = sparse.csr_matrix(matrix_array)

In [None]:
# make reduced on sparse
%time array_chunk_b = sparse_matrix[:, indices]

Convert dataframe to array

In [None]:
# convert reduced dataframe to sparse matrix

%time sparse_array = sparse.csr_matrix(df_chunk_b.sparse.to_coo())

In [None]:
# convert reduced dataframe to array
%time array_b_matrix = df_chunk_b.to_numpy()

In [None]:
array_b_matrix[0][:10]

Investigate normalization methods

In [None]:
# sklearn normalize on dataframe
%time normalize_b = normalize(df_chunk_b, axis=1)

In [None]:
normalize_b[0]

In [None]:
# sklearn normalize on array
%time normalize_b = normalize(array_b_matrix, axis=1)

In [None]:
normalize_b[0][:10]

In [None]:
# make partial dataframe segment
%time partial_df = df_chunk_b[:134400]

In [None]:
# make partial array segment
%time partial_array = normalize_b[:134400]

## Deprecated Parallelization Techniques

### Test common indices

In [None]:
start = time.time()

# Step one
# make the single item matrix for the one item
%time single_item = matrix_array[:, item1]
# get the indices where the item is nonzero
%time indices = np.nonzero(single_item)[0]

# Step 2
# make the single item matrix for the next item
%time next_item = matrix_array[:, item2]
# get the indices where the item is nonzero
%time indices2 = np.nonzero(next_item)[0]

# step 3
# get the indices in common between the two
%time common_indices = list(set.intersection(set(indices), set(indices2)))

# step 4
%time reduced_item1 = single_item[common_indices]
%time reduced_item2 = next_item[common_indices]

# step 5
%time a = tf.constant(reduced_item1, dtype=tf.float32)
%time b = tf.constant(reduced_item2, dtype=tf.float32)
%time a = tf.nn.l2_normalize(a)
%time b = tf.nn.l2_normalize(b)

# step 6
%time item_similarity = 1-cosine_distance(a, b, axis=0).numpy()

end = time.time()

print(end - start)

print(item_similarity)

In [None]:
reduced_item1

In [None]:
reduced_item2

### Test xor1d indices

In [None]:
start = time.time()

# Step one
# make the single item matrix for the one item
%time single_item = matrix_array[:, item1].copy()
# get the indices where the item is nonzero
%time indices = np.nonzero(single_item)[0]

# Step 2
# make the single item matrix for the next item
%time next_item = matrix_array[:, item2].copy()
# get the indices where the item is nonzero
%time indices2 = np.nonzero(next_item)[0]

# step 3
# get the indices that are nonzero for exactly one of the two items (symmetric difference)
%time indices_diff = np.setxor1d(indices, indices2)

%time first_item = single_item.copy()
# step 4
%time first_item[[indices_diff]]=0
%time next_item[[indices_diff]]=0

# step 5
%time a = tf.constant(first_item, dtype=tf.float32)
%time b = tf.constant(next_item, dtype=tf.float32)
%time a = tf.nn.l2_normalize(a)
%time b = tf.nn.l2_normalize(b)

# step 6
%time item_similarity = 1-cosine_distance(a, b, axis=0).numpy()

end = time.time()

print(end - start)

print(item_similarity)

### No Jit and xor1d indices

In [None]:
# @jit(nopython=True, parallel=True, fastmath=True)
def math_function(game, matrix_array, number_of_games):
    """
    Pair one item's column with every requested item's column, masked to shared rows.

    For each column index in number_of_games, a copy of the base item's column and a
    copy of the other item's column are produced in which every row that is nonzero
    for only one of the two items is set to 0. The similarity calculation itself is
    not performed here; only the masked vectors are returned.

    Inputs:
    game: column index of the base item in matrix_array
    matrix_array: 2-D numpy array indexed as [row, item column]
    number_of_games: iterable of column indices to pair with the base item

    Returns:
    (results_a, results_b): two lists of 1-D arrays, one entry per column index;
        results_a holds the masked base item, results_b the masked other item.
        matrix_array itself is not modified.
    """
    # base item's column, copied so the matrix is never written to
    single_item = matrix_array[:, game].copy()
    base_is_rated = single_item != 0

    left_vectors = []
    right_vectors = []

    for game2 in number_of_games:

        next_item = matrix_array[:, game2].copy()

        # rows where exactly one of the two items is nonzero
        rated_by_one_only = base_is_rated ^ (next_item != 0)

        # blank those rows out in both vectors so only shared rows remain
        first_item = single_item.copy()
        first_item[rated_by_one_only] = 0
        next_item[rated_by_one_only] = 0

        left_vectors.append(first_item)
        right_vectors.append(next_item)

    return left_vectors, right_vectors

In [None]:
number_of_games = np.arange(0, matrix_array.shape[1], 1)

global_start = time.time()

# Load the storage dictionary for this block
with open("item_similarities/similarity_storage_real_scaled_temp.json") as json_file:
    base_items_storage = json.load(json_file)

# for each of the first 100 games (column positions in matrix_array)
for game in number_of_games[0:100]:

    print("\nStarting game: " + str(game))
    start = time.time()

    gameid_1 = gameids_columnorder[game]

    results_a, results_b = math_function(game, matrix_array, number_of_games)

    # base_items_storage[gameid_1]['Sims'] = results
    base_items_storage[gameid_1]["Left"] = results_a
    base_items_storage[gameid_1]["Right"] = results_b

    end = time.time()

    print(end - start)

print(time.time() - global_start)

- No GPU, no filtering: 2572 sec for 97 entries
- No GPU, filtering < 4: 2453 sec for 97 entries