# Notebook Objective and Setup

BGG06 is where synthetic ratings are produced for each user, using the content-based item filter from BGG05.

## Package Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import copy
import json
from statistics import mean

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

#from scipy import sparse
#from scipy.sparse import csr_matrix
#from scipy import spatial

#from sklearn.metrics.pairwise import cosine_similarity
#import sklearn.preprocessing as pp
from sklearn.preprocessing import MinMaxScaler#, OneHotEncoder, StandardScaler, PolynomialFeatures, 

In [None]:
import tensorflow as tf
from tensorflow.compat.v1.losses import cosine_distance

## Notebook Functions

In [None]:
def produce_synthetic_ratings_all(user, num_ratings_create, game_ids):
    '''
    Synthesizes ratings for one user by walking the item-similarity lists outward
    from the items the user actually rated.

    Each pass visits every item currently held for the user (real + synthetic) and,
    for each one, adds its most similar comp that the user does not hold yet, rated as
    (rating of the source item * similarity). Passes repeat until at least
    num_ratings_create items are held or a pass adds nothing new. A pass always runs
    to completion, so the final count can exceed num_ratings_create.

    Reads the module-level user_ratings, users_means and game_comps_byid_lookup.
    Writes every rating (real and synthetic; mean-centered, multiplied by 10 and
    truncated to int) into the module-level synthetic_users_dictionary[user], which
    must already exist.

    Inputs:
    user: the user id to create ratings for
    num_ratings_create : simple number. Minimum # of total ratings to reach in the run.
    game_ids: unused; kept so existing callers keep working

    Outputs:
    users_comp_dict : dictionary keyed by item id. Each value is
        [overall confidence, similarity to source item, source item, iteration,
         degrees away from a real rating, mean-centered rating]
    temp_users_dictionary : dictionary of item id -> mean-centered rating (real + synthetic)
    '''

    start = time.time()

    user_mean = users_means[user]

    # mean-center the user's real ratings; item ids are stored as ints
    temp_users_dictionary = {}
    for item, rating in user_ratings[user].items():
        this_rating = round((rating - user_mean), 1)
        temp_users_dictionary[int(item)] = this_rating
        synthetic_users_dictionary[user][int(item)] = int(this_rating*10)

    # start at iteration 0
    iteration = 0

    # seed the comps with the user's real items: full confidence, zero degrees away
    users_comp_dict = {}
    for item, rating in temp_users_dictionary.items():
        users_comp_dict[item] = [1, 1, item, 0, 0, rating]

    # never look more than this many comps deep for a single item
    max_position = 10000

    while len(temp_users_dictionary) < num_ratings_create:

        start_set_length = len(temp_users_dictionary)

        # snapshot: items added during this pass are only expanded in the next pass
        users_rated_items = list(temp_users_dictionary.keys())

        iteration += 1

        for rated in users_rated_items:

            rated_rating = temp_users_dictionary[rated]

            comp_ids = game_comps_byid_lookup[rated][0]
            comp_sims = game_comps_byid_lookup[rated][1]

            # bound the search by the real list length, so a short comp list can
            # never be indexed past its end
            search_limit = min(len(comp_ids), max_position)

            # skip comps the user already holds (this includes items added earlier in this pass)
            current_position = 0
            while (current_position < search_limit
                   and comp_ids[current_position] in temp_users_dictionary):
                current_position += 1

            # this item has no unused comps left: move on to the next item instead of
            # abandoning the rest of the pass
            if current_position >= search_limit:
                continue

            current_comp = comp_ids[current_position]
            comp_similarity = comp_sims[current_position]

            # synthetic rating = rating of the base item * similarity
            synthetic_rating = round((rated_rating * comp_similarity), 1)

            # confidence = confidence of prior item * similarity of current item
            confidence = users_comp_dict[rated][0] * comp_similarity
            degrees = users_comp_dict[rated][4] + 1

            users_comp_dict[current_comp] = [confidence, comp_similarity, rated, iteration, degrees, synthetic_rating]

            temp_users_dictionary[current_comp] = synthetic_rating

            synthetic_users_dictionary[user][current_comp] = int(synthetic_rating*10)

        # a pass that added nothing means every comp list is exhausted
        if start_set_length == len(temp_users_dictionary):
            break

    end = time.time()
    print(str(end-start)+' seconds for user.\n')

    return users_comp_dict, temp_users_dictionary

In [None]:
def sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings):
    '''
    Keeps only the highest-confidence synthetic ratings for a user.

    Builds a dataframe from the user's comps dict, sorts it by overall confidence
    (highest first), drops fully duplicated rows, then takes the rows from position
    original_num_ratings up to (not including) position desired_ratings.
    For each item kept, logs the synthetic rating to the user's dictionary.
    Use this one when you want only exactly x ratings and don't want to necessarily
    keep everything produced.

    Inputs:
    user: specific user to sort
    synthetic_users_dictionary: dictionary of synthesized items; synthetic_users_dictionary[user]
        must already exist and is updated in place
    user_comps_dict: dictionary of item id -> [overall confidence, similarity to source item,
        source item, iteration, degrees away, rating]
    original_num_ratings: The number of ratings the user actually rated
    desired_ratings: the total number of ratings (real + synthetic) wanted for the user

    Outputs:
    None
    '''

    # six names, one per element of each comps row ('Iteration' is the fourth element)
    column_names = ['OverallConfidence', 'SimtoLast', 'RecFrom', 'Iteration', 'DegreesAway', 'SyntheticRating']

    user_comps_df = pd.DataFrame(
        list(user_comps_dict.values()),
        index=list(user_comps_dict.keys()),
        columns=column_names,
    ).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')

    # get the ratings to keep (past the real ratings); positional slice
    kept = user_comps_df.iloc[original_num_ratings:desired_ratings]

    # for each item that we keep, add the rating to the real storage dictionary
    for item, rating in kept['SyntheticRating'].items():
        synthetic_users_dictionary[user][item] = rating
    

In [None]:
def populate_all_ratings(user, synthetic_users_dictionary, temp_users_dictionary, users_comp_dict=None, iteration=0):
    '''
    Fills in a rating of 0 for every game the user has no rating for, then copies
    the complete set of ratings into the user's entry of the storage dictionary
    in ascending item-id order.

    Reads the module-level game_ids list for the full set of games.

    Inputs:
    user: specific user to populate
    synthetic_users_dictionary: dictionary of synthesized items; synthetic_users_dictionary[user]
        must already exist and is updated in place
    temp_users_dictionary: dictionary of item id -> rating for the user; updated in place
        with the zero-filled games
    users_comp_dict: optional comps dictionary; when given, each zero-filled game gets a
        placeholder row [0, 0, 0, iteration, 0, 0]
    iteration: iteration number written into the placeholder rows

    Outputs:
    None
    '''

    not_rated = list(set(game_ids) - set(temp_users_dictionary.keys()))
    print(str(len(not_rated))+" games were not rated")

    for item in not_rated:
        temp_users_dictionary[item] = 0
        # the comps dict used to be read from an undefined name; it is now optional
        if users_comp_dict is not None:
            users_comp_dict[item] = [0, 0, 0, iteration, 0, 0]

    print("End length of rated items is "+str(len(temp_users_dictionary)))

    # add every rating to the real storage dictionary, lowest item id first
    for item in sorted(temp_users_dictionary):
        synthetic_users_dictionary[user][item] = temp_users_dictionary[item]
    

## Required Data Load

In [None]:
# read games for game_ids
games = pd.read_pickle('data_cleaned_new_scraper/games.pkl')
game_ids = list(games['BGGId'])

In [None]:
# Read cosine similarity pickle
sims_byid = pd.read_pickle('data_cleaned_new_scraper/game_cosine_similarity_byid.pkl')

In [None]:
# Opening JSON file
with open('data_cleaned_new_scraper/user_means.json') as json_file:
    users_means = json.load(json_file)

In [None]:
# Load every user's ratings just long enough to capture the order of the user names
with open('data_cleaned_new_scraper/user_ratings.json') as json_file:
    user_ratings = json.load(json_file)

all_users = list(user_ratings)

# Six fixed blocks of 40,000 users each; the seventh block takes everyone who is left
user_block_1, user_block_2, user_block_3, user_block_4, user_block_5, user_block_6 = (
    all_users[lower:lower + 40000] for lower in range(0, 240000, 40000)
)
user_block_7 = all_users[240000:]

user_blocks = [
    user_block_1,
    user_block_2,
    user_block_3,
    user_block_4,
    user_block_5,
    user_block_6,
    user_block_7,
]

# only the names are needed from here on, so release the ratings themselves
del user_ratings
gc.collect()

In [None]:
# dictionary of game IDs-Names

games = pd.read_pickle('data_cleaned_new_scraper/games.pkl')

# parallel lists: the id and the name at the same position belong to the same game
game_ids = list(games['BGGId'])
game_names = list(games['Name'])

# BGGId -> Name lookup
game_id_lookup = dict(zip(game_ids, game_names))

# the dataframe is not needed once the lists and the lookup exist
del games
gc.collect()

In [None]:
len(game_id_lookup)

In [None]:
# get the 5000 most similar games for each game and store in dictionary
# game_comps_byid_lookup[game] = [list of comp ids, list of matching similarities],
# both ordered from most similar to least similar

game_comps_byid_lookup = {}

for item in sims_byid.columns:
    # sort descending, skip the first entry (presumably the game's similarity to
    # itself), and keep the next 5000
    ranked = sims_byid[item].sort_values(ascending=False).iloc[1:5001]
    comps_index = list(ranked.index.astype('int32'))
    comps_similarity = list(ranked)
    game_comps_byid_lookup[item] = [comps_index, comps_similarity]

In [None]:
del sims_byid
gc.collect()

In [None]:
#with open('game_comps_byid_lookup.json', 'w') as convert_file:
#     convert_file.write(json.dumps(game_comps_byid_lookup))

In [None]:
# Opening JSON file
#with open('game_comps_byid_lookup.json') as json_file:
#    game_comps_byid_lookup = json.load(json_file)

In [None]:
# number of synthetic ratings to produce
#num_ratings_create = 200

# number of ratings we will end up using
#desired_ratings = 50

# Produce Synthetic Ratings

## Test One User

In [None]:
# Opening JSON file
with open('data_cleaned_new_scraper/user_ratings.json') as json_file:
    user_ratings = json.load(json_file)

In [None]:
user = 'Monika1234'
user_mean = users_means[user]

In [None]:
user = 'Torsten'
user_mean = users_means[user]

In [None]:
# one row per game the selected user really rated: BGGId, Rating, and the game's name
this_user = (
    pd.Series(user_ratings[user], name='Rating')
    .rename_axis('BGGId')
    .reset_index()
)
# ids arrive as strings from the JSON file; the name lookup is keyed by integer id
this_user['Game'] = this_user['BGGId'].astype('int32').map(game_id_lookup)
this_user.sort_values('Game', ascending=True).head(30)

In [None]:
len(game_ids)

In [None]:
# number of synthetic ratings to produce
num_ratings_create = 2500

# number of ratings we will end up using
desired_ratings = 2500

In [None]:
# drop any dictionary left over from an earlier run; unlike a bare `del`, this does
# not raise NameError when the name has not been defined yet in this session
globals().pop('synthetic_users_dictionary', None)
gc.collect()

In [None]:
# set up a synthetic ratings dictionary to store the users and ratings
synthetic_users_dictionary = {}
synthetic_users_dictionary[user] = {}

In [None]:
print("Starting user "+user)

# call function to produce synthetic ratings
user_comps_dict, temp_users_dictionary  = produce_synthetic_ratings_all(user, num_ratings_create, game_ids) 

In [None]:
# Show the user's 100 highest ratings (real + synthetic) on the original rating scale.
# One row per item id; column 0 holds the stored value.
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
# translate item ids to game names
temp2['Game'] = temp2.index.map(game_id_lookup)
# undo the storage encoding: divide by 10 and add the user's mean back
temp2['Rating'] = (temp2[0]/10)+user_mean
# keep only the Game and Rating columns
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
# Tabulate every comp produced for the user, highest overall confidence first,
# with fully duplicated rows removed.
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'Iteration', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')

# add the user's mean back so ratings read on the original scale
user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
# name of the recommended item (the row's index) and of the item it was recommended from
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
user_comps_df.sort_values('SyntheticRating', ascending=False).head(30)

In [None]:
user_comps_df.info()

In [None]:
# Scatter of the user's ratings against how many similarity steps separate each
# one from a real rating, with the user's mean drawn as a horizontal line.
# NOTE(review): this cell needs matplotlib.pyplot as `plt` and seaborn as `sns`;
# neither import is visible in this notebook's import cells - confirm they are loaded.
fig, ax = plt.subplots(figsize=(20,10))

sns.set(font_scale = 1.5) # set our font scale bigger for this vis

# scatter our data
sns.set_style('darkgrid')
scatter2 = sns.scatterplot(x="DegreesAway", y='SyntheticRating', data=user_comps_df, 
                           hue='DegreesAway', palette='viridis', s=100)
ax.axhline(user_mean)
ax.text(x=.5, y=(user_mean+.2), s='User Mean '+str(user_mean), alpha=0.7, color='black')

ax.get_legend().remove()

plt.title(str(desired_ratings)+" Synthetic Ratings for a 10-Rating User", fontsize=30)
plt.xlabel("Steps Away from True Rating", fontsize=20)
plt.ylabel("Rating", fontsize=20)

# tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()
#plt.savefig('images/synthetic_from10.png')
plt.show()

In [None]:
del synthetic_users_dictionary
del user_comps_df
del temp_users_dictionary
del this_user
del user_ratings
del user_comps_dict

gc.collect()

## Process ALL Users

In [None]:
len(game_ids)

In [None]:
# number of synthetic ratings to produce
num_ratings_create = 250

# number of ratings we will end up using
desired_ratings = 250

In [None]:
# Produce and save synthetic ratings one user block at a time, so only one block's
# real ratings and one block's results are held in memory at once.
block_marker = 0

for block_marker, block in enumerate(user_blocks, start=1):

    # real ratings for just the users in this block
    ratings_path = 'data_cleaned_new_scraper/user_ratings_block_'+str(block_marker)+'.json'
    with open(ratings_path) as json_file:
        user_ratings = json.load(json_file)

    # fresh store for this block; produce_synthetic_ratings_all fills it in as a global
    synthetic_users_dictionary = {}

    user_count = 0

    for user_count, user in enumerate(block, start=1):
        print(user)

        synthetic_users_dictionary[user] = {}

        print("Starting user "+str(user_count))

        # the returned dictionaries are not used here; only the global store is kept
        user_comps_dict, temp_users_dictionary = produce_synthetic_ratings_all(user, num_ratings_create, game_ids)

        #sort_synthetic_ratings(user, synthetic_users_dictionary, temp_users_dictionary)

        del user_comps_dict
        del temp_users_dictionary
        #gc.collect()

    # save this block's dictionary
    results_path = 'synthetic_ratings/users_synthetic_250_'+str(block_marker)+'.json'
    with open(results_path, 'w') as convert_file:
        json.dump(synthetic_users_dictionary, convert_file)

    del synthetic_users_dictionary
    gc.collect()

In [None]:
del user_ratings
del game_comps_byid_lookup
del user_blocks

gc.collect()

# Produce Matrices

In [None]:
larger_matrix = pd.DataFrame()

In [None]:
# Stack the seven saved blocks of synthetic ratings into one users-by-games matrix.
for append in range(1, 8):
    
    print("Opening file "+str(append))
    with open('synthetic_ratings/users_synthetic_250_'+str(append)+'.json') as json_file:
        set_of_ratings = json.load(json_file)
        
    # the dictionary is keyed by user, so transpose to get one row per user
    print("Converting file to DF")
    matrix = pd.DataFrame(set_of_ratings).T

    print("Clearing memory")
    del set_of_ratings
    gc.collect()

    # games a user has no rating for become 0
    print("Filling NaN")
    matrix.fillna(0, inplace=True)
    
    print("Converting to Int8")
    matrix = matrix.astype('int8') 
    
    #print("Converting to sparse")
    #matrix_sparsed = matrix.astype(pd.SparseDtype("float32"))
    
    print("Adding to larger DF")
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    larger_matrix = pd.concat([larger_matrix, matrix])
           
    print(larger_matrix.shape)

In [None]:
# zero out any remaining NaN in the stacked matrix, then store it as int8
larger_matrix = larger_matrix.fillna(0).astype('int8')
#larger_matrix = larger_matrix.astype(pd.SparseDtype("float32"))

In [None]:
larger_matrix.head()

In [None]:
larger_matrix.info()

In [None]:
larger_matrix.to_pickle('synthetic_ratings/users_synthetic_250_fullmatrix.pkl')

# Similarity Calculations - Tensorflow (GPU only)

In [1]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import copy
import json
from statistics import mean

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

from scipy import sparse
from scipy.sparse import csr_matrix
from scipy import spatial

#from sklearn.metrics.pairwise import cosine_similarity
#import sklearn.preprocessing as pp
#from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, normalize

import tensorflow as tf
from tensorflow.compat.v1.losses import cosine_distance

In [2]:
# the basic file required for this work - the full matrix

larger_matrix = pd.read_pickle('synthetic_ratings/users_synthetic_1k_fullmatrix.pkl')

In [3]:
# convert full matrix to numpy and delete matrix

matrix_array = larger_matrix.to_numpy()

## Preparing the user blocks and user storage dictionaries

In [4]:
users_list = list(larger_matrix.index)
total_users = matrix_array.shape[0]

# 20 roughly equal blocks; the last one may be shorter
chunk_size = int(np.ceil(total_users/20))

# block number (starting at 1) -> the user names in that block
user_blocks_lookup = {}
incrementer = 0
start = 0

while start < total_users:
    incrementer += 1
    end = start + chunk_size
    user_blocks_lookup[incrementer] = users_list[start:end]
    start = end

print("\nLookup dictionary complete")


Lookup dictionary complete


In [5]:
# 1-based position in users_list -> user name
user_id_lookup = {}

increment = 0
for increment, user in enumerate(users_list, start=1):
    user_id_lookup[increment] = user

In [6]:
# block number (starting at 1) -> {'Start': first row, 'End': one past the last row}
block_indices_lookup = {}

total_rows = matrix_array.shape[0]
start = 0
incrementer = 0

while start < total_rows:
    
    # clamp so the final block never claims rows past the end of the matrix
    end = min(start + chunk_size, total_rows)
    incrementer += 1
    
    block_indices_lookup[incrementer] = {'Start': start, 'End': end}
    
    start += chunk_size

print("\nLookup dictionary complete")


Lookup dictionary complete


In [7]:
del larger_matrix
gc.collect()

85

ONLY RUN THIS AGAIN IF THE USER LIST CHANGES !!!!!  THIS WILL RESET ALL STORAGE DICTIONARIES ON DISK

In [8]:
'''max_range = len(user_blocks_lookup)+1

for item in np.arange(1,max_range,1):
    
    storage_dict = {}
    
    for user in user_blocks_lookup[item]:
        storage_dict[user] = {}
    
    # save dictionary
    with open('user_similarities/similarity_storage_1k_'+str(item)+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(storage_dict))
    
    del storage_dict'''

##### Files:

- user_blocks_lookup: dict in the format dict[file_append] = [list of users in block]
- user_id_lookup: dict in the format dict[user_id] = username, where user_id is the user's 1-based position in the user list
- block_indices_lookup: dict in the format dict[file_append] = {'Start': start index, 'End': end index}
- storage dictionaries: located at 'user_similarities/similarity_storage_1k_'+str(file_append)+'.json'
- matrix_array: numpy array, which must be indexed by position

#### Calculation Steps

- Set up matrix blocks a and b
    - b should already be transposed
- Load matrices into tensors a and b
- Normalize tensor a on axis 1 and tensor b on axis 0 (b is transposed), and del variables as they are used
- Do matmul on the normed a and b
- Profit

In [9]:
# TEMP STUFF

block_indices_lookup = {}

start = 0
incrementer = 0


end = start + chunk_size
    
block_indices_lookup[1] = {}
block_indices_lookup[1]['Start'] = start
block_indices_lookup[1]['End'] = end

print("\nLookup dictionary complete")


Lookup dictionary complete


In [10]:
# Split the matrix into two row ranges and transpose each, so the original rows
# (which appear to be users - confirm) become columns.
# NOTE(review): the 134000 split point is hard-coded; presumably chosen so that each
# half fits in memory - confirm before running on a matrix of a different size.
precompute_matrix = matrix_array[:134000].T
precompute_matrix2 = matrix_array[134000:].T

In [11]:
def process_user_block(array_chunk_a, matrix, indices):
    '''
    Computes the cosine similarity between one row vector and every column of a
    row-subset of matrix.

    Inputs:
    array_chunk_a: 2-D array with a single row (the caller passes one user's nonzero ratings)
    matrix: 2-D array, indexable as matrix[indices, :]
    indices: row positions of matrix to keep; they must line up one-to-one with the
        columns of array_chunk_a

    Outputs:
    user_similarities: numpy array reshaped to a single column, holding one similarity
        for each column of matrix
    '''

    # keep only the rows that match the entries present in array_chunk_a
    array_chunk_b = matrix[indices, :]

    a = tf.constant(array_chunk_a, dtype=tf.float32)
    b = tf.constant(array_chunk_b, dtype=tf.float32)

    # unit-normalize a along its row and b along each column, so that the matrix
    # product below is the cosine similarity
    a = tf.nn.l2_normalize(a, 1)
    b = tf.nn.l2_normalize(b, 0)

    similarities = tf.matmul(a, b)

    user_similarities = similarities.numpy().reshape(-1,1)

    return user_similarities

In [12]:
# For every user in every block: compute the similarity to all other users, rescale it
# to [-1, 1], and store only the strongest positive and negative matches in the block's
# storage dictionary on disk.
# per-user elapsed times; never reset, so the printed average covers all blocks so far
time_test = []

# for each user block in the block_indices_lookup. The user blocks are integers from 1-20
for user_block in block_indices_lookup:
       
    print("Starting block "+str(user_block))
    
    # Get the start and end indexes for the block
    starting_block_indexes = block_indices_lookup[user_block]
    base_start = starting_block_indexes['Start'] # starting user
    base_end = starting_block_indexes['End'] # ending user
    
    # Load the storage dictionary for this block
    with open('user_similarities/similarity_storage_1k_'+str(user_block)+'.json') as json_file:
        base_users_storage = json.load(json_file) 
    
    # only do the user ids in this block, then save to the file
    # NOTE(review): the upper bound is hard-coded to 1001 and base_end is unused; this
    # looks like a temporary test limit - confirm before a full run
    for user_id in np.arange(base_start, 1001, 1):#base_end, 1):
        print(user_id)
        
        # user_id is a 0-based row position; user_id_lookup is keyed from 1
        user_name = user_id_lookup[user_id+1]
        #print(user_name)
    
        # log start time
        #print("Making matrices")
        start = time.time()
               
        # make the single user matrix for the one user
        single_user = matrix_array[user_id].reshape(1,-1)
        # get the indices where the user is nonzero
        indices = list(np.nonzero(single_user)[1])
        # make the user with only the nonzero indices
        array_chunk_a = (single_user[:, indices])#.astype('float32')
        #normalize_a = normalize(array_chunk_a, axis=1)
        checkpoint = time.time()
        #print(str(checkpoint-start)+" Processed single user")
        
        #a = tf.constant(array_chunk_a)
        #a = tf.nn.l2_normalize(a, 1)
        
        # the comparison matrix is held in two halves, so compare against each in turn
        #process_user_block(a, precompute_matrix, indices)
        user_similarities_1 = process_user_block(array_chunk_a, precompute_matrix, indices)
        user_similarities_2 = process_user_block(array_chunk_a, precompute_matrix2, indices)
        #user_similarities = process_user_block(array_chunk_a, precompute_matrix, indices)
        
        
        checkpoint3 = time.time()
        # join the two halves into one flat vector
        user_similarities = np.append(user_similarities_1, user_similarities_2)   
        # overwrite the single highest score with the median (note: mean_spot holds a
        # median, not a mean) - presumably this neutralises the user's match with
        # themself before scaling; confirm
        max_spot = np.argmax(user_similarities)
        mean_spot = np.median(user_similarities)
        user_similarities[max_spot] = mean_spot
        # rescale this user's scores so they span exactly -1 to 1
        scaler = MinMaxScaler(feature_range=(-1,1))
        user_similarities = scaler.fit_transform(user_similarities.reshape(-1,1)).ravel()
        #user_similarities = list(np.round(user_similarities, 2).ravel())
        
        checkpoint4 = time.time()
        #print(str(checkpoint4-checkpoint3)+" Processed/Scaled Similarity scores") 
        
        # positions scoring at least .6 or at most -.6 (the names say 75, the cutoff is .6)
        over75 = list((user_similarities >= .6).nonzero()[0])
        under75 = list((user_similarities <= -.6).nonzero()[0])
        all_comps = over75 + under75
        
        # stored under the other user's position in the similarity vector, not their name
        for item in all_comps:
            item = int(item)
            base_users_storage[user_name][item] = float(user_similarities[item])
          
        checkpoint5 = time.time()
        #print(str(checkpoint5-checkpoint4)+" Stored scores in dictionary\n")
        
        end = time.time()
        elapsed = end-start
        #print(str(elapsed)+' seconds elapsed for this user\n\n')
        time_test.append(elapsed)
    
    print("Saving dictionary for this set of users")
    # save dictionary
    with open('user_similarities/similarity_storage_1k_'+str(user_block)+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(base_users_storage))
    
    avg_time = mean(time_test)
    print("Average time per user: "+str(avg_time))
    
    del base_users_storage
    gc.collect()

Starting block 1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
2

In [None]:
base_users_storage['cfarrell']

In [None]:
len(base_users_storage['Torsten'])

# Make User Means Dict

In [3]:
# Opening JSON file
with open('data_cleaned/user_ratings.json') as json_file:
    user_ratings = json.load(json_file)

In [4]:
len(user_ratings)

263503

In [5]:
user_means = {}

In [6]:
# mean rating per user, rounded to one decimal place
for person, ratings in user_ratings.items():
    user_items = list(ratings.values())
    user_mean = round(mean(user_items), 1)
    user_means[person] = user_mean

In [7]:
user_means['Threnody']

7.1

In [8]:
user_means['moosh21']

6.0

In [9]:
user_means['Shade92008']

6.7

In [10]:
user_means['Torsten']

6.8

In [11]:
# save dictionary
with open('data_cleaned/user_means.json', 'w') as convert_file:
    convert_file.write(json.dumps(user_means))

In [None]:
del user_means
gc.collect()

In [3]:
# Opening JSON file
with open('data_cleaned/user_means.json') as json_file:
    user_means_dict = json.load(json_file)

In [11]:
user_means = pd.DataFrame.from_dict(user_means_dict, orient='index')
user_means.rename(columns={0:'Mean'}, inplace=True)
user_means.head()

Unnamed: 0,Mean
Torsten,6.8
mitnachtKAUBO-I,6.6
avlawn,6.4
Mike Mayer,6.7
Mease19,7.0


In [None]:
# save the user means dataframe (the previous file name was a garbled fragment)
user_means.to_pickle('data_cleaned/user_means.pkl')

# Make Ratings Block Sets

In [10]:
# Opening JSON file
with open('data_cleaned/user_ratings.json') as json_file:
    user_ratings = json.load(json_file)

## Make scaled ratings

In [11]:
# every rating re-expressed as its distance from that user's own mean, to 2 decimals
user_ratings_scaled = {}

for person, ratings in user_ratings.items():
    user_mean = mean(ratings.values())
    user_ratings_scaled[person] = {
        item: round((rating - user_mean), 2) for item, rating in ratings.items()
    }

In [12]:
# save dictionary
with open('real_ratings/real_user_ratings_dictionary_scaled.json', 'w') as convert_file:
    convert_file.write(json.dumps(user_ratings_scaled))

## Make smaller ratings blocks

In [5]:
all_users = list(user_ratings.keys())

In [6]:
len(all_users)

263503

In [7]:
# Six fixed blocks of 40,000 users each; the seventh block takes everyone who is left
user_block_1, user_block_2, user_block_3, user_block_4, user_block_5, user_block_6 = (
    all_users[lower:lower + 40000] for lower in range(0, 240000, 40000)
)
user_block_7 = all_users[240000:]

user_blocks = [
    user_block_1,
    user_block_2,
    user_block_3,
    user_block_4,
    user_block_5,
    user_block_6,
    user_block_7,
]

In [8]:
# Write one JSON file per user block, holding each user's ratings re-expressed as
# the distance from that user's own mean (2 decimals).
iteration = 0

for block in user_blocks:
    
    iteration += 1
    
    print("Starting block "+str(iteration))
    
    # Build the scaled block straight from the block's user list. Looking each user up
    # by key avoids scanning the whole block list once per user in user_ratings, and
    # building new inner dictionaries leaves the raw ratings in user_ratings untouched.
    block_of_users = {}
    for person in block:
        if person not in user_ratings:
            continue
        person_ratings = user_ratings[person]
        user_mean = mean(person_ratings.values())
        block_of_users[person] = {
            item: round((rating - user_mean), 2)
            for item, rating in person_ratings.items()
        }
    
    # save dictionary
    with open('data_cleaned/user_ratings_block_scaled_'+str(iteration)+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(block_of_users))
        
    del block_of_users
    gc.collect()

Starting block 1
Starting block 2
Starting block 3
Starting block 4
Starting block 5
Starting block 6
Starting block 7


In [9]:
del user_blocks
del user_ratings
gc.collect()

21

# Deprecated

In [None]:
# make dataframe from synthetic sort and melt to longform
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary)
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={'index':'BGGId', user:'Rating'}, inplace=True)
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean
    
    
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary).T
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synthetic_user_ratings_long = synthetic_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synthetic_user_ratings_long.sort_values('UserID', inplace=True)
synthetic_user_ratings_long
    
# save longform
synthetic_user_ratings_long.to_pickle('synthetic_ratings_new_scraper/synthetic_ratings_'+path+'_'+number+'.pkl')


In [None]:
def produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create):
    '''
    Grows a user's ratings with synthetic ratings taken from game_comps_byid_lookup.

    Every pass visits each item currently in temp_users_dictionary, takes the first comp of
    that item (in lookup order) that has no entry yet, and records it with
    rating = source rating * similarity. Passes repeat until num_ratings_create entries
    exist or a whole pass adds nothing new.

    The length of each comp list is read from the lookup itself instead of a fixed number,
    an exhausted source item is skipped instead of ending the pass, and the function stops
    when no source item can supply a new comp (previously it would loop forever).

    Inputs:
    user: the user id to create ratings for (not used inside the function)
    temp_users_dictionary: dictionary {item: rating} of the user's ratings.
        It is updated in place with every synthetic rating produced.
    num_ratings_create : simple number. Total entries (real + synthetic) to reach.

    Outputs:
    users_comp_dict : dictionary {item: [overall confidence, similarity to source item,
        source item, iteration, degrees away, rating]}; the user's own items are stored
        as [1, 1, item, 0, 0, rating]
    '''

    print("Producing items for user")

    # start at iteration 0
    iteration = 0

    # populate the comps with the user's baseline items
    users_comp_dict = {}
    for item in temp_users_dictionary:
        users_comp_dict[item] = [1, 1, item, 0, 0, temp_users_dictionary[item]]

    while len(users_comp_dict) < num_ratings_create:

        # snapshot: items added during this pass only become sources in the next pass
        users_rated_items = list(temp_users_dictionary.keys())

        iteration += 1

        new_items = []  # items added during this pass

        for rated in users_rated_items:

            print("\nCurrent item: "+str(rated))
            rated_rating = temp_users_dictionary[rated]
            print(rated_rating)

            comp_ids = game_comps_byid_lookup[rated][0]
            comp_similarities = game_comps_byid_lookup[rated][1]

            # first comp without an entry; items added earlier in this pass already have one
            current_position = next(
                (position for position, comp in enumerate(comp_ids) if comp not in users_comp_dict),
                None,
            )

            # every comp of this item is already present: move on to the next source item
            if current_position is None:
                continue

            current_comp = comp_ids[current_position]
            comp_similarity = comp_similarities[current_position]
            print(current_position)
            print(comp_similarity)

            # synthetic rating = rating of the base item * similarity
            synthetic_rating = rated_rating * comp_similarity
            print(synthetic_rating)

            # confidence = confidence of prior item * similarity of current item
            confidence = users_comp_dict[rated][0] * comp_similarity
            degrees = users_comp_dict[rated][4] + 1

            new_items.append(current_comp)
            users_comp_dict[current_comp] = [confidence, comp_similarity, rated, iteration, degrees, synthetic_rating]

            # the synthetic item becomes a source item for later passes
            temp_users_dictionary[current_comp] = synthetic_rating

        # nothing could be added: further passes would find nothing either
        if not new_items:
            break

    print("End length of rated items is "+str(len(users_comp_dict))+'\n')

    return users_comp_dict

In [None]:
#user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_03.pkl')
#user_matrix = user_matrix.T
#user_matrix.index = user_matrix.index.astype('int32')

In [None]:
# run the data synthesizer for each of the 6 ratings matrix files
# NOTE(review): no `process_to_synthetic` is defined in the visible part of this notebook; the
# function taking this argument list is `process_matrix_to_synthetic` — confirm the intended name.
# `item` is presumably the matrix file suffix supplied by an enclosing loop — TODO confirm.
process_to_synthetic(item, num_ratings_create, desired_ratings, game_ids, '250')

In [None]:
def get_user(user_items, user, game_ids):
    '''
    Builds the mean-centred ratings dictionary for one user.

    The mean is taken over every rating in user_items and subtracted from each rating;
    only the games that also appear in game_ids are kept in the result.

    Inputs:
    user_items: the user's ratings, indexed by game id
    user: user to retrieve (not used by the function)
    game_ids: the game_ids we are using in our recommender

    Outputs:
    overall_user: dictionary {game_id: rating - user mean} restricted to game_ids
    '''

    # ratings with the user's mean removed, in the same order as the index
    centred_ratings = list(user_items - user_items.mean())
    rated_ids = list(user_items.index)

    # ids that are both rated by the user and usable in the recommender
    usable_ids = set(game_ids).intersection(set(rated_ids))

    centred_by_id = dict(zip(rated_ids, centred_ratings))

    return {game_id: centred_by_id[game_id] for game_id in usable_ids}

In [None]:
def process_matrix_to_synthetic(path, num_ratings_create, desired_ratings, game_ids, number):
    '''
    Creates synthetic ratings for every user of one ratings-matrix file and saves them.

    Inputs:
    path: suffix of the matrix file ('data_cleaned/ratings_matrix_cleaned_<path>.pkl'),
        also used in the output file names
    num_ratings_create: handed to produce_synthetic_ratings for every user
    desired_ratings: handed to sort_synthetic_ratings for every user
    game_ids: handed to get_user for every user
    number: string appended to the output file names

    Outputs:
    None. Writes a long-form pickle and a JSON dump under synthetic_ratings_new_scraper/
    '''

    # load the matrix, drop repeated rows, and transpose so that users become the columns
    user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_'+path+'.pkl')
    user_matrix = user_matrix.drop_duplicates(keep='first').T
    user_matrix.index = user_matrix.index.astype('int32')

    # user -> ratings, filled in one user at a time
    synthetic_users_dictionary = {}

    for user in user_matrix.columns:

        print("Starting user "+user)

        # only the games this user actually rated
        user_items = user_matrix[user].dropna(axis=0)

        # the stored entry keeps the real ratings; the synthesizer works on a separate copy
        synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
        temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])

        original_num_ratings = len(temp_users_dictionary)
        print("User starts with "+str(original_num_ratings)+" ratings")

        # synthesize, then let the sort step write the kept ratings into the user's entry
        user_comps_dict = produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create)
        sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

    # wide frame (one row per user), then long form sorted by user
    synthetic_user_ratings = (
        pd.DataFrame.from_dict(synthetic_users_dictionary).T
        .reset_index()
        .rename(columns={'index':'UserID'})
    )
    synthetic_user_ratings_long = (
        synthetic_user_ratings
        .melt(id_vars='UserID', var_name='BGGId', value_name='Rating')
        .dropna()
        .sort_values('UserID')
    )

    # save longform
    synthetic_user_ratings_long.to_pickle('synthetic_ratings_new_scraper/synthetic_ratings_'+path+'_'+number+'.pkl')

    # save dictionary
    serialized = json.dumps(synthetic_users_dictionary)
    with open('synthetic_ratings_new_scraper/users_dump_syntheticratings'+path+'_'+number+'.json', 'w') as convert_file:
        convert_file.write(serialized)

In [None]:
def sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings):
    '''
    Keeps the desired_ratings entries of user_comps_dict with the highest overall confidence
    and logs the rating of each kept item to the user's dictionary.

    Previously the whole comps record (a six-element list) was stored per item; the rating
    element alone is stored now, so the user's dictionary holds one number per item.

    Inputs:
    user: specific user to sort
    synthetic_users_dictionary: reference to the dictionary of synthesized items;
        synthetic_users_dictionary[user] is updated in place
    user_comps_dict: dictionary of synthesized ratings specifically for user, each value being
        [overall confidence, similarity, source item, iteration, degrees away, rating]
    original_num_ratings: The number of ratings the user actually rated (not used here)
    desired_ratings: the number of entries to keep for the user

    Outputs:
    None
    '''
    print("Sorting user items")

    columns = ['OverallConfidence', 'SimtoLast', 'RecFrom', 'Iteration', 'DegreesAway', 'SyntheticRating']
    rating_position = columns.index('SyntheticRating')

    # one row per item, most confident first; fully identical rows are collapsed
    user_comps_df = (
        pd.DataFrame(list(user_comps_dict.values()), index=list(user_comps_dict.keys()), columns=columns)
        .sort_values('OverallConfidence', ascending=False)
        .drop_duplicates(keep='first')
    )

    # ids of the entries to keep, in ascending id order
    keep_items = sorted(user_comps_df[:desired_ratings].index)

    # for each item that we keep, add its rating to the real storage dictionary
    for item in keep_items:
        synthetic_users_dictionary[user][item] = user_comps_dict[item][rating_position]
    

## Old style user data

### Test One User

In [None]:
user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_03.pkl')
user_matrix = user_matrix.T
user_matrix.index = user_matrix.index.astype('int32')

In [None]:
user = 'Monika1234'
user_mean = users_means[user]

In [None]:
user_items = user_matrix[user].dropna(axis=0)
user_items

In [None]:
this_user = pd.DataFrame(user_matrix[user].dropna(axis=0))
this_user.rename(columns={user:'Rating'}, inplace=True)
this_user.reset_index(inplace=True)
this_user['Game'] = this_user['index'].astype('int32').map(game_id_lookup)
#this_user.drop('index', axis=1, inplace=True)
this_user.sort_values('Game', ascending=True).head(30)

In [None]:
game_comps_byid_lookup[298352][0][21923]

In [None]:
# Run the whole synthetic-ratings pipeline for the single `user` chosen above.

# fresh containers: user -> kept ratings, and the working copy handed to the synthesizer
synthetic_users_dictionary = {}
temp_users_dictionary = {}

print("Starting user "+user)

# only the games this user actually rated
user_items = user_matrix[user].dropna(axis=0)

# the stored entry keeps the real ratings; the synthesizer gets its own deep copy to extend
synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])
original_num_ratings = len(temp_users_dictionary)

# synthesize, then let the sort step log the kept ratings to the user's entry
user_comps_dict = produce_synthetic_ratings_all(user, temp_users_dictionary, num_ratings_create)
sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

# one row per game; the stored values are shifted by user_mean
synthetic_user_ratings = (
    pd.DataFrame.from_dict(synthetic_users_dictionary)
    .reset_index()
    .rename(columns={'index':'BGGId', user:'Rating'})
)
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean

In [None]:
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
temp2['Game'] = temp2.index.map(game_id_lookup)
temp2['Rating'] = temp2[0]+user_mean
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')
user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
user_comps_df.sort_values('SyntheticRating', ascending=False).head(30)

In [None]:
# Scatter of the synthetic ratings against their distance from a real rating.
fig, ax = plt.subplots(figsize=(20,10))

sns.set(font_scale = 1.5) # set our font scale bigger for this vis

# scatter our data
sns.set_style('darkgrid')
scatter2 = sns.scatterplot(x="DegreesAway", y='SyntheticRating', data=user_comps_df, 
                           hue='DegreesAway', palette='viridis', s=100)

# horizontal reference line at the user's mean, labelled just above it
ax.axhline(user_mean)
ax.text(x=.5, y=(user_mean+.2), s='User Mean '+str(user_mean), alpha=0.7, color='black')

ax.get_legend().remove()

plt.title(str(desired_ratings)+" Synthetic Ratings for a 10-Rating User", fontsize=30)
plt.xlabel("Steps Away from True Rating", fontsize=20)
plt.ylabel("Rating", fontsize=20)

# tight_layout has to be called; the bare attribute reference did nothing
plt.tight_layout()
#plt.savefig('images/synthetic_from10.png')
plt.show()

### Test One User

In [None]:
user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_06.pkl')
user_matrix = user_matrix.T
user_matrix.index = user_matrix.index.astype('int32')

In [None]:
user = 'zusterdoor'
user_mean = users_means[user]

In [None]:
user_items = user_matrix[user].dropna(axis=0)
user_items

In [None]:
# Run the whole synthetic-ratings pipeline for the single `user` chosen above.

# fresh containers: user -> kept ratings, and the working copy handed to the synthesizer
synthetic_users_dictionary = {}
temp_users_dictionary = {}

print("Starting user "+user)

# only the games this user actually rated
user_items = user_matrix[user].dropna(axis=0)

# the stored entry keeps the real ratings; the synthesizer gets its own deep copy to extend
synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])
original_num_ratings = len(temp_users_dictionary)

# synthesize, then let the sort step log the kept ratings to the user's entry
user_comps_dict = produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create)
sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

# one row per game; the stored values are shifted by user_mean
synthetic_user_ratings = (
    pd.DataFrame.from_dict(synthetic_users_dictionary)
    .reset_index()
    .rename(columns={'index':'BGGId', user:'Rating'})
)
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean

In [None]:
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
temp2['Game'] = temp2.index.map(game_id_lookup)
temp2['Rating'] = temp2[0]+user_mean
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
# NOTE(review): five column names are supplied here, while the produce_synthetic_ratings defined in
# the deprecated section of this notebook stores six values per item, which would not fit —
# confirm which definition produced user_comps_dict.
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')
# shift the stored ratings by user_mean
user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
# translate the item ids (index and RecFrom) through game_id_lookup for display
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
# show the 20 highest ratings
user_comps_df.sort_values('SyntheticRating', ascending=False).head(20)

In [None]:
# Scatter of the synthetic ratings against their distance from a real rating.
fig, ax = plt.subplots(figsize=(20,10))

sns.set(font_scale = 2) # set our font scale bigger for this vis

# scatter our data
sns.set_style('darkgrid')
scatter2 = sns.scatterplot(x="DegreesAway", y='SyntheticRating', data=user_comps_df, 
                           hue='DegreesAway', palette='viridis', s=100)

# horizontal reference line at the user's mean; the label sits at a fixed height
ax.axhline(user_mean)
ax.text(x=.2, y=8.1, s='User Mean '+str(user_mean), alpha=0.7, color='black')

ax.get_legend().remove()

plt.title("100 Synthetic Ratings for a 5-Rating User", fontsize=30)
plt.xlabel("Steps Away from True Rating", fontsize=24)
plt.ylabel("Rating", fontsize=24)

# tight_layout has to be called; the bare attribute reference did nothing
plt.tight_layout()
#plt.savefig('images/synthetic_from_05.png')
plt.show()

### Test One User

In [None]:
user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_03.pkl')
user_matrix = user_matrix.T
user_matrix.index = user_matrix.index.astype('int32')

In [None]:
user = 'Szczurek83'
user_mean = users_means[user]

In [None]:
user_items = user_matrix[user].dropna(axis=0)
user_items

In [None]:
# Run the whole synthetic-ratings pipeline for the single `user` chosen above.

# fresh containers: user -> kept ratings, and the working copy handed to the synthesizer
synthetic_users_dictionary = {}
temp_users_dictionary = {}

print("Starting user "+user)

# only the games this user actually rated
user_items = user_matrix[user].dropna(axis=0)

# the stored entry keeps the real ratings; the synthesizer gets its own deep copy to extend
synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])
original_num_ratings = len(temp_users_dictionary)

# synthesize, then let the sort step log the kept ratings to the user's entry
user_comps_dict = produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create)
sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

# one row per game; the stored values are shifted by user_mean
synthetic_user_ratings = (
    pd.DataFrame.from_dict(synthetic_users_dictionary)
    .reset_index()
    .rename(columns={'index':'BGGId', user:'Rating'})
)
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean

In [None]:
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
temp2['Game'] = temp2.index.map(game_id_lookup)
temp2['Rating'] = temp2[0]+user_mean
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
temp2.to_pickle('scaled_content_filter.pkl')

In [None]:
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')
user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
user_comps_df.sort_values('SyntheticRating', ascending=False).head(20)

## Notebook Functions

In [None]:
def get_user(user_items, user, game_ids):
    '''
    Returns one user's ratings with the user's mean subtracted, limited to usable games.

    The mean covers all of the user's ratings, including games that are later filtered out
    because they are not in game_ids.

    Inputs:
    user_items: the user's ratings, indexed by game id
    user: user to retrieve (not used by the function)
    game_ids: the game_ids we are using in our recommender

    Outputs:
    overall_user: dictionary {game_id: rating - user mean} for the rated games found in game_ids
    '''

    user_mean = user_items.mean()
    users_game_ids = list(user_items.index)

    # every rated game id paired with its mean-centred rating
    normed_by_id = {}
    for game_id, normed_rating in zip(users_game_ids, list(user_items - user_mean)):
        normed_by_id[game_id] = normed_rating

    # keep only the ids that the recommender knows about
    overall_user = {}
    for game_id in set(game_ids).intersection(set(users_game_ids)):
        overall_user[game_id] = normed_by_id[game_id]

    return overall_user

In [None]:
def produce_synthetic_ratings_all(user, temp_users_dictionary, num_ratings_create):
    '''
    Grows a user's ratings with synthetic ratings taken from game_comps_byid_lookup.

    Every pass visits each item currently in temp_users_dictionary, takes the first comp of
    that item (in lookup order) that has no entry yet, and records it with
    rating = source rating * similarity. Passes repeat until num_ratings_create entries
    exist or a whole pass adds nothing new.

    The search for a free comp previously never left its loop once an item's comps were
    used up, and the outer loop could not end when nothing new was available; both now
    terminate. The comp-list length is read from the lookup instead of a fixed number.

    Inputs:
    user: the user id to create ratings for (not used inside the function)
    temp_users_dictionary: dictionary {item: rating} of the user's ratings.
        It is updated in place with every synthetic rating produced.
    num_ratings_create : simple number. Total entries (real + synthetic) to reach.

    Outputs:
    users_comp_dict : dictionary {item: [overall confidence, similarity to source item,
        source item, iteration, rating]}; the user's own items are stored
        as [1, 1, item, 0, rating]
    '''
    # start at iteration 0
    iteration = 0

    # populate the comps with the user's baseline items
    users_comp_dict = {}
    for item in temp_users_dictionary:
        users_comp_dict[item] = [1, 1, item, 0, temp_users_dictionary[item]]

    while len(users_comp_dict) < num_ratings_create:

        # snapshot: items added during this pass only become sources in the next pass
        users_rated_items = list(temp_users_dictionary.keys())

        iteration += 1

        new_items = []  # items added during this pass

        for rated in users_rated_items:

            print("Current item: "+str(rated))
            rated_rating = temp_users_dictionary[rated]

            comp_ids = game_comps_byid_lookup[rated][0]
            comp_similarities = game_comps_byid_lookup[rated][1]

            # first comp without an entry; items added earlier in this pass already have one
            current_position = next(
                (position for position, comp in enumerate(comp_ids) if comp not in users_comp_dict),
                None,
            )

            # every comp of this item is already present: move on to the next source item
            if current_position is None:
                continue

            current_comp = comp_ids[current_position]
            comp_similarity = comp_similarities[current_position]

            # synthetic rating = rating of the base item * similarity
            synthetic_rating = rated_rating * comp_similarity

            # confidence = confidence of prior item * similarity of current item
            confidence = users_comp_dict[rated][0] * comp_similarity

            new_items.append(current_comp)
            users_comp_dict[current_comp] = [confidence, comp_similarity, rated, iteration, synthetic_rating]

            # the synthetic item becomes a source item for later passes
            temp_users_dictionary[current_comp] = synthetic_rating

        # nothing could be added: further passes would find nothing either
        if not new_items:
            break

    print("End length of rated items is "+str(len(users_comp_dict))+'\n')

    return users_comp_dict

In [None]:
def sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings):
    '''
    Ranks the user's comps by overall confidence and logs a slice of them to the user's dictionary.

    After sorting (highest confidence first) the rows at positions
    original_num_ratings up to desired_ratings are kept, and the SyntheticRating of each
    kept item is written to synthetic_users_dictionary[user].

    Inputs:
    user: specific user to sort
    synthetic_users_dictionary: reference to the dictionary of synthesized items (updated in place)
    user_comps_dict: dictionary of synthesized ratings specifically for user, five values per item
    original_num_ratings: The number of ratings the user actually rated; start of the kept slice
    desired_ratings: the number of ratings needed by the user; end of the kept slice

    Outputs:
    None
    '''

    column_names = ['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']

    # one row per item, most confident first; fully identical rows are collapsed
    user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=column_names)
    user_comps_df = user_comps_df.sort_values('OverallConfidence', ascending=False)
    user_comps_df = user_comps_df.drop_duplicates(keep='first')

    # rows past the first original_num_ratings, up to position desired_ratings
    kept_rows = user_comps_df[original_num_ratings:desired_ratings]

    # add the rating of every kept item to the real storage dictionary
    for item in kept_rows.index:
        synthetic_users_dictionary[user][item] = user_comps_df.loc[item]['SyntheticRating']
    

In [None]:
def process_matrix_to_synthetic(path, num_ratings_create, desired_ratings, game_ids, number):
    '''
    Runs the synthetic-ratings pipeline over every user of one ratings-matrix file.

    Inputs:
    path: suffix of the matrix file ('data_cleaned/ratings_matrix_cleaned_<path>.pkl'),
        also used in the output file names
    num_ratings_create: handed to produce_synthetic_ratings for every user
    desired_ratings: handed to sort_synthetic_ratings for every user
    game_ids: handed to get_user for every user
    number: string appended to the output file names

    Outputs:
    None. Saves the long-form ratings as a pickle and the per-user dictionary as JSON
    under synthetic_ratings_new_scraper/
    '''

    # load, drop repeated rows, transpose (users become columns), cast the index to int32
    user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_'+path+'.pkl')
    user_matrix = user_matrix.drop_duplicates(keep='first')
    user_matrix = user_matrix.T
    user_matrix.index = user_matrix.index.astype('int32')

    synthetic_users_dictionary = {}

    for user in user_matrix.columns:

        print("Starting user "+user)

        user_items = user_matrix[user].dropna(axis=0)

        # real ratings go into the stored entry; a deep copy is what the synthesizer extends
        real_ratings = get_user(user_items, user, game_ids)
        synthetic_users_dictionary[user] = real_ratings
        temp_users_dictionary = copy.deepcopy(real_ratings)

        original_num_ratings = len(temp_users_dictionary)
        print("User starts with "+str(original_num_ratings)+" ratings")

        user_comps_dict = produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create)
        sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

    # one row per user, then melted to one row per (user, game) and sorted by user
    synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary).T.reset_index()
    synthetic_user_ratings = synthetic_user_ratings.rename(columns={'index':'UserID'})
    melted = synthetic_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating')
    synthetic_user_ratings_long = melted.dropna().sort_values('UserID')

    # save longform
    synthetic_user_ratings_long.to_pickle('synthetic_ratings_new_scraper/synthetic_ratings_'+path+'_'+number+'.pkl')

    # save dictionary
    with open('synthetic_ratings_new_scraper/users_dump_syntheticratings'+path+'_'+number+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(synthetic_users_dictionary))

## Deprecated Matrix Calculations

### Using Matrix

In [None]:
larger_matrix = pd.read_pickle('synthetic_ratings/users_synthetic_2193_fullmatrix.pkl')

In [None]:
larger_matrix.head()

In [None]:
users = list(larger_matrix.index)
users[:5]

In [None]:
# Map each user's position in `users` to the user name and save the mapping as JSON.
user_lookup_table = {}

user_key = -1

for user_key, user in enumerate(users):
    user_lookup_table[user_key] = user

# save dictionary
with open('user_lookup_table.json', 'w') as convert_file:
    convert_file.write(json.dumps(user_lookup_table))

In [None]:
%time values1 = larger_matrix.loc['Torsten'].values
values1[:10]

In [None]:
%time values2 = larger_matrix.loc['mitnachtKAUBO-I'].values
values2[:10]

In [None]:
%time spatial.distance.cosine(values1,values2)

In [None]:
del similarity_dictionary
gc.collect()

In [None]:
similarity_dictionary = {}

for user in users:
    
    similarity_dictionary[user] = {}

In [None]:
# Time the pairwise similarity of the first user against every user, reading rows from the DataFrame.
for user in users[:1]:

    start = time.time()
    user_values = larger_matrix.loc[user].values

    for other_user in users:

        # only compute pairs that are not stored yet
        if user not in similarity_dictionary[other_user]:

            other_user_values = larger_matrix.loc[other_user].values
            similarity = 1 - spatial.distance.cosine(user_values,other_user_values)

            # store under both users
            similarity_dictionary[user][other_user] = similarity
            similarity_dictionary[other_user][user] = similarity

    end = time.time()
    print(str(end-start)+' seconds')

In [None]:
del larger_matrix
gc.collect()

### Using Numpy Arrays

In [None]:
matrix_array = larger_matrix.to_numpy()

In [None]:
del larger_matrix
gc.collect()

In [None]:
matrix_array.shape

In [None]:
%time values1 = matrix_array[0]
values1[:10]

In [None]:
%time values2 = matrix_array[1]
values2[:10]

In [None]:
%time spatial.distance.cosine(values1,values2)

In [None]:
%time similarities = np.matmul(matrix_array[0:10000], matrix_array[0:10000].T)

In [None]:
similarities[:1]

In [None]:
del similarities
gc.collect()

In [None]:
%time similarities = np.dot(matrix_array[0:10000], matrix_array[0:10000].T)

In [None]:
similarities[:1]

In [None]:
del similarities
gc.collect()

In [None]:
%time similarities = matrix_array[0:10000]@matrix_array[0:10000].T

In [None]:
similarities[:1]

In [None]:
del similarities
gc.collect()

In [None]:
len_users = len(users)

In [None]:
del similarity_dictionary
gc.collect()

In [None]:
similarity_dictionary = {}

for user in np.arange(0, len_users, 1):
    
    similarity_dictionary[user] = {}

In [None]:
len(similarity_dictionary)

In [None]:
similarity_dictionary[0]

In [None]:
# Time one cosine_similarity call: the first user against all users after it.
for user in np.arange(0, len_users, 1)[:1]:

    start = time.time()

    # a single sample has to be a row of shape (1, n_features); reshape(-1, 1) produced a
    # column whose feature count did not match other_matrix
    user_values = matrix_array[user].reshape(1, -1)

    other_matrix = matrix_array[user+1:]

    # result has one row per remaining user and a single column
    similarities = cosine_similarity(other_matrix, user_values)

    end = time.time()
    print(str(end-start)+' seconds')

In [None]:
similarities.shape

In [None]:
# Time the pairwise similarity of the first three users against every user, reading rows from the array.
for user in np.arange(0, len_users, 1)[:3]:

    start = time.time()
    user_values = matrix_array[user]

    for other_user in np.arange(0, len_users, 1):

        # only compute pairs that are not stored yet
        if user not in similarity_dictionary[other_user]:

            other_user_values = matrix_array[other_user]
            similarity = 1 - spatial.distance.cosine(user_values,other_user_values)

            # store under both users
            similarity_dictionary[user][other_user] = similarity
            similarity_dictionary[other_user][user] = similarity

    end = time.time()
    print(str(end-start)+' seconds')

In [None]:
similarity_dictionary[0]

In [None]:
similarity_dictionary[5]

In [None]:
del matrix_array
gc.collect()

In [None]:
#larger_matrix_T = pd.read_pickle('synthetic_ratings/users_synthetic_2193_fullmatrixT.pkl')

## Different ways to make calculations

In [None]:
matrix_sparsed = pd.read_pickle('synthetic_ratings/users_synthetic_2193_sparsematrix.pkl')

In [None]:
matrix_sparsed.info()

In [None]:
matrix_sparsed.head()

In [None]:
users = list(matrix_sparsed.index)
users[:5]

### Chunks, sparse non-normalized

In [None]:
# Convert the pandas sparse DataFrame to a SciPy CSR matrix (going through COO) and time the conversion
%time sparse_matrix = csr_matrix(matrix_sparsed.sparse.to_coo())

In [None]:
del matrix_sparsed
gc.collect()

In [None]:
type(sparse_matrix)

In [None]:
sparse_matrix

In [None]:
sparse_matrix.shape[0]

In [None]:
# Benchmark: cosine similarity of the first 10,000 rows against themselves, returned as a dense array
%time similarities = cosine_similarity(sparse_matrix[0:10000], sparse_matrix[0:10000], dense_output=True)

In [None]:
similarities[0]

In [None]:
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster 
chunk_size = 10000 
matrix_len = sparse_matrix.shape[0] 

def similarity_cosine_by_chunk(start, end, dense):
    '''
    Cosine similarity of rows start:end of sparse_matrix against every row of sparse_matrix

    Inputs:
    start : first row of the chunk (inclusive)
    end : row after the last row of the chunk; capped at matrix_len
    dense : passed to scikit-learn's cosine_similarity as dense_output

    Outputs:
    the cosine_similarity result for the chunk versus the whole matrix
    '''
    chunk_stop = min(end, matrix_len)
    chunk_rows = sparse_matrix[start:chunk_stop]
    return cosine_similarity(X=chunk_rows, Y=sparse_matrix, dense_output=dense)

#for chunk_start in range(0, 10, chunk_size):
    #cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
%time cosine_similarity_chunk = similarity_cosine_by_chunk(0, 10000, dense=True)

- Time for size 1, dense output: 39.4s
- Time for size 1000, dense output: 8min 48s
- Time for size 1, compact output: 47.8s
- Time for size 10000, compact output: 1h 41min 6s

In [None]:
cosine_similarity_chunk[:1]

In [None]:
sparse_matrix.shape

In [None]:
sparse_matrix[0:10000].shape

In [None]:
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster 
chunk_size = 10000 
matrix_len = sparse_matrix.shape[0] 

def similarity_cosine_by_chunk(start, end, dense, matrix=None):
    '''
    Multiplies rows start:end of a matrix by the transpose of the whole matrix,
    giving the dot product of every chunk row with every row of the matrix.
    These are raw dot products: they equal cosine similarities only when the
    rows have already been scaled to unit length.

    Inputs:
    start : first row of the chunk (inclusive)
    end : row after the last row of the chunk; capped at the number of rows
    dense : if True, a sparse result is converted to a dense array
    matrix : matrix to work on; defaults to the notebook-level sparse_matrix

    Outputs:
    product : (rows in chunk) x (rows in matrix) dot products
    '''
    if matrix is None:
        matrix = sparse_matrix

    end = min(end, matrix.shape[0])

    # The @ operator works for both SciPy sparse matrices and NumPy arrays
    # (np.matmul does not accept SciPy sparse input). The transpose lines the
    # shared column axis up so the result is rows-by-rows.
    product = matrix[start:end] @ matrix.T

    # Sparse products expose toarray(); dense NumPy results are returned as-is
    if dense and hasattr(product, 'toarray'):
        product = product.toarray()

    return product

#for chunk_start in range(0, 10, chunk_size):
    #cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
#%time cosine_similarity_chunk = similarity_cosine_by_chunk(0, 10000, dense=True)


In [None]:
similarities[:1]

### Chunks, normalized

In [None]:
# Normalize the CSC form of the matrix; axis=0 scales each column independently.
# (pp is presumably sklearn.preprocessing -- no active import is visible here.)
# NOTE(review): cosine similarity between rows is normally paired with row-wise
# (axis=1) normalization -- confirm that column-wise scaling is intended.
normed_matrix = pp.normalize(sparse_matrix.tocsc(), axis=0)
# The un-normalized matrix is dropped here to free memory
del sparse_matrix
gc.collect()

In [None]:
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster 
chunk_size = 1000 
matrix_len = normed_matrix.shape[0] 

def similarity_cosine_by_chunk(start, end, dense=False):
    '''
    Cosine similarity of rows start:end of normed_matrix against every row of normed_matrix

    Inputs:
    start : first row of the chunk (inclusive)
    end : row after the last row of the chunk; capped at matrix_len
    dense : passed to scikit-learn's cosine_similarity as dense_output (default False)

    Outputs:
    the cosine_similarity result for the chunk versus the whole matrix
    '''
    chunk_stop = min(end, matrix_len)
    chunk_rows = normed_matrix[start:chunk_stop]
    return cosine_similarity(X=chunk_rows, Y=normed_matrix, dense_output=dense)

#for chunk_start in range(0, 10, chunk_size):
    #cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
%time cosine_similarity_chunk = similarity_cosine_by_chunk(0, 1, dense=False)

- Time for size 1, dense output: 1min 51s
- Time for size 1000, dense output: 10min 20s
- Time for size 1, compact output: 1min 51s


In [None]:
cosine_similarity_chunk.shape

In [None]:
cosine_similarity_chunk[:1]

In [None]:
# Pull a single user's row out of the sparse DataFrame as a SciPy sparse matrix
user = 'Torsten'

# NOTE(review): matrix_sparsed is removed with `del matrix_sparsed` earlier in this
# notebook, so it has to be reloaded before this cell will run.
%time sparse_user =  csr_matrix(matrix_sparsed.loc[user])

In [None]:
sparse_user.T

In [None]:
# Element-wise (not matrix) product of the full matrix with the single user's row.
# NOTE(review): sparse_matrix is deleted in the "Chunks, normalized" section of this
# notebook, so it has to be rebuilt before this cell will run.
sparseuser_AB = sparse_matrix.multiply(sparse_user)

In [None]:
sparseuser_AB

### Old function with comparison blocks

In [None]:
# Block-by-block user similarity run. For every pair of user blocks this builds
# the cosine-similarity matrix with TensorFlow, rescales each base user's row
# to [-1, 1] and writes the results to the per-block JSON files.
# Relies on block_indices_lookup, user_blocks_lookup and matrix_array, which
# are defined elsewhere in the notebook.
for user_block in block_indices_lookup:

    print("Starting block "+str(user_block))

    starting_block_indexes = block_indices_lookup[user_block]
    base_start = starting_block_indexes['Start']
    base_end = starting_block_indexes['End']

    # presumably the /10 undoes the x10 integer scaling applied when the
    # synthetic ratings were stored -- TODO confirm
    array_chunk_a = (matrix_array[base_start:base_end]/10).astype('float32')

    # Similarities already on disk for the users of this block
    with open('user_similarities/similarity_storage'+str(user_block)+'.json') as json_file:
        base_users_storage = json.load(json_file)

    # Compare against this block and the blocks after it only
    first_block_of_comparison = user_block
    end_range = len(block_indices_lookup)+1

    # TEMPORARY END RANGE FOR TESTING (overrides the full range set just above)
    end_range = 2

    for comparison_block in np.arange(first_block_of_comparison, end_range, 1):

        print("User Block "+str(user_block)+' vs Comparison Block '+str(comparison_block))

        # Similarities already on disk for the users of the comparison block
        with open('user_similarities/similarity_storage'+str(comparison_block)+'.json') as json_file:
            comparison_users_storage = json.load(json_file)

        comparison_indexes = block_indices_lookup[comparison_block]
        compare_start = comparison_indexes['Start']
        compare_end = comparison_indexes['End']

        print("Making matrices")
        start = time.time()
        # Transposed so that each comparison user is a column
        array_chunk_b = ((matrix_array[compare_start:compare_end].T)/10).astype('float32')

        a = tf.constant(array_chunk_a)
        b = tf.constant(array_chunk_b)

        # Unit-length rows of a and unit-length columns of b, so the matrix
        # product below is the cosine similarity of each base/comparison pair
        normalize_a = tf.nn.l2_normalize(a,1)
        del a
        gc.collect()

        normalize_b = tf.nn.l2_normalize(b,0)
        del b
        gc.collect()

        print("Getting similarity scores")
        similarities = tf.matmul(normalize_a, normalize_b)
        del normalize_a
        del normalize_b
        gc.collect()

        # store user info

        incrementer_base = 0

        print("Storing Similarities")
        # NOTE(review): [:5] limits storage to the first five users of the
        # block -- looks like a testing leftover, confirm before a full run.
        for base_user in user_blocks_lookup[user_block][:5]:

            print(base_user)

            user_similarities = similarities[incrementer_base].numpy()

            # Overwrite the single largest score with the row median before
            # scaling. np.argmax must see the whole row: taking the argmax of
            # the scalar row maximum always yields position 0.
            # NOTE(review): in a same-block comparison the largest score should
            # be the user's own entry; across different blocks this overwrites
            # the strongest genuine match -- confirm that is wanted.
            max_spot = np.argmax(user_similarities)
            median_value = np.median(user_similarities)
            user_similarities[max_spot] = median_value

            scaler = MinMaxScaler(feature_range=(-1,1))
            user_similarities = scaler.fit_transform(user_similarities.reshape(-1,1))
            user_similarities = list(np.round(user_similarities, 2).ravel())

            # NOTE(review): both sequences are sliced from incrementer_base even
            # when the two blocks differ, and the +/-.25 filter applies only to
            # the base-user storage -- verify both are intentional.
            for key, value in zip(user_blocks_lookup[comparison_block][incrementer_base:], user_similarities[incrementer_base:]):
                if value >= .25 or value <= -.25:
                    base_users_storage[base_user][key] = float(value)
                if user_block != comparison_block:
                    comparison_users_storage[key][base_user] = float(value)

            incrementer_base +=1

        # save dictionary once per comparison block, after all of its users
        # have been processed, instead of rewriting the file for every user
        with open('user_similarities/similarity_storage'+str(comparison_block)+'.json', 'w') as convert_file:
            convert_file.write(json.dumps(comparison_users_storage))

        print("Cleaning up memory for this iteration")
        del comparison_users_storage
        gc.collect()

        end = time.time()
        print(str(end-start)+' seconds elapsed for this comparison section')

    # save dictionary
    with open('user_similarities/similarity_storage'+str(user_block)+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(base_users_storage))

    gc.collect()

## Deprecated Tensorflow time reduction attempts

In [None]:
# the basic file required for this work - the full matrix

larger_matrix = pd.read_pickle('synthetic_ratings/users_synthetic_2193_sparsematrix_nogameids.pkl')

In [None]:
larger_matrix.info()

In [None]:
larger_matrix.head()

In [None]:
# Make sparse dataframe into numpy array

matrix_array = np.array(larger_matrix)

Turn a single user into a column vector of shape (21921, 1)

In [None]:
user_id = 3

In [None]:
# Get single user from matrix_array

%time single_user = matrix_array[user_id]
single_user.shape

In [None]:
# Get nonzero indices for user
%time indices = list(np.nonzero(single_user)[0])
indices

In [None]:
# Keep only the user's nonzero entries (positions in `indices`), cast them to
# float32 and reshape them into a single column
%time array_chunk_a = (single_user[indices]).astype('float32').reshape(-1,1)
array_chunk_a.shape

In [None]:
# Normalize the user's column; with axis=0 the normalization runs down the column
# (presumably sklearn.preprocessing.normalize -- no import is visible in this section)
%time normalize_a = normalize(array_chunk_a, axis=0)
normalize_a

Investigate methods of reducing dataframe or array

In [None]:
# make reduced on sparse dataframe
%time df_chunk_b = larger_matrix[indices]

In [None]:
df_chunk_b.shape

In [None]:
df_chunk_b.info()

In [None]:
# make reduced on array
%time array_chunk_b = matrix_array[:, indices]

In [None]:
array_chunk_b.shape

In [None]:
# turn array into sparse matrix
sparse_matrix = sparse.csr_matrix(matrix_array)

In [None]:
# make reduced on sparse
%time array_chunk_b = sparse_matrix[:, indices]

Convert dataframe to array

In [None]:
# convert reduced dataframe to sparse matrix

%time sparse_array = sparse.csr_matrix(df_chunk_b.sparse.to_coo())

In [None]:
# convert reduced dataframe to array
%time array_b_matrix = df_chunk_b.to_numpy()

In [None]:
array_b_matrix[0][:10]

Investigate normalization methods

In [None]:
# sklearn normalize on dataframe
%time normalize_b = normalize(df_chunk_b, axis=1)

In [None]:
normalize_b[0]

In [None]:
# sklearn normalize on array
%time normalize_b = normalize(array_b_matrix, axis=1)

In [None]:
normalize_b[0][:10]

In [None]:
# make partial dataframe segment
%time partial_df = df_chunk_b[:134400]

In [None]:
# make partial array segment
%time partial_array = normalize_b[:134400]