# Notebook Objective and Setup

BGG06 is where synthetic ratings are produced for each user, using the content-based item filter from BGG05.

## Package Imports

In [None]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import copy
import json
from statistics import mean

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

#from scipy import sparse
#from scipy.sparse import csr_matrix
#from scipy import spatial

#from sklearn.metrics.pairwise import cosine_similarity
#import sklearn.preprocessing as pp
from sklearn.preprocessing import MinMaxScaler#, OneHotEncoder, StandardScaler, PolynomialFeatures, 

# visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import umap
import umap.plot

In [None]:
import tensorflow as tf
from tensorflow.compat.v1.losses import cosine_distance

## Notebook Functions

In [None]:
def produce_synthetic_ratings_all(user, num_ratings_create, game_ids):
    '''
    Synthesizes ratings for one user by spreading the user's real ratings to similar games.
    The real ratings are mean-centred first. Then, pass after pass, every game currently
    in the user's rating set (real or synthetic) contributes its most similar game that is
    not in the set yet, rated as (seed rating * similarity).
    Passes repeat until the set holds at least num_ratings_create items or a pass adds nothing.
    A pass can add up to one item per seed, so the final set may exceed num_ratings_create.

    Reads these notebook-level names: user_ratings, user_means, game_comps_byid_lookup
    and synthetic_users_dictionary (synthetic_users_dictionary[user] must already exist).

    Inputs:
    user: the user id to create ratings for
    num_ratings_create : simple number. Stop once the user has at least this many ratings.
    game_ids: not used by this function; kept so existing calls keep working

    Outputs:
    users_comp_dict : dictionary of item -> [overall confidence, similarity to seed, seed item,
        iteration, degrees away, mean-centred rating] for every real and synthetic item
    temp_users_dictionary : dictionary of item -> mean-centred rating (real and synthetic)

    Side effect:
    synthetic_users_dictionary[user][item] is set to int(mean-centred rating * 10) for every item
    '''

    # positions at or beyond this are never used, whatever the length of the comps list
    max_position = 10000

    user_mean = user_means[user]
    user_synthetic = synthetic_users_dictionary[user]

    # mean-centre the real ratings into a working dictionary we can grow
    temp_users_dictionary = {}
    for item, rating in user_ratings[user].items():
        this_rating = round((rating-user_mean), 1)
        temp_users_dictionary[int(item)] = this_rating
        user_synthetic[int(item)] = int(this_rating*10)

    # set up dict to store all specific comps for this user, seeded with the real items
    # overall confidence, this item similarity, seed item, iteration, degrees away, rating
    users_comp_dict = {}
    for item, rating in temp_users_dictionary.items():
        users_comp_dict[item] = [1, 1, item, 0, 0, rating]

    iteration = 0

    while len(temp_users_dictionary) < num_ratings_create:

        start_set_length = len(temp_users_dictionary)
        iteration += 1

        # snapshot the seeds: items added during this pass only become seeds next pass
        for rated in list(temp_users_dictionary):

            comp_ids = game_comps_byid_lookup[rated][0]
            comp_sims = game_comps_byid_lookup[rated][1]

            # never index past the end of the comps list
            limit = min(len(comp_ids), max_position)

            # walk down the comps (best first) to the first one the user has no rating for;
            # items added earlier in this pass are already in temp_users_dictionary
            current_position = 0
            while current_position < limit and comp_ids[current_position] in temp_users_dictionary:
                current_position += 1

            if current_position >= limit:
                # this seed has nothing new to offer; the other seeds still might
                continue

            current_comp = comp_ids[current_position]
            comp_similarity = comp_sims[current_position]

            # synthetic rating = rating of the seed item * similarity
            synthetic_rating = round((temp_users_dictionary[rated] * comp_similarity), 1)

            # confidence = confidence of the seed item * similarity of current item
            confidence = users_comp_dict[rated][0] * comp_similarity
            degrees = users_comp_dict[rated][4] + 1

            users_comp_dict[current_comp] = [confidence, comp_similarity, rated, iteration, degrees, synthetic_rating]
            temp_users_dictionary[current_comp] = synthetic_rating
            user_synthetic[current_comp] = int(synthetic_rating*10)

        # a pass that added nothing means no seed has an unrated comp left
        if len(temp_users_dictionary) == start_set_length:
            break

    return users_comp_dict, temp_users_dictionary

In [None]:
def sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings):
    '''
    Keeps only the most confident synthetic ratings for a user.
    Builds a df from the user's comps dict, sorts it by overall confidence (highest first),
    drops fully identical rows, and takes the rows from position original_num_ratings up to
    (not including) desired_ratings. The rows before that slice are assumed to be the user's
    real ratings, which carry confidence 1.
    For each item kept, logs the synthetic rating to the user's dictionary.

    Use this one when you want only exactly x ratings and don't want to necessarily
    keep everything produced.

    Inputs:
    user: specific user to sort
    synthetic_users_dictionary: reference to the dictionary of synthesized items (updated in place)
    user_comps_dict: dictionary of item -> [overall confidence, similarity, seed item,
        iteration, degrees away, rating] for this user
    original_num_ratings: The number of ratings the user actually rated
    desired_ratings: the number of ratings needed by the user

    Outputs:
    None. synthetic_users_dictionary[user] receives one entry per kept item.
    '''

    # one name per element of the six-element comp rows
    column_names = ['OverallConfidence', 'SimtoLast', 'RecFrom', 'Iteration', 'DegreesAway', 'SyntheticRating']

    user_comps_df = pd.DataFrame(
        list(user_comps_dict.values()),
        index=list(user_comps_dict.keys()),
        columns=column_names,
    )

    # stable sort: on confidence ties the real ratings (inserted first) stay ahead of synthetic ones
    user_comps_df = user_comps_df.sort_values('OverallConfidence', ascending=False, kind='mergesort')
    user_comps_df = user_comps_df.drop_duplicates(keep='first')

    # get the ratings to keep (past the real ratings)
    keep_items = user_comps_df.index[original_num_ratings:desired_ratings]

    # for each item that we keep, add the rating to the real storage dictionary
    for item in keep_items:
        synthetic_users_dictionary[user][item] = user_comps_df.at[item, 'SyntheticRating']
    

In [None]:
def populate_all_ratings(user, synthetic_users_dictionary, temp_users_dictionary):
    '''
    Gives the user a rating for every game in the catalogue.
    Every id in the notebook-level game_ids list that is missing from temp_users_dictionary
    is added to it with a rating of 0. All ratings are then copied, in sorted item order,
    into synthetic_users_dictionary[user].

    Inputs:
    user: specific user to populate
    synthetic_users_dictionary: reference to the dictionary of synthesized items (updated in place)
    temp_users_dictionary: dictionary of item -> rating for this user (updated in place)

    Outputs:
    None. Prints the number of games that had no rating and the final number of rated items.
    '''

    # game_ids is read from notebook scope; it is not a parameter of this function
    not_rated = list(set(game_ids) - set(temp_users_dictionary.keys()))
    print(str(len(not_rated))+" games were not rated")

    # unrated games get a neutral 0
    for item in not_rated:
        temp_users_dictionary[item] = 0

    print("End length of rated items is "+str(len(temp_users_dictionary)))

    # add every rating to the real storage dictionary, in item order
    for item in sorted(temp_users_dictionary):
        synthetic_users_dictionary[user][item] = temp_users_dictionary[item]
    

## Required Data Load

In [None]:
# read games for game_ids
games = pd.read_pickle('data_cleaned/games.pkl')
game_ids = list(games['BGGId'])

In [None]:
# Read cosine similarity pickle
sims_byid = pd.read_pickle('data_cleaned/game_cosine_similarity_byid.pkl')

In [None]:
# Opening JSON file
with open('data_cleaned/user_means.json') as json_file:
    user_means = json.load(json_file)

In [None]:
# Load every user's real ratings just long enough to read off the user ids
with open('real_ratings/user_ratings_unscaled.json') as json_file:
    user_ratings = json.load(json_file)

all_users = list(user_ratings)

# six blocks of 40,000 users each, then one last block with everyone from 240,000 on
user_blocks = [all_users[start:start + 40000] for start in range(0, 240000, 40000)]
user_blocks.append(all_users[240000:])

(user_block_1, user_block_2, user_block_3, user_block_4,
 user_block_5, user_block_6, user_block_7) = user_blocks

# only the ids are needed from here on, so release the ratings
del user_ratings
gc.collect()

In [None]:
# dictionary of game IDs-Names

# Load games
games = pd.read_pickle('data_cleaned/games.pkl')

# lists of game ids and game names (same row order, so they pair up one to one)
game_ids = list(games['BGGId'])
game_names = list(games['Name'])

# lookup dictionary: game id -> game name
game_id_lookup = dict(zip(game_ids, game_names))

# the frame is no longer needed once the lists and the lookup exist
del games
gc.collect()

In [None]:
len(game_id_lookup)

In [None]:
# get the 10,000 most similar games for each game and store in dictionary
# game_comps_byid_lookup[game] = [list of comp ids, list of matching similarities], best first

game_comps_byid_lookup = {}

for item in sims_byid.columns:
    # rank most similar first; position 0 is skipped
    # (presumably the game's similarity to itself; TODO confirm)
    top_comps = sims_byid[item].sort_values(ascending=False).iloc[1:10001]
    comps_index = list(top_comps.index.astype('int32'))
    comps_similarity = list(top_comps)
    game_comps_byid_lookup[item] = [comps_index, comps_similarity]

In [None]:
del sims_byid
gc.collect()

# Produce Synthetic Ratings

## Test One User

In [None]:
with open('real_ratings/user_ratings_block_unscaled_2.json') as json_file:
    user_ratings = json.load(json_file)

In [None]:
user_ratings['Threnody']

In [None]:
user = 'Threnody'
user_mean = user_means[user]

In [None]:
this_user = pd.DataFrame(user_ratings[user].values(), index=user_ratings[user].keys())
this_user.reset_index(inplace=True)
this_user.rename(columns={0:'Rating', 'index':'BGGId'}, inplace=True)
this_user['Game'] = this_user['BGGId'].astype('int32').map(game_id_lookup)
this_user.sort_values('Game', ascending=True).head(30)

In [None]:
# number of synthetic ratings to produce
num_ratings_create = 2500

# number of ratings we will end up using
desired_ratings = 2500

In [None]:
# drop any dictionary left over from an earlier run;
# pop() rather than `del` so this cell also runs on a fresh kernel where the name does not exist yet
globals().pop('synthetic_users_dictionary', None)
gc.collect()

In [None]:
# set up a synthetic ratings dictionary to store the users and ratings
synthetic_users_dictionary = {}
synthetic_users_dictionary[user] = {}

In [None]:
print("Starting user "+user)

# call function to produce synthetic ratings
user_comps_dict, temp_users_dictionary  = produce_synthetic_ratings_all(user, num_ratings_create, game_ids) 

In [None]:
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
temp2['Game'] = temp2.index.map(game_id_lookup)
temp2['Rating'] = (temp2[0]/10)+user_mean
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'Iteration', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')

user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
user_comps_df.sort_values('SyntheticRating', ascending=False).head(30)

In [None]:
user_comps_df.info()

In [None]:
# theme first: style settings only affect axes created after they are set
sns.set(font_scale = 1.5) # set our font scale bigger for this vis
sns.set_style('darkgrid')

fig, ax = plt.subplots(figsize=(20,10))

# scatter our data on the axes created above
scatter2 = sns.scatterplot(x="DegreesAway", y='SyntheticRating', data=user_comps_df, 
                           hue='DegreesAway', palette='viridis', s=100, ax=ax)

# reference line at the user's own mean rating
ax.axhline(user_mean)
ax.text(x=.5, y=(user_mean+.2), s='User Mean '+str(user_mean), alpha=0.7, color='black')

# the hue legend repeats the x axis, so drop it when present
legend = ax.get_legend()
if legend is not None:
    legend.remove()

plt.title(str(desired_ratings)+" Synthetic Ratings for a 10-Rating User", fontsize=30)
plt.xlabel("Steps Away from True Rating", fontsize=20)
plt.ylabel("Rating", fontsize=20)

plt.tight_layout()
#plt.savefig('images/synthetic_from10.png')
plt.show()

In [None]:
del synthetic_users_dictionary
del user_comps_df
del temp_users_dictionary
del this_user
del user_ratings
del user_comps_dict

gc.collect()

## Process ALL Users

In [None]:
# number of synthetic ratings to produce
num_ratings_create = 2000

# number of ratings we will end up using
desired_ratings = 2000

open_block = 'real_ratings/user_ratings_block_unscaled_' # base file to open and synthesize ratings
save_block = 'synthetic_ratings/users_synthetic_'+str(desired_ratings)+'_' # save path for synthesized
matrix_save = 'synthetic_ratings/users_synthetic_'+str(desired_ratings)+'_fullmatrix.pkl' # save path for full matrix

In [None]:
# Synthesize ratings block by block so only one block of users is held in memory at a time.
# user_ratings and synthetic_users_dictionary must be notebook-level names here:
# produce_synthetic_ratings_all reads and fills them directly.
for block_marker, block in enumerate(user_blocks, start=1):

    print("Starting block "+str(block_marker)+" ("+str(len(block))+" users)")

    # real ratings for the users of this block
    with open(open_block+str(block_marker)+'.json') as json_file:
        user_ratings = json.load(json_file)

    # set up a synthetic ratings dictionary to store the users and ratings
    synthetic_users_dictionary = {}

    for user in block:

        synthetic_users_dictionary[user] = {}

        # fills synthetic_users_dictionary[user] in place; the returned dicts are not needed here
        produce_synthetic_ratings_all(user, num_ratings_create, game_ids)

    # save dictionary
    with open(save_block+str(block_marker)+'.json', 'w') as convert_file:
        json.dump(synthetic_users_dictionary, convert_file)

    del synthetic_users_dictionary
    gc.collect()

In [None]:
#del user_ratings
#del game_comps_byid_lookup

#gc.collect()

# Produce Matrices

In [None]:
larger_matrix = pd.DataFrame()

In [None]:
# Stack the saved synthetic-rating files into one users x games matrix.
for append in range(1, 8):
    
    print("Opening file "+save_block+str(append))
    with open(save_block+str(append)+'.json') as json_file:
        set_of_ratings = json.load(json_file)
        
    print("Converting file to DF")
    # outer keys (users) become rows after the transpose
    matrix = pd.DataFrame(set_of_ratings).T

    print("Clearing memory")
    del set_of_ratings
    gc.collect()

    print("Filling NaN")
    matrix.fillna(0, inplace=True)
    
    print("Converting to Int8")
    matrix = matrix.astype('int8') 
    
    #print("Converting to sparse")
    #matrix_sparsed = matrix.astype(pd.SparseDtype("float32"))
    
    print("Adding to larger DF")
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to stack rows.
    # Games missing from one side come out as NaN and are filled after the loop.
    larger_matrix = pd.concat([larger_matrix, matrix])
    
    del matrix
    gc.collect()
           
    print(larger_matrix.shape)

In [None]:
larger_matrix.fillna(0, inplace=True)
larger_matrix = larger_matrix.astype('int8')
#larger_matrix = larger_matrix.astype(pd.SparseDtype("float32"))

In [None]:
larger_matrix.info()

In [None]:
larger_matrix.to_pickle(matrix_save)

In [None]:
larger_matrix.head()

In [None]:
del larger_matrix
gc.collect()

## Data Validation

In [None]:
with open('synthetic_ratings/users_synthetic_1000_2.json') as json_file:
    user_ratings = json.load(json_file)
user_ratings['Threnody']

In [None]:
len(user_ratings['Threnody'])

In [None]:
with open('synthetic_ratings/users_synthetic_100_2.json') as json_file:
    user_ratings = json.load(json_file)
user_ratings['Threnody']

In [None]:
len(user_ratings['Threnody'])

In [None]:
with open('real_ratings/user_ratings_block_unscaled_2.json') as json_file:
    user_ratings = json.load(json_file)
user_ratings['Threnody']

In [None]:
len(user_ratings['Threnody'])

In [None]:
with open('real_ratings/user_ratings_block_scaled_2.json') as json_file:
    user_ratings = json.load(json_file)
user_ratings['Threnody']

In [None]:
len(user_ratings['Threnody'])

In [None]:
del user_ratings
gc.collect()

In [None]:
validation_1 = pd.read_pickle('synthetic_ratings/users_synthetic_1000_fullmatrix.pkl')
validation_1.head()

In [None]:
validation_1 = pd.read_pickle('synthetic_ratings/users_synthetic_500_fullmatrix.pkl')
validation_1.head()

In [None]:
validation_1 = pd.read_pickle('synthetic_ratings/users_synthetic_100_fullmatrix.pkl')
validation_1.head()

In [None]:
validation_1 = pd.read_pickle('real_ratings/users_real_scaled_fullmatrix.pkl')
validation_1.head()

In [None]:
validation_1 = pd.read_pickle('real_ratings/users_real_unscaled_fullmatrix.pkl')
validation_1.head()

In [None]:
del validation_1
gc.collect()

# Make User Means Dict

In [None]:
# Opening JSON file
with open('real_ratings/user_ratings_unscaled.json') as json_file:
    user_ratings = json.load(json_file)

In [None]:
len(user_ratings)

In [None]:
user_means = {}

In [None]:
# mean rating per user, rounded to one decimal
for person, person_ratings in user_ratings.items():
    user_items = list(person_ratings.values())
    user_mean = round(mean(user_items), 1)
    user_means[person] = user_mean

In [None]:
user_means['Threnody']

In [None]:
user_means['moosh21']

In [None]:
user_means['Shade92008']

In [None]:
user_means['Torsten']

In [None]:
# save dictionary
with open('data_cleaned/user_means.json', 'w') as convert_file:
    convert_file.write(json.dumps(user_means))

In [None]:
del user_means
gc.collect()

In [None]:
# Opening JSON file
with open('data_cleaned/user_means.json') as json_file:
    user_means_dict = json.load(json_file)

In [None]:
user_means = pd.DataFrame.from_dict(user_means_dict, orient='index')
user_means.rename(columns={0:'Mean'}, inplace=True)
user_means.head()

In [None]:
user_means.to_pickle('data_cleaned/user_means.pkl')

# Make Ratings Block Sets

## Make scaled ratings

In [None]:
# Mean-centre every rating and store it as an integer number of tenths
# NOTE(review): the mean used here is unrounded, while the user means cell rounds to one decimal
user_ratings_scaled = {}

for person, person_ratings in user_ratings.items():
    scaled = user_ratings_scaled[person] = {}
    user_mean = mean(person_ratings.values())
    scaled.update(
        (item, int(round((rating - user_mean), 1)*10))
        for item, rating in person_ratings.items()
    )

In [None]:
# save dictionary
with open('real_ratings/real_user_ratings_scaled.json', 'w') as convert_file:
    convert_file.write(json.dumps(user_ratings_scaled))

In [None]:
user_ratings_scaled['Threnody']

## Make smaller ratings blocks

In [None]:
# Opening JSON file
with open('real_ratings/user_ratings_unscaled.json') as json_file:
    user_ratings = json.load(json_file)

In [None]:
all_users = list(user_ratings.keys())

In [None]:
len(all_users)

In [None]:
# six blocks of 40,000 users each, then one last block with everyone from 240,000 on
user_blocks = [all_users[start:start + 40000] for start in range(0, 240000, 40000)]
user_blocks.append(all_users[240000:])

(user_block_1, user_block_2, user_block_3, user_block_4,
 user_block_5, user_block_6, user_block_7) = user_blocks

In [None]:
# Write each block of users to its own JSON file of unscaled ratings.
for iteration, block in enumerate(user_blocks, start=1):
    
    print("Starting block "+str(iteration))
    
    # set membership is O(1); testing against the 40,000-element list for every user was quadratic
    block_members = set(block)
    block_of_users = {key: value for key, value in user_ratings.items() if key in block_members}
    
    # ratings are written exactly as loaded (unscaled), so no per-rating pass is needed
    
    # save dictionary
    with open('real_ratings/user_ratings_block_unscaled_'+str(iteration)+'.json', 'w') as convert_file:
        json.dump(block_of_users, convert_file)
        
    del block_of_users
    gc.collect()

In [None]:
del user_blocks
del user_ratings
gc.collect()

# Deprecated

In [None]:
# make dataframe from synthetic sort and melt to longform
# Deprecated cell: the same steps appear inside process_matrix_to_synthetic.
# First pass: one column per user; the column of `user` is renamed to 'Rating' and un-centred by user_mean.
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary)
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={'index':'BGGId', user:'Rating'}, inplace=True)
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean
    
    
# Second pass overwrites the frame above: users as rows, then melted to (UserID, BGGId, Rating) rows.
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary).T
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synthetic_user_ratings_long = synthetic_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synthetic_user_ratings_long.sort_values('UserID', inplace=True)
synthetic_user_ratings_long
    
# save longform
# NOTE(review): `path` and `number` are not assigned at notebook level in the visible cells
# (they are parameters of process_matrix_to_synthetic) - confirm before running this cell
synthetic_user_ratings_long.to_pickle('synthetic_ratings_new_scraper/synthetic_ratings_'+path+'_'+number+'.pkl')


In [None]:
def produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create):
    '''
    Takes in a dictionary of user's ratings and the number of ratings to synthesize
    Synthesizes ratings and creates a dictionary of all synthesized ratings for the user
    Returns synthesized ratings

    Pass after pass, every item currently in temp_users_dictionary contributes its most
    similar item that has no rating yet, rated as (seed rating * similarity).
    Passes repeat until at least num_ratings_create items exist or a pass adds nothing.
    Reads the notebook-level game_comps_byid_lookup and prints progress for every seed.

    Inputs:
    user: the user id to create ratings for (not used inside the function)
    temp_users_dictionary: dictionary of specific user's real ratings. Updated in place
        with every synthetic rating.
    num_ratings_create : simple number. # Ratings to make in the run.

    Outputs:
    user_comps_dict : dictionary of item -> [overall confidence, similarity to seed, seed item,
        iteration, degrees away, rating] for every real and synthetic item
    '''

    print("Producing items for user")

    # positions at or beyond this are never used, whatever the length of the comps list
    max_position = 21923

    # start at iteration 0
    iteration = 0

    # populate the comps with the user's baseline items
    # overall confidence, this item similarity, seed item, iteration, degrees away, rating
    users_comp_dict = {}
    for item in temp_users_dictionary:
        users_comp_dict[item] = [1, 1, item, 0, 0, temp_users_dictionary[item]]

    # while the number of rated items is < the number of ratings needed:
    while len(users_comp_dict) < num_ratings_create:

        start_set_length = len(users_comp_dict)

        iteration += 1 # advance the iteration

        # snapshot the seeds: items added during this pass only become seeds next pass
        for rated in list(temp_users_dictionary):

            print("\nCurrent item: "+str(rated))
            # get rating for current item
            rated_rating = temp_users_dictionary[rated]
            print(rated_rating)

            comp_ids = game_comps_byid_lookup[rated][0]
            comp_sims = game_comps_byid_lookup[rated][1]

            # never index past the end of the comps list
            limit = min(len(comp_ids), max_position)

            # walk down the comps (best first) to the first one without a rating;
            # items added earlier in this pass are already in users_comp_dict
            current_position = 0
            while current_position < limit and comp_ids[current_position] in users_comp_dict:
                current_position += 1

            if current_position >= limit:
                # this seed has nothing new to offer; the other seeds still might
                continue

            current_comp = comp_ids[current_position]

            # getting similarity of the current comp
            comp_similarity = comp_sims[current_position]
            print(current_position)
            print(comp_similarity)

            # get the synthetic rating for the item by taking the rating of the base item * similarity
            synthetic_rating = rated_rating * comp_similarity
            print(synthetic_rating)

            # confidence = confidence of prior item * similarity of current item
            confidence = users_comp_dict[rated][0] * comp_similarity
            degrees = users_comp_dict[rated][4] + 1

            # make the user's comp dict
            users_comp_dict[current_comp] = [confidence, comp_similarity, rated, iteration, degrees, synthetic_rating]

            # update the temporary dictionary with the synthetic rating for the item
            temp_users_dictionary[current_comp] = synthetic_rating

        # a pass that added nothing means no seed has an unrated comp left;
        # without this check the loop would never end
        if len(users_comp_dict) == start_set_length:
            break

    print("End length of rated items is "+str(len(users_comp_dict))+'\n')

    return users_comp_dict

In [None]:
#user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_03.pkl')
#user_matrix = user_matrix.T
#user_matrix.index = user_matrix.index.astype('int32')

In [None]:
# run the data synthesizer for each of the 6 ratings matrix files
# NOTE(review): no `process_to_synthetic` is defined in the visible notebook; the function with this
# five-argument shape is `process_matrix_to_synthetic` - confirm which one was meant.
# NOTE(review): `item` is whatever an earlier loop left behind, not a file suffix chosen here - confirm.
process_to_synthetic(item, num_ratings_create, desired_ratings, game_ids, '250')

In [None]:
def get_user(user_items, user, game_ids):
    '''
    Builds one user's mean-centred ratings, limited to the games the recommender uses.
    The user's mean is subtracted from every rating, then only the games that appear
    in both the user's rated items and game_ids are kept.

    Inputs: 
    user_items: dataframe column of user's rated items (index = game id, value = rating)
    user: user to retrieve (not used inside the function)
    game_ids: the game_ids we are using in our recommender

    Outputs:
    overall_user: dictionary of game id -> rating minus the user's mean
    '''

    # centre the ratings on the user's own mean
    centred_ratings = list(user_items - user_items.mean())

    # ids of everything the user rated, in the same order as the ratings
    rated_ids = list(user_items.index)
    centred_by_id = dict(zip(rated_ids, centred_ratings))

    # keep only the games that are part of the recommender
    usable_ids = set(game_ids).intersection(set(rated_ids))

    return {game_id: centred_by_id[game_id] for game_id in usable_ids}

In [None]:
def process_matrix_to_synthetic(path, num_ratings_create, desired_ratings, game_ids, number):
    '''
    Creates synthetic ratings for every user in one ratings matrix file and saves the result
    twice: as a long-form pickle and as a JSON dump of the ratings dictionary.

    Inputs:
    path: file suffix that selects the ratings matrix; also part of both save names
    num_ratings_create: The total number of minimum ratings per user
    desired_ratings: the needed number of ratings per user
    game_ids: the game_ids we are using in our recommender
    number: string appended to both save names
    '''

    # load the matrix, drop repeated rows, and turn it so each user is a column
    user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_'+path+'.pkl').drop_duplicates(keep='first').T
    user_matrix.index = user_matrix.index.astype('int32')

    # users and their ratings end up here
    synthetic_users_dictionary = {}

    for user in user_matrix.columns:

        print("Starting user "+user)

        # only the games this user actually rated
        user_items = user_matrix[user].dropna(axis=0)

        # real (mean-centred) ratings go to the store; a separate copy is grown during synthesis
        synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
        working_ratings = copy.deepcopy(synthetic_users_dictionary[user])

        original_num_ratings = len(working_ratings)
        print("User starts with "+str(original_num_ratings)+" ratings")

        # synthesize, then keep the top synthetic ratings
        user_comps_dict = produce_synthetic_ratings(user, working_ratings, num_ratings_create)
        sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

    # users as rows, then melt to one (UserID, BGGId, Rating) row per rating
    long_ratings = (
        pd.DataFrame.from_dict(synthetic_users_dictionary).T
        .reset_index()
        .rename(columns={'index':'UserID'})
        .melt(id_vars='UserID', var_name='BGGId', value_name='Rating')
        .dropna()
        .sort_values('UserID')
    )

    # save longform
    long_ratings.to_pickle('synthetic_ratings_new_scraper/synthetic_ratings_'+path+'_'+number+'.pkl')

    # save dictionary
    with open('synthetic_ratings_new_scraper/users_dump_syntheticratings'+path+'_'+number+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(synthetic_users_dictionary))

In [None]:
def sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings):
    '''
    Ranks a user's comps by overall confidence and copies the best ones
    into that user's entry of the shared ratings dictionary.

    The comps are loaded into a dataframe, ordered from highest to lowest
    'OverallConfidence', and fully identical rows are dropped.
    The first desired_ratings rows are kept. For every kept item the complete
    comps entry (the whole list, not only the rating) is written to
    synthetic_users_dictionary[user][item].

    Inputs:
    user: specific user to sort
    synthetic_users_dictionary: dictionary of all users' ratings, updated in place
    user_comps_dict: dictionary of item -> six values, in the column order used below
    original_num_ratings: accepted for compatibility; not read by this version
    desired_ratings: how many of the highest-confidence rows to keep

    Outputs:
    None. synthetic_users_dictionary[user] is modified in place.
    '''
    print("Sorting user items")

    column_names = ['OverallConfidence', 'SimtoLast', 'RecFrom', 'Iteration', 'DegreesAway', 'SyntheticRating']

    # one row per item, highest confidence first
    comps_frame = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=column_names)
    ranked = comps_frame.sort_values('OverallConfidence', ascending=False)
    ranked = ranked.drop_duplicates(keep='first')

    # item ids of the rows we keep, in ascending id order
    top_items = sorted(ranked[:desired_ratings].index)

    # write each kept entry into this user's storage
    user_store = synthetic_users_dictionary[user]
    for item in top_items:
        user_store[item] = user_comps_dict[item]

    

## Old style user data

### Test One User

In [None]:
# Load the cleaned ratings matrix (file suffix 03) and transpose it so that each
# user becomes a column; the row labels are then cast to int32.
# NOTE(review): presumably the rows are BGG game ids after the transpose -- confirm against the pickle.
user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_03.pkl')
user_matrix = user_matrix.T
user_matrix.index = user_matrix.index.astype('int32')

In [None]:
# Choose the user to inspect and look up that user's mean rating.
# NOTE(review): this reads `users_means`, while other code in this notebook reads
# `user_means` -- confirm which global is actually defined before running.
user = 'Monika1234'
user_mean = users_means[user]

In [None]:
# Keep only the rows of this user's column that hold a value, then display them.
user_items = user_matrix[user].dropna(axis=0)
user_items

In [None]:
# Build a small table of this user's non-missing ratings:
# the user's column is renamed to 'Rating' and the old row labels move to an 'index' column.
this_user = pd.DataFrame(user_matrix[user].dropna(axis=0))
this_user.rename(columns={user:'Rating'}, inplace=True)
this_user.reset_index(inplace=True)
# translate each id in 'index' through game_id_lookup
# NOTE(review): presumably game_id_lookup maps game id -> game name; it is defined outside this cell.
this_user['Game'] = this_user['index'].astype('int32').map(game_id_lookup)
#this_user.drop('index', axis=1, inplace=True)
# display the first 30 rows ordered by 'Game'
this_user.sort_values('Game', ascending=True).head(30)

In [None]:
# Look at position 21923 in element 0 of the comps entry for item 298352.
# NOTE(review): 21923 is the same position that produce_synthetic_ratings_all treats
# as "out of comps"; presumably this cell checks whether that position exists -- confirm.
game_comps_byid_lookup[298352][0][21923]

In [None]:
# End-to-end run for the single `user` chosen above: collect the real ratings,
# synthesize more, keep the best ones, and put the result in a dataframe.
# Relies on notebook globals set elsewhere: user, user_mean, user_matrix, game_ids,
# num_ratings_create and desired_ratings.

# set up a synthetic ratings dictionary to store the users and ratings
synthetic_users_dictionary = {}

# placeholder; replaced by the deep copy a few lines below
temp_users_dictionary = {}
    
print("Starting user "+user)

user_items = user_matrix[user].dropna(axis=0)

# copy the current user dictionary to a temp storage dictionary that we can manipulate
synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])
    
# get the original number of ratings by this user
original_num_ratings = len(temp_users_dictionary)

    
# call function to produce synthetic ratings
user_comps_dict = produce_synthetic_ratings_all(user, temp_users_dictionary, num_ratings_create) 
    
# copy the selected synthetic ratings into synthetic_users_dictionary[user]
sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

# one column (named after the user) indexed by item id; rename to BGGId / Rating
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary)
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={'index':'BGGId', user:'Rating'}, inplace=True)
# get_user stores ratings with the user's mean subtracted; add user_mean back
# NOTE(review): user_mean comes from users_means, not from get_user -- confirm both means agree.
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean

In [None]:
# Table of everything now stored for this user: one row per item, values in column 0.
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
# translate the item ids in the index through game_id_lookup
temp2['Game'] = temp2.index.map(game_id_lookup)
# stored value plus the user's mean
temp2['Rating'] = temp2[0]+user_mean
# drop the id column and the raw value column, leaving 'Game' and 'Rating'
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
# display the 100 highest ratings
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
# Lay out every comp produced for this user as a dataframe, highest confidence first,
# with fully identical rows dropped.
# NOTE(review): produce_synthetic_ratings_all stores its iteration counter in the
# fourth slot, which is labelled 'DegreesAway' here.
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')
# add the user's mean to the stored rating
user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
# look up the recommended item (index) and the item it was recommended from
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
# display the 30 highest synthetic ratings
user_comps_df.sort_values('SyntheticRating', ascending=False).head(30)

In [None]:
# Scatter every row of user_comps_df: the 'DegreesAway' column on x, the
# synthetic rating on y, with the user's mean drawn as a horizontal reference line.
fig, ax = plt.subplots(figsize=(20,10))

sns.set(font_scale = 1.5) # set our font scale bigger for this vis

# scatter our data
sns.set_style('darkgrid')
# draw explicitly on `ax` so the axhline/text/legend calls below act on the same axes
scatter2 = sns.scatterplot(x="DegreesAway", y='SyntheticRating', data=user_comps_df,
                           hue='DegreesAway', palette='viridis', s=100, ax=ax)
ax.axhline(user_mean)
# label the mean line slightly above it
ax.text(x=.5, y=(user_mean+.2), s='User Mean '+str(user_mean), alpha=0.7, color='black')

# the hue legend repeats the x axis, so remove it
ax.get_legend().remove()

plt.title(str(desired_ratings)+" Synthetic Ratings for a 10-Rating User", fontsize=30)
plt.xlabel("Steps Away from True Rating", fontsize=20)
plt.ylabel("Rating", fontsize=20)


# tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()
#plt.savefig('images/synthetic_from10.png')
# plt.show() returns None, so no trailing ';' is needed to suppress cell output
plt.show()

### Test One User

In [None]:
# Load the cleaned ratings matrix (file suffix 06) and transpose it so that each
# user becomes a column; the row labels are then cast to int32.
# NOTE(review): presumably the rows are BGG game ids after the transpose -- confirm against the pickle.
user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_06.pkl')
user_matrix = user_matrix.T
user_matrix.index = user_matrix.index.astype('int32')

In [None]:
# Choose the user to inspect and look up that user's mean rating.
# NOTE(review): this reads `users_means`, while other code in this notebook reads
# `user_means` -- confirm which global is actually defined before running.
user = 'zusterdoor'
user_mean = users_means[user]

In [None]:
# Keep only the rows of this user's column that hold a value, then display them.
user_items = user_matrix[user].dropna(axis=0)
user_items

In [None]:
# End-to-end run for the single `user` chosen above: collect the real ratings,
# synthesize more, keep the best ones, and put the result in a dataframe.
# Relies on notebook globals set elsewhere: user, user_mean, user_matrix, game_ids,
# num_ratings_create and desired_ratings.

# set up a synthetic ratings dictionary to store the users and ratings
synthetic_users_dictionary = {}

# placeholder; replaced by the deep copy a few lines below
temp_users_dictionary = {}
    
print("Starting user "+user)

user_items = user_matrix[user].dropna(axis=0)

# copy the current user dictionary to a temp storage dictionary that we can manipulate
synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])
    
# get the original number of ratings by this user
original_num_ratings = len(temp_users_dictionary)

    
# call function to produce synthetic ratings
# NOTE(review): produce_synthetic_ratings is not defined in the visible part of the
# notebook (only produce_synthetic_ratings_all is) -- confirm it exists before running.
user_comps_dict = produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create) 
    
# copy the selected synthetic ratings into synthetic_users_dictionary[user]
sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

# one column (named after the user) indexed by item id; rename to BGGId / Rating
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary)
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={'index':'BGGId', user:'Rating'}, inplace=True)
# get_user stores ratings with the user's mean subtracted; add user_mean back
# NOTE(review): user_mean comes from users_means, not from get_user -- confirm both means agree.
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean

In [None]:
# Table of everything now stored for this user: one row per item, values in column 0.
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
# translate the item ids in the index through game_id_lookup
temp2['Game'] = temp2.index.map(game_id_lookup)
# stored value plus the user's mean
temp2['Rating'] = temp2[0]+user_mean
# drop the id column and the raw value column, leaving 'Game' and 'Rating'
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
# display the 100 highest ratings
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
# Lay out every comp produced for this user as a dataframe, highest confidence first,
# with fully identical rows dropped.
# NOTE(review): this expects five values per item; confirm the producer called in the
# previous cell returns entries of that length and in this column order.
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')
# add the user's mean to the stored rating
user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
# look up the recommended item (index) and the item it was recommended from
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
# display the 20 highest synthetic ratings
user_comps_df.sort_values('SyntheticRating', ascending=False).head(20)

In [None]:
# Scatter every row of user_comps_df: the 'DegreesAway' column on x, the
# synthetic rating on y, with the user's mean drawn as a horizontal reference line.
fig, ax = plt.subplots(figsize=(20,10))

sns.set(font_scale = 2) # set our font scale bigger for this vis

# scatter our data
sns.set_style('darkgrid')
# draw explicitly on `ax` so the axhline/text/legend calls below act on the same axes
scatter2 = sns.scatterplot(x="DegreesAway", y='SyntheticRating', data=user_comps_df,
                           hue='DegreesAway', palette='viridis', s=100, ax=ax)
ax.axhline(user_mean)
# label for the mean line, placed at a fixed y position
# NOTE(review): y=8.1 is hard-coded; presumably chosen to sit near this user's mean -- confirm.
ax.text(x=.2, y=8.1, s='User Mean '+str(user_mean), alpha=0.7, color='black')

# the hue legend repeats the x axis, so remove it
ax.get_legend().remove()

plt.title("100 Synthetic Ratings for a 5-Rating User", fontsize=30)
plt.xlabel("Steps Away from True Rating", fontsize=24)
plt.ylabel("Rating", fontsize=24)


# tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()
#plt.savefig('images/synthetic_from_05.png')
# plt.show() returns None, so no trailing ';' is needed to suppress cell output
plt.show()

### Test One User

In [None]:
# Load the cleaned ratings matrix (file suffix 03) and transpose it so that each
# user becomes a column; the row labels are then cast to int32.
# NOTE(review): presumably the rows are BGG game ids after the transpose -- confirm against the pickle.
user_matrix = pd.read_pickle('data_cleaned/ratings_matrix_cleaned_03.pkl')
user_matrix = user_matrix.T
user_matrix.index = user_matrix.index.astype('int32')

In [None]:
# Choose the user to inspect and look up that user's mean rating.
# NOTE(review): this reads `users_means`, while other code in this notebook reads
# `user_means` -- confirm which global is actually defined before running.
user = 'Szczurek83'
user_mean = users_means[user]

In [None]:
# Keep only the rows of this user's column that hold a value, then display them.
user_items = user_matrix[user].dropna(axis=0)
user_items

In [None]:
# End-to-end run for the single `user` chosen above: collect the real ratings,
# synthesize more, keep the best ones, and put the result in a dataframe.
# Relies on notebook globals set elsewhere: user, user_mean, user_matrix, game_ids,
# num_ratings_create and desired_ratings.

# set up a synthetic ratings dictionary to store the users and ratings
synthetic_users_dictionary = {}

# placeholder; replaced by the deep copy a few lines below
temp_users_dictionary = {}
    
print("Starting user "+user)

user_items = user_matrix[user].dropna(axis=0)

# copy the current user dictionary to a temp storage dictionary that we can manipulate
synthetic_users_dictionary[user] = get_user(user_items, user, game_ids)
temp_users_dictionary = copy.deepcopy(synthetic_users_dictionary[user])
    
# get the original number of ratings by this user
original_num_ratings = len(temp_users_dictionary)

    
# call function to produce synthetic ratings
# NOTE(review): produce_synthetic_ratings is not defined in the visible part of the
# notebook (only produce_synthetic_ratings_all is) -- confirm it exists before running.
user_comps_dict = produce_synthetic_ratings(user, temp_users_dictionary, num_ratings_create) 
    
# copy the selected synthetic ratings into synthetic_users_dictionary[user]
sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

# one column (named after the user) indexed by item id; rename to BGGId / Rating
synthetic_user_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary)
synthetic_user_ratings.reset_index(inplace=True)
synthetic_user_ratings.rename(columns={'index':'BGGId', user:'Rating'}, inplace=True)
# get_user stores ratings with the user's mean subtracted; add user_mean back
# NOTE(review): user_mean comes from users_means, not from get_user -- confirm both means agree.
synthetic_user_ratings['Rating'] = synthetic_user_ratings['Rating']+user_mean

In [None]:
# Table of everything now stored for this user: one row per item, values in column 0.
temp2 = pd.DataFrame(synthetic_users_dictionary[user].values(), index=synthetic_users_dictionary[user].keys())
# translate the item ids in the index through game_id_lookup
temp2['Game'] = temp2.index.map(game_id_lookup)
# stored value plus the user's mean
temp2['Rating'] = temp2[0]+user_mean
# drop the id column and the raw value column, leaving 'Game' and 'Rating'
temp2.reset_index(inplace=True)
temp2.drop(['index', 0], axis=1, inplace=True)
# display the 100 highest ratings
temp2.sort_values('Rating', ascending=False).head(100)

In [None]:
# Persist the Game / Rating table built in the previous cell to a pickle in the working directory.
temp2.to_pickle('scaled_content_filter.pkl')

In [None]:
# Lay out every comp produced for this user as a dataframe, highest confidence first,
# with fully identical rows dropped.
# NOTE(review): this expects five values per item; confirm the producer called in the
# earlier cell returns entries of that length and in this column order.
user_comps_df = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']).sort_values('OverallConfidence', ascending=False).drop_duplicates(keep='first')
# add the user's mean to the stored rating
user_comps_df['SyntheticRating'] = user_comps_df['SyntheticRating']+user_mean
# look up the recommended item (index) and the item it was recommended from
user_comps_df['RecommendedItem'] = user_comps_df.index.map(game_id_lookup)
user_comps_df['Seed'] = user_comps_df['RecFrom'].map(game_id_lookup)
# display the 20 highest synthetic ratings
user_comps_df.sort_values('SyntheticRating', ascending=False).head(20)

## Notebook Functions

In [None]:
def get_user(user_items, user, game_ids):
    '''
    Builds the mean-centered ratings dictionary for one user.

    The mean of user_items is subtracted from every rating, and only items
    that also appear in game_ids are kept.

    Inputs:
    user_items: pandas Series of the user's ratings, indexed by game id
    user: the user being processed (accepted for call compatibility; not read here)
    game_ids: the game ids in use by the recommender

    Outputs:
    overall_user: dictionary of game id -> (rating minus the user's mean rating)
    '''

    # center the ratings on the user's own mean
    centered_ratings = list(user_items - user_items.mean())

    # ids of everything the user rated, in the same order as the ratings
    rated_ids = list(user_items.index)

    # pair each rated id with its centered rating
    rating_by_id = dict(zip(rated_ids, centered_ratings))

    # only ids that the recommender knows about are usable
    usable_ids = set(game_ids).intersection(set(rated_ids))

    overall_user = {game_id: rating_by_id[game_id] for game_id in usable_ids}

    return overall_user

In [None]:
def produce_synthetic_ratings_all(user, temp_users_dictionary, num_ratings_create, max_position=21923):
    '''
    Synthesizes ratings for one user by walking outward from the items they rated.

    Every round, each item currently in temp_users_dictionary (real or already
    synthesized) acts as a seed. A seed contributes the first comp in
    game_comps_byid_lookup[seed][0] that the user does not have yet. That comp gets:
        synthetic rating = seed rating * similarity
        confidence       = seed confidence * similarity
    where similarity is read from game_comps_byid_lookup[seed][1] at the same position.
    Rounds repeat until at least num_ratings_create items exist, or until a whole
    round adds nothing because every seed has run out of unused comps.

    Inputs:
    user: the user id to create ratings for (not read inside this function)
    temp_users_dictionary: dictionary of item -> rating for this user.
        It is updated in place with every synthesized rating.
    num_ratings_create: simple number. Stop once this many items exist. A round always
        finishes, so the result can hold more items than this.
    max_position: comps at this position or later are never used. The default keeps
        the limit that used to be hard-coded.

    Outputs:
    users_comp_dict: dictionary of item -> [confidence, similarity to seed, seed item,
        iteration in which it was added, rating]. Items the user started with
        are stored as [1, 1, item, 0, rating].
    '''
    # start at iteration 0
    iteration = 0

    # populate the comps with the user's baseline items
    users_comp_dict = {
        item: [1, 1, item, 0, rating]
        for item, rating in temp_users_dictionary.items()
    }

    # while the number of items for the user is < the number of ratings needed:
    while len(users_comp_dict) < num_ratings_create:

        iteration += 1  # advance the iteration
        added_this_round = 0

        # snapshot the seeds first: temp_users_dictionary grows inside the loop
        for rated in list(temp_users_dictionary):

            print("Current item: "+str(rated))

            # get rating for current item
            rated_rating = temp_users_dictionary[rated]

            comp_ids = game_comps_byid_lookup[rated][0]
            comp_sims = game_comps_byid_lookup[rated][1]

            # never look past max_position, nor past the end of the comps themselves
            search_limit = min(max_position, len(comp_ids))

            # Walk to the first comp the user does not have yet. Items added earlier in
            # this round are already in users_comp_dict, so this single membership test
            # also skips them.
            current_position = 0
            while (current_position < search_limit
                   and comp_ids[current_position] in users_comp_dict):
                current_position += 1

            # This seed has no unused comp left. Skip it; the previous code kept
            # looping here forever because the comp under test never changed.
            if current_position >= search_limit:
                continue

            current_comp = comp_ids[current_position]

            # getting similarity of the current comp
            comp_similarity = comp_sims[current_position]

            # get the synthetic rating for the item by taking the rating of the base item * similarity
            synthetic_rating = rated_rating * comp_similarity

            # confidence = confidence of prior item * similarity of current item
            confidence = users_comp_dict[rated][0] * comp_similarity

            # record the new item for the user
            users_comp_dict[current_comp] = [confidence, comp_similarity, rated, iteration, synthetic_rating]

            # update the temporary dictionary so the new item can act as a seed next round
            temp_users_dictionary[current_comp] = synthetic_rating

            added_this_round += 1

        # nothing could be added, so another round cannot make progress either
        if not added_this_round:
            print("No unused comps remain for any item; stopping early")
            break

    print("End length of rated items is "+str(len(users_comp_dict))+'\n')

    return users_comp_dict

In [None]:
def sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings):
    '''
    Ranks a user's comps by overall confidence and logs the selected
    synthetic ratings to that user's entry of the shared ratings dictionary.

    The comps are loaded into a dataframe, ordered from highest to lowest
    'OverallConfidence', and fully identical rows are dropped.
    Rows from position original_num_ratings up to (not including) position
    desired_ratings are kept. For each kept item, its 'SyntheticRating' is
    written to synthetic_users_dictionary[user][item].

    Inputs:
    user: specific user to sort
    synthetic_users_dictionary: dictionary of all users' ratings, updated in place
    user_comps_dict: dictionary of item -> five values, in the column order used below
    original_num_ratings: number of leading rows to skip (the number of ratings the user actually made)
    desired_ratings: row position at which to stop keeping ratings

    Outputs:
    None. synthetic_users_dictionary[user] is modified in place.
    '''

    column_names = ['OverallConfidence', 'SimtoLast', 'RecFrom', 'DegreesAway', 'SyntheticRating']

    # one row per item, highest confidence first
    comps_frame = pd.DataFrame(user_comps_dict.values(), index=user_comps_dict.keys(), columns=column_names)
    ranked = comps_frame.sort_values('OverallConfidence', ascending=False)
    ranked = ranked.drop_duplicates(keep='first')

    # rows past the user's original ratings, up to the desired total
    selected = ranked[original_num_ratings:desired_ratings]

    # log each selected rating to the user's storage
    user_store = synthetic_users_dictionary[user]
    for item in selected.index:
        user_store[item] = ranked.loc[item]['SyntheticRating']

    

In [None]:
def process_matrix_to_synthetic(path, num_ratings_create, desired_ratings, game_ids, number):
    '''
    Creates synthetic ratings for every user in one ratings matrix and saves the results.

    Reads 'data_cleaned/ratings_matrix_cleaned_<path>.pkl', drops duplicate rows and
    transposes it so each user is a column. For each user it collects the real ratings
    (get_user), synthesizes more (produce_synthetic_ratings) and keeps the selected
    ones (sort_synthetic_ratings). The combined ratings are saved twice under
    'synthetic_ratings_new_scraper/': as a long-form pickle (UserID, BGGId, Rating)
    and as a JSON dump of the per-user dictionary.

    Inputs:
    path: string appended to the file names to pick the matrix and label the outputs
    num_ratings_create: the total number of minimum ratings per user
    desired_ratings: the needed number of ratings per user
    game_ids: the game ids in use by the recommender
    number: string appended to the output file names

    Outputs:
    None. Two files are written.
    '''

    # load the matrix, remove duplicate rows, and put users on the columns
    ratings_file = 'data_cleaned/ratings_matrix_cleaned_'+path+'.pkl'
    user_matrix = pd.read_pickle(ratings_file).drop_duplicates(keep='first').T
    user_matrix.index = user_matrix.index.astype('int32')

    # users and their ratings accumulate here
    synthetic_users_dictionary = {}

    for user in user_matrix.columns:

        print("Starting user "+user)

        rated_only = user_matrix[user].dropna(axis=0)

        # the real ratings go into storage; synthesis works on a separate deep copy
        real_ratings = get_user(rated_only, user, game_ids)
        synthetic_users_dictionary[user] = real_ratings
        working_copy = copy.deepcopy(real_ratings)

        original_num_ratings = len(working_copy)
        print("User starts with "+str(original_num_ratings)+" ratings")

        # synthesize, then keep the top synthetic ratings
        user_comps_dict = produce_synthetic_ratings(user, working_copy, num_ratings_create)
        sort_synthetic_ratings(user, synthetic_users_dictionary, user_comps_dict, original_num_ratings, desired_ratings)

    # one row per user, then melt to one row per (user, item) rating
    wide_ratings = pd.DataFrame.from_dict(synthetic_users_dictionary).T.reset_index()
    wide_ratings = wide_ratings.rename(columns={'index':'UserID'})
    long_ratings = wide_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
    long_ratings = long_ratings.sort_values('UserID')

    # save longform
    long_ratings.to_pickle('synthetic_ratings_new_scraper/synthetic_ratings_'+path+'_'+number+'.pkl')

    # save dictionary
    with open('synthetic_ratings_new_scraper/users_dump_syntheticratings'+path+'_'+number+'.json', 'w') as convert_file:
        convert_file.write(json.dumps(synthetic_users_dictionary))