# Setup

## Notebook Preparation

### Package Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import json
from statistics import mean
import copy

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)

### Notebook Functions

In [2]:
def get_user(user_matrix, user, game_ids, scaled=True):
    '''
    Takes in a sparse matrix of users and items, a specific user to retrieve, and a list of game_ids
    Get the mean for the user
    Builds a list of user's rated items and subtracts user mean from all ratings
    Builds a corresponding list of game ids for the rated games
    Gets intersection of user's rated ids with the overall game_ids
    Stores user game_id:rating in user ratings dictionary 
    Returns the user dictionary
    
    Inputs: 
    user_matrix: sparse matrix of users and game ratings
    user: user to retrieve
    game_ids: all possible game_ids in matrix
    
    Outputs:
    overall_user: user entry with user's game ratings
    user_mean: user's mean rating
    '''
    
    # get the mean rating for that user
    user_mean = user_matrix[user].mean()
    
    if scaled:
    
        # normalize the ratings for that user by subtracting their mean from all ratings, store in list
        game_ratings = list(user_matrix.loc[user_matrix[user].notna()][user] - user_mean)
    
    else:
        game_ratings = list(user_matrix.loc[user_matrix[user].notna()][user])
    
    # Get a list of all of the game IDs that the user rated (meaning are not NaN)
    users_game_ids = list(user_matrix.loc[user_matrix[user].notna()][user].index)
    
    game_ids_set = set(game_ids).intersection(set(users_game_ids))
    
    # make a dictionary to store their ratings
    user_game_ratings = {}
    
    # for each matching key, value in game_ids and game_ratings for the user
    for key, value in zip(game_ids_set, game_ratings):
        # set as key: value in the game rating dict
        user_game_ratings[int(key)] = value
    
    # Make a dictionary to store the overall user
    overall_user = {}
      
    # store the user's ratings
    #overall_user['Ratings'] = user_game_ratings
    overall_user = user_game_ratings
    
    return overall_user, user_mean

In [3]:
def make_user_dictionaries(path_item, scaled=True):

    temp_dictionary = {}
    
    path = 'data_cleaned/ratings_matrix_cleaned_'+path_item+'.pkl'
    user_matrix = pd.read_pickle(path)
    user_matrix = user_matrix.T
    user_matrix.index = user_matrix.index.astype('int32')
    user_matrix.shape
    
    for user in user_matrix.columns:
    
        #print("Starting user "+user)
    
        # copy the current user dictionary to the synthetic ratings storage dictionary
        temp_dictionary[user], user_mean = get_user(user_matrix, user, game_ids, scaled)
        user_means[user] = user_mean
    
    return temp_dictionary, user_means

# Build Dictionaries of Real Ratings

Don't run this stuff again!!

## Unscaled Ratings

In [105]:
real_user_ratings_dictionary_unscaled = {}

In [116]:
temp_dictionary01, user_means01 = make_user_dictionaries('01', scaled=False)

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary01).T
real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_01 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_01.sort_values('UserID', inplace=True)
real_user_ratings_long_01.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

NameError: name 'temp_dictionary' is not defined

In [117]:
temp_dictionary02, user_means02 = make_user_dictionaries('02', scaled=False)

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary02).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_02 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_02.sort_values('UserID', inplace=True)
real_user_ratings_long_02.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

0

In [118]:
temp_dictionary03, user_means03 = make_user_dictionaries('03', scaled=False)

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary03).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_03 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_03.sort_values('UserID', inplace=True)
real_user_ratings_long_03.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

0

In [119]:
temp_dictionary04, user_means04 = make_user_dictionaries('04', scaled=False)

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary04).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_04 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_04.sort_values('UserID', inplace=True)
real_user_ratings_long_04.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

0

In [120]:
temp_dictionary05, user_means05 = make_user_dictionaries('05', scaled=False)

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary05).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_05 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_05.sort_values('UserID', inplace=True)
real_user_ratings_long_05.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

0

In [121]:
temp_dictionary06, user_means06 = make_user_dictionaries('06', scaled=False)

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary06).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_06 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_06.sort_values('UserID', inplace=True)
real_user_ratings_long_06.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

0

### Lookup Dictionary

In [122]:
real_user_ratings_dictionary_unscaled.update(temp_dictionary01)
real_user_ratings_dictionary_unscaled.update(temp_dictionary02)
real_user_ratings_dictionary_unscaled.update(temp_dictionary03)
real_user_ratings_dictionary_unscaled.update(temp_dictionary04)
real_user_ratings_dictionary_unscaled.update(temp_dictionary05)
real_user_ratings_dictionary_unscaled.update(temp_dictionary06)

In [125]:
len(real_user_ratings_dictionary_unscaled)

102460

In [128]:
with open('real_user_ratings_dictionary_unscaled.json', 'w') as convert_file:
     convert_file.write(json.dumps(real_user_ratings_dictionary_unscaled))

### Longform DF

In [124]:
real_user_ratings_long_unscaled = pd.concat([real_user_ratings_long_01, real_user_ratings_long_02, real_user_ratings_long_03, real_user_ratings_long_04, real_user_ratings_long_05, real_user_ratings_long_06], ignore_index=True)

In [127]:
real_user_ratings_long_unscaled.shape

(11424255, 3)

In [129]:
real_user_ratings_long_unscaled.to_pickle('real_user_ratings_long_unscaled.pkl')

## Scaled Ratings

In [135]:
temp_dictionary01, user_means01 = make_user_dictionaries('01')

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary01).T
real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_01 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_01.sort_values('UserID', inplace=True)
real_user_ratings_long_01.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

0

In [136]:
temp_dictionary02, user_means02 = make_user_dictionaries('02')

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary02).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_02 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_02.sort_values('UserID', inplace=True)
real_user_ratings_long_02.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

0

In [None]:
temp_dictionary03, user_means03 = make_user_dictionaries('03')

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary03).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_03 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_03.sort_values('UserID', inplace=True)
real_user_ratings_long_03.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

In [None]:
temp_dictionary04, user_means04 = make_user_dictionaries('04')

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary04).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_04 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_04.sort_values('UserID', inplace=True)
real_user_ratings_long_04.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

In [None]:
temp_dictionary05, user_means05 = make_user_dictionaries('05')

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary05).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_05 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_05.sort_values('UserID', inplace=True)
real_user_ratings_long_05.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

In [None]:
temp_dictionary06, user_means06 = make_user_dictionaries('06')

real_user_ratings = pd.DataFrame.from_dict(temp_dictionary06).T

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
real_user_ratings_long_06 = real_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
real_user_ratings_long_06.sort_values('UserID', inplace=True)
real_user_ratings_long_06.reset_index(drop=True, inplace=True)

del real_user_ratings

gc.collect()

### Lookup Dictionary

In [122]:
real_user_ratings_dictionary_scaled = {}
real_user_ratings_dictionary_scaled.update(temp_dictionary01)
real_user_ratings_dictionary_scaled.update(temp_dictionary02)
real_user_ratings_dictionary_scaled.update(temp_dictionary03)
real_user_ratings_dictionary_scaled.update(temp_dictionary04)
real_user_ratings_dictionary_scaled.update(temp_dictionary05)
real_user_ratings_dictionary_scaled.update(temp_dictionary06)

In [125]:
len(real_user_ratings_dictionary_scaled)

102460

In [None]:
with open('real_user_ratings_dictionary_scaled.json', 'w') as convert_file:
     convert_file.write(json.dumps(real_user_ratings_dictionary_scaled))

In [None]:
# Opening JSON file
with open('real_user_ratings_dictionary_dump.json') as json_file:
    real_user_ratings_dictionary = json.load(json_file)

### Longform DF

In [None]:
real_user_ratings_long_scaled = pd.concat([real_user_ratings_long_01, real_user_ratings_long_02, real_user_ratings_long_03, real_user_ratings_long_04, real_user_ratings_long_05, real_user_ratings_long_06], ignore_index=True)

In [None]:
real_user_ratings_long_scaled.shape

In [None]:
real_user_ratings_long_scaled.to_pickle('real_user_ratings_long_scaled.pkl')

# Build Dictionary of Synthetic Ratings

## Scaled Ratings

### Lookup Dictionary

In [10]:
# Opening JSON file
with open('synthetic_ratings/users_dump_syntheticratings01.json') as json_file:
    users_dump_syntheticratings01 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings02.json') as json_file:
    users_dump_syntheticratings02 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings03.json') as json_file:
    users_dump_syntheticratings03 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings04.json') as json_file:
    users_dump_syntheticratings04 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings05.json') as json_file:
    users_dump_syntheticratings05 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings06.json') as json_file:
    users_dump_syntheticratings06 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings07.json') as json_file:
    users_dump_syntheticratings07 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings08.json') as json_file:
    users_dump_syntheticratings08 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings09.json') as json_file:
    users_dump_syntheticratings09 = json.load(json_file)

In [12]:
synth_user_ratings_dictionary_scaled = {}
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings01)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings02)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings03)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings04)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings05)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings06)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings07)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings08)
synth_user_ratings_dictionary_scaled.update(users_dump_syntheticratings09)

In [13]:
len(synth_user_ratings_dictionary_scaled)

102460

In [14]:
with open('synth_user_ratings_dictionary_scaled.json', 'w') as convert_file:
     convert_file.write(json.dumps(synth_user_ratings_dictionary_scaled))

In [15]:
del users_dump_syntheticratings01
del users_dump_syntheticratings02
del users_dump_syntheticratings03
del users_dump_syntheticratings04
del users_dump_syntheticratings05
del users_dump_syntheticratings06
del users_dump_syntheticratings07
del users_dump_syntheticratings08
del users_dump_syntheticratings09

del synth_user_ratings_dictionary_scaled

gc.collect()

21

### Longform DF

In [16]:
synthetic01 = pd.read_pickle('synthetic_ratings/synthetic_ratings_01.pkl')
synthetic02 = pd.read_pickle('synthetic_ratings/synthetic_ratings_02.pkl')
synthetic03 = pd.read_pickle('synthetic_ratings/synthetic_ratings_03.pkl')
synthetic04 = pd.read_pickle('synthetic_ratings/synthetic_ratings_04.pkl')
synthetic05 = pd.read_pickle('synthetic_ratings/synthetic_ratings_05.pkl')
synthetic06 = pd.read_pickle('synthetic_ratings/synthetic_ratings_06.pkl')
synthetic07 = pd.read_pickle('synthetic_ratings/synthetic_ratings_07.pkl')
synthetic08 = pd.read_pickle('synthetic_ratings/synthetic_ratings_08.pkl')
synthetic09 = pd.read_pickle('synthetic_ratings/synthetic_ratings_09.pkl')

In [17]:
synth_user_ratings_long_scaled = pd.concat([synthetic01, synthetic02, synthetic03, synthetic04, synthetic05, synthetic06, synthetic07, synthetic08, synthetic09], ignore_index=True)

In [19]:
synth_user_ratings_long_scaled.shape

(53062675, 3)

In [20]:
synth_user_ratings_long_scaled.to_pickle('synth_user_ratings_long_scaled.pkl')

In [21]:
del synthetic01
del synthetic02
del synthetic03
del synthetic04
del synthetic05
del synthetic06
del synthetic07
del synthetic08
del synthetic09

del synth_user_ratings_long_scaled

gc.collect()

84

## Unscaled Ratings

In [4]:
with open('users_dump_means.json') as json_file:
    user_means = json.load(json_file)

### Lookup Dictionary

In [5]:
# Opening JSON file
with open('synthetic_ratings/users_dump_syntheticratings01.json') as json_file:
    users_dump_syntheticratings01 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings02.json') as json_file:
    users_dump_syntheticratings02 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings03.json') as json_file:
    users_dump_syntheticratings03 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings04.json') as json_file:
    users_dump_syntheticratings04 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings05.json') as json_file:
    users_dump_syntheticratings05 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings06.json') as json_file:
    users_dump_syntheticratings06 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings07.json') as json_file:
    users_dump_syntheticratings07 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings08.json') as json_file:
    users_dump_syntheticratings08 = json.load(json_file)
with open('synthetic_ratings/users_dump_syntheticratings09.json') as json_file:
    users_dump_syntheticratings09 = json.load(json_file)

In [6]:
for user in users_dump_syntheticratings01:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings01[user]:
        users_dump_syntheticratings01[user][item] += user_mean   

In [7]:
for user in users_dump_syntheticratings02:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings02[user]:
        users_dump_syntheticratings02[user][item] += user_mean

In [8]:
for user in users_dump_syntheticratings03:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings03[user]:
        users_dump_syntheticratings03[user][item] += user_mean

In [9]:
for user in users_dump_syntheticratings04:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings04[user]:
        users_dump_syntheticratings04[user][item] += user_mean

In [10]:
for user in users_dump_syntheticratings05:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings05[user]:
        users_dump_syntheticratings05[user][item] += user_mean

In [11]:
for user in users_dump_syntheticratings06:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings06[user]:
        users_dump_syntheticratings06[user][item] += user_mean

In [12]:
for user in users_dump_syntheticratings07:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings07[user]:
        users_dump_syntheticratings07[user][item] += user_mean

In [13]:
for user in users_dump_syntheticratings08:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings08[user]:
        users_dump_syntheticratings08[user][item] += user_mean

In [14]:
for user in users_dump_syntheticratings09:
    user_mean = user_means[user]
    for item in users_dump_syntheticratings09[user]:
        users_dump_syntheticratings09[user][item] += user_mean

In [15]:
synth_user_ratings_dictionary_unscaled = {}
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings01)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings02)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings03)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings04)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings05)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings06)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings07)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings08)
synth_user_ratings_dictionary_unscaled.update(users_dump_syntheticratings09)

In [17]:
del users_dump_syntheticratings01
del users_dump_syntheticratings02
del users_dump_syntheticratings03
del users_dump_syntheticratings04
del users_dump_syntheticratings05
del users_dump_syntheticratings06
del users_dump_syntheticratings07
del users_dump_syntheticratings08
del users_dump_syntheticratings09
del user_means

gc.collect()

84

In [20]:
with open('synth_user_ratings_dictionary_unscaled.json', 'w') as convert_file:
     convert_file.write(json.dumps(synth_user_ratings_dictionary_unscaled))

MemoryError: 

### Longform DF

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings01).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_01 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_01.sort_values('UserID', inplace=True)
synth_user_ratings_long_01.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings02).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_02 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_02.sort_values('UserID', inplace=True)
synth_user_ratings_long_02.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings03).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_03 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_03.sort_values('UserID', inplace=True)
synth_user_ratings_long_03.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings04).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_04 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_04.sort_values('UserID', inplace=True)
synth_user_ratings_long_04.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings05).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_05 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_05.sort_values('UserID', inplace=True)
synth_user_ratings_long_05.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings06).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_06 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_06.sort_values('UserID', inplace=True)
synth_user_ratings_long_06.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings07).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_07 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_07.sort_values('UserID', inplace=True)
synth_user_ratings_long_07.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings08).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_08 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_08.sort_values('UserID', inplace=True)
synth_user_ratings_long_08.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [None]:
synth_user_ratings = pd.DataFrame.from_dict(users_dump_syntheticratings09).T

synth_user_ratings.reset_index(inplace=True)
synth_user_ratings.rename(columns={'index':'UserID'}, inplace=True)
synth_user_ratings_long_09 = synth_user_ratings.melt(id_vars='UserID', var_name='BGGId', value_name='Rating').dropna()
synth_user_ratings_long_09.sort_values('UserID', inplace=True)
synth_user_ratings_long_09.reset_index(drop=True, inplace=True)

del synth_user_ratings

gc.collect()

In [17]:
synth_user_ratings_long_unscaled = pd.concat([synth_user_ratings_long_01, synth_user_ratings_long_02, synth_user_ratings_long_03, synth_user_ratings_long_04, synth_user_ratings_long_05, synth_user_ratings_long_06, synth_user_ratings_long_07, synth_user_ratings_long_08, synth_user_ratings_long_09], ignore_index=True)

In [None]:
synth_user_ratings_long_unscaled.shape

In [None]:
synth_user_ratings_long_unscaled.to_pickle('synth_user_ratings_long_unscaled.pkl')

# Build User Means Dictionary

In [None]:
user_means = {}

user_means.update(user_means01)
user_means.update(user_means02)
user_means.update(user_means03)
user_means.update(user_means04)
user_means.update(user_means05)
user_means.update(user_means06)

In [None]:
with open('users_dump_means.json', 'w') as convert_file:
     convert_file.write(json.dumps(user_means))

In [None]:
del real_user_ratings
del real_user_ratings_long
gc.collect()

In [None]:
user_means = pd.DataFrame.from_dict(user_means, columns=['user_mean'], orient='index')

In [None]:
user_means.to_pickle('user_means.pkl')

In [22]:
with open('users_dump_means.json') as json_file:
    user_means = json.load(json_file)

In [23]:
user_means['-Johnny-']

5.293032786885246