# Setup

## Notebook Preparation

### Package Imports

In [None]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import json
from statistics import mean

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler#, OneHotEncoder
#from missingpy import MissForest
#from sklearn.pipeline import Pipeline
#from sklearn.compose import ColumnTransformer
#from sklearn.feature_extraction.text import TfidfTransformer

from scipy import spatial

'''
# preprocessing
from statsmodels.tsa.stattools import adfuller
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_validate, HalvingGridSearchCV, validation_curve, cross_val_score, GridSearchCV, KFold, RepeatedKFold, RandomizedSearchCV

# model tools
import statsmodels.api as sm
from statsmodels.formula.api import ols

import scipy.stats as stats
from scipy.stats import norm

from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, GammaRegressor, HuberRegressor,  Lars, Lasso, SGDRegressor
from sklearn.linear_model import LassoLars, OrthogonalMatchingPursuit, PassiveAggressiveRegressor, PoissonRegressor, RANSACRegressor, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb'''

# scoring and algorithm selection packages
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.inspection import permutation_importance

# visualization packages
#import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline

from surprise import KNNWithMeans, SVD, Dataset, Reader, dump, accuracy, NMF, BaselineOnly
from surprise.model_selection.validation import cross_validate
from surprise.model_selection import KFold
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp, NMF
from surprise.prediction_algorithms.slope_one import SlopeOne
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise.prediction_algorithms.random_pred import NormalPredictor
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise.model_Selection.search import GridSearchCV

### Notebook Functions

In [134]:
def get_user(user_matrix, user, game_ids, scaled=True):
    '''
    Takes in a sparse matrix of users and items, a specific user to retrieve, and a list of game_ids
    Get the mean for the user
    Builds a list of user's rated items and subtracts user mean from all ratings
    Builds a corresponding list of game ids for the rated games
    Gets intersection of user's rated ids with the overall game_ids
    Stores user game_id:rating in user ratings dictionary 
    Returns the user dictionary
    
    Inputs: 
    user_matrix: sparse matrix of users and game ratings
    user: user to retrieve
    game_ids: all possible game_ids in matrix
    
    Outputs:
    overall_user: user entry with user's game ratings
    user_mean: user's mean rating
    '''    
    # get the mean rating for that user
    user_mean = user_matrix[user].mean()
    
    if scaled:
    
        # normalize the ratings for that user by subtracting their mean from all ratings, store in list
        game_ratings = list(user_matrix.loc[user_matrix[user].notna()][user] - user_mean)
    
    else:
        game_ratings = list(user_matrix.loc[user_matrix[user].notna()][user])
    
    # Get a list of all of the game IDs that the user rated (meaning are not NaN)
    users_game_ids = list(user_matrix.loc[user_matrix[user].notna()][user].index)
    
    game_ids_set = set(game_ids).intersection(set(users_game_ids))
    
    # make a dictionary to store their ratings
    user_game_ratings = {}
    
    # for each matching key, value in game_ids and game_ratings for the user
    for key, value in zip(game_ids_set, game_ratings):
        # set as key: value in the game rating dict
        user_game_ratings[int(key)] = value
    
    # Make a dictionary to store the overall user
    overall_user = {}
      
    # store the user's ratings
    #overall_user['Ratings'] = user_game_ratings
    overall_user = user_game_ratings
    
    return overall_user, user_mean

In [133]:
def make_user_dictionaries(path_item, scaled=True):

    temp_dictionary = {}
    
    path = 'data_cleaned/ratings_matrix_cleaned_'+path_item+'.pkl'
    user_matrix = pd.read_pickle(path)
    user_matrix = user_matrix.T
    user_matrix.index = user_matrix.index.astype('int32')
    user_matrix.shape
    
    for user in user_matrix.columns:
    
        #print("Starting user "+user)
    
        # copy the current user dictionary to the synthetic ratings storage dictionary
        temp_dictionary[user], user_mean = get_user(user_matrix, user, game_ids, scaled)
        user_means[user] = user_mean
    
    return temp_dictionary, user_means

In [2]:
def get_user_distances(user_a, all_users_ratings, user_items, v=10):
    '''
    Takes in a user and dictionary of other users and their game ratings. 
    Finds users_b who have an intersection of at least v rated items (both have rated the same v items). 
    Calculates the cosine distance between user_a and each qualifying user_b.
    Stores similarity to users_b in distance_dictionary and returns distance_dictionary.
    
    Inputs:
    user_a: The user that we are finding neighbors for
    all_users_ratings: all users in the system
    user_items: the items that user_a has rated
    v=10: number of items the users must have in common to have their distance scored and recorded
    
    Outputs:
    distance_dictionary: distance between user_a and other users with a set match
    
    '''
    
    # start a list for the user distances
    distance_dictionary = {}
    
    # for each user b in the dictionary:
    for user_b in all_users_ratings:
        
        if user_b == user_a:
            continue
        
        # get a list of the user b reviewed items
        other_user_items = list(all_users_ratings[user_b].keys())
        
        # determine the intersection of the items for user a and user b
        intersection_set = set.intersection(set(user_items), set(other_user_items))
        if len(intersection_set) > v:
            #print(str(len(intersection_set))+" between "+user_a+" and "+user_b)
        
            # make list to store each user a and user b ratings
            user_a_ratings = []
            user_b_ratings = []
        
            # for each item in the intersection set of mutually reviewed items
            for item in intersection_set:
            
                # append user a ratings for the items
                user_a_ratings.append(all_users_ratings[user_a][item])
                # append user b ratings for the items
                user_b_ratings.append(all_users_ratings[user_b][item])
        
            # calculate spatial distance between the two users    
            users_distance = spatial.distance.cosine(user_a_ratings,user_b_ratings)
            #print(users_distance)
        
            # append the distance to the 
            distance_dictionary[user_b] = users_distance
        
        else: continue
    
    return distance_dictionary

In [3]:
def get_estimated_ratings(distance_dictionary, user_mean, k=30):
    '''
    Takes in a distance dictionary and the user_a mean.
    Finds the k closest users
    
    Inputs: 
    distance_dictionary: dictionary of distances between user_a and each qualifying user_b
    user_mean: the rating mean of user_a
    k=30: number of neighbors to consider for making ratings predictions
    
    Outputs:
    user_predicted_ratings: Estimated item ratings for user_a
    '''
    
    # make a df from the distance dictionary and sort on distance, low to high. Take head of k
    nearest_neighbors = pd.DataFrame(distance_dictionary.values(), index=distance_dictionary.keys()).sort_values(0).head(k)
    # send df to lookup dictionary
    neighbors_lookup = nearest_neighbors.to_dict(orient='index')
    
    # make a dictionary to store the predicted ratings for the user
    user_predicted_ratings = {}
    
    # for each item in the game_ids,
    for item in game_id_lookup.keys():
        
        # change the item to a string, because the user dictionaries have string keys
        item = str(item)
        
        # set the number of ratings and base rating to 0 for this item
        num_ratings = 0
        base_rating = 0
        
        # for each neighbor in the user_a neighbor list,
        for neighbor in list(neighbors_lookup.keys()):
        
            # their distance weight is 1-their distance (low is better and a high correlation)
            distance_weight = 1- neighbors_lookup[neighbor][0]
            
            # if the item we are working on is in the neighbor's actual ratings, 
            if item in real_user_ratings_dictionary[neighbor].keys():
                
                # get the neighbor's rating
                neighbor_rating = real_user_ratings_dictionary[neighbor][item]
                
                # weight the rating by their distance_weight
                my_rating = neighbor_rating * distance_weight
                
                # add the rating to the base_ratings score for this item
                base_rating+= my_rating
                
                # add 1 to the number of ratings for this item
                num_ratings += 1
        
        # check that this item had at least 3 ratings added;
        if num_ratings>=1:
            # if so, the rating to add is the base_rating/num_ratings
            total_rating = base_rating/num_ratings
            
        # if the item had <= 3 ratings added, go to the next item
        else: continue
    
        # the estimated rating is the total rating + the user_a mean
        estimated_rating = total_rating + user_mean
        #print(estimated_rating)
        
        # if the total rating ends up over 10, set it to 10 (the max)
        if estimated_rating > 10:
            estimated_rating=10
        
        # put the predicted rating in the user predictions dictionary
        user_predicted_ratings[item] = estimated_rating
    
    # print a report about the user
    total_ratings_created = len(user_predicted_ratings)
    #print("Predicted "+str(total_ratings_created)+' ratings')
        
    return user_predicted_ratings

In [4]:
def calculate_errors(actual_dictionary, predicted_dictionary, user_mean):
    '''
    Takes in actual ratings and predictions
    Gets the intersection of items that user actually rated, and prediction for that item
    Gets MAE and RMSE of actuals vs predictions
    
    Inputs: 
    actual_dictionary: dictionary of single user's actual ratings
    predicted_dictionary: dictionary of single user's predicted ratings
    user_mean: user's mean rating
    
    Outputs:
    user_mae, user_rmse: user's MAE and RMSE
    
    '''
    
    y_actual = []
    y_preds = []
    
    real_rated_and_predicted = list(set.intersection(set(list(actual_dictionary.keys())), set(list(predicted_dictionary.keys()))))
    
    for item in real_rated_and_predicted:
        y_actual.append(actual_dictionary[item] + user_mean)
        y_preds.append(predicted_dictionary[item])  
    
    user_mae = mean_absolute_error(y_preds, y_actual)
    user_rmse = np.sqrt(mean_squared_error(y_preds, y_actual))
    
    #print(user_mae, user_rmse)
    
    return user_mae, user_rmse

In [5]:
def get_user_predictions(all_users_ratings, user_set, v=10, k=30):
    '''
    Gets predictions for a set of users
    
    For each user, makes a list of the user's reviewed items
    Calls on get_user_distances() to find other users with v items in common
    Calls on get_estimated_ratings() to get predictions based on k neighbors
    Gets MAE and RMSE on predictions for items user actually rated
    Stores all predictions to dictionary
    
    Inputs:
    all_users_ratings: all users in the system
    user_set: list of users to get predictions for
    v: (optional) number of required items in the intersection of two user rating sets
        to make neighbor calculation; sends to other function
    k: (optional) nearest neighbors to consider for recommendations; sends to other function
    
    Outputs:
    global_mae, global_rmse: MAE and RMSE for the user set
    predicted_ratings: dictionary of user predictions
    '''
    predicted_ratings = {}
    
    global_start = time.time()
    
    global_mae_list = []
    global_rmse_list = []
    
    for user_a in user_set:
    
        # report on user
        print("Calculating "+user_a)
    
        # get the user's mean rating
        user_mean = user_mean_lookup[user_a]['user_mean']
    
        # make a list of the user_a reviewed items
        user_items = list(all_users_ratings[user_a].keys())
    
        # call the get_user_distances to find the user's neighbors
        distance_dictionary = get_user_distances(user_a, all_users_ratings, user_items, v)
    
        # if no neighbors were found, report and move to the next user
        if len(distance_dictionary) ==0:
            print("Insufficient neighbors found!\n")
            continue
    
        # call get_estimated_ratings to get predictions for user_a
        user_predicted_ratings = get_estimated_ratings(distance_dictionary, user_mean, k)
    
        # store the predicted ratings for the user_a
        predicted_ratings[user_a] = user_predicted_ratings
    
        user_mae, user_rmse = calculate_errors(all_users_ratings[user_a], user_predicted_ratings, user_mean)
    
        global_mae_list.append(user_mae)
        global_rmse_list.append(user_rmse)
        
    global_mae = mean(global_mae_list)
    global_rmse = mean(global_rmse_list)
        
    print("MAE for set of users: "+str(global_mae))
    print("RMSE for set of users: "+str(global_rmse))
        
    return global_mae, global_rmse, predicted_ratings

In [6]:
def processing_pipeline(weight_groups, df):
    '''Takes in train, validation and test sets as well as lists of the cat, cont and polynomial fields, 
    as well as a list of fields to drop. Returns processed feature sets.
    
    Inputs:
    train, val, test: feature sets for train, validation and testing
    categoricals: list of categorical features
    continuous: list of continuous features
    poly: list of features that need polynomials
    drop_fields: list of features to drop after target encoding
    
    Outputs:
    processed_train, processed_val, processed_test: fully processed inputs'''

   
    # continuous pipeline
    family_encoder = Pipeline([
        ('encoder', OneHotEncoder()),
        ('scaler', MinMaxScaler(feature_range=weight_groups[6])),
         ])
    
    # Whole pipeline with continuous then categorical transformers
    total_pipeline = ColumnTransformer([
        ('games_weight_weight', MinMaxScaler(feature_range=weight_groups[0]), ['GameWeight']),
        ('rating_weight', MinMaxScaler(feature_range=weight_groups[1]), ['AvgRating']),  
        ('bayes_weight', MinMaxScaler(feature_range=weight_groups[2]), ['BayesAvgRating']),  
        ('players_weight', MinMaxScaler(feature_range=weight_groups[3]), ['BestPlayers']),
        ('playtime_weight', MinMaxScaler(feature_range=weight_groups[4]), ['Playtime']),
        ('language_weight', MinMaxScaler(feature_range=weight_groups[5]), ['LanguageEase']),
        ('remainder_weight', MinMaxScaler(feature_range=weight_groups[6]), ['Cat:Thematic', 'Cat:Strategy', 'Cat:War',
                       'Cat:Family','Cat:CGS','Cat:Abstract','Cat:Party','Cat:Childrens']),
        #('family_encoder', family_encoder, ['Family'])
                            ]) #, sparse_threshold=0
    
    # Fit and tranform the pipeline on x_train, then transform x_test
    processed = total_pipeline.fit_transform(df)
    
    return processed


## Load Data

* real_user_ratings_dictionary_scaled: dictionary of user's real ratings for games, SCALED ratings
    * format is {user: {str(item): rating}}
    
    
* real_user_ratings_dictionary_unscaled: dictionary of user's real ratings for games, UNSCALED ratings
    * format is {user: {str(item): rating}}
    
    
* real_user_ratings_long_scaled: dataframe of melted user's real ratings for games MINUS means (SCALED)
    * format is Index : UserID : BGGId : Rating
    
    
* real_user_ratings_long_unscaled: dataframe of melted user's real ratings for games
    * format is Index : UserID : BGGId : Rating
    
    
* game_id_lookup: dictionary of game names for game ids
    * format is {id: rating}
    
    
* user_mean_lookup: dictionary of user's mean rating
    * format is {user: {'user mean':mean}}
    

* train_users, val_users, test_users
    * lists of users in train/val/test

##### Split Train/Val/Test User Lists

In [10]:
users_list = list(real_user_ratings_dictionary_scaled.keys())

In [11]:
train_users, remainder = train_test_split(users_list, test_size=.1, random_state=42)
val_users, test_users = train_test_split(remainder, test_size=.5, random_state=42)

len(train_users), len(val_users), len(test_users)

(92214, 5123, 5123)

##### A little fun to get myself and friends into the validation set :)

In [12]:
'Threnody' in train_users

True

In [13]:
train_users.index('Threnody')

63406

In [14]:
train_users.pop(63406)

'Threnody'

In [15]:
val_users.insert(0, 'Threnody')

In [16]:
'Shade92008' in train_users

False

In [17]:
val_users.index('Shade92008')

2143

In [18]:
val_users.pop(2143)

'Shade92008'

In [19]:
val_users.insert(0, 'Shade92008')

In [20]:
'moosh21' in train_users

True

In [21]:
train_users.index('moosh21')

7097

In [22]:
train_users.pop(7097)

'moosh21'

In [23]:
val_users.insert(1, 'moosh21')

##### Game BGGId: Name Lookup Table

In [24]:
# dictionary of game IDs-Names

# Load games
games = pd.read_pickle('data_cleaned/games.pkl')

# lists of game ids and game names
game_ids = list(games['BGGId'])
game_names = list(games['Name'])

# make lookup dictionary
game_id_lookup = {}

# store ids and names in lookup dictionary
for key, item in zip(game_ids, game_names):
    game_id_lookup[key] = item

    
del games
gc.collect()

game_id_lookup

{174430: 'Gloomhaven',
 161936: 'Pandemic Legacy: Season 1',
 224517: 'Brass: Birmingham',
 167791: 'Terraforming Mars',
 291457: 'Gloomhaven: Jaws of the Lion',
 233078: 'Twilight Imperium: Fourth Edition',
 220308: 'Gaia Project',
 187645: 'Star Wars: Rebellion',
 182028: 'Through the Ages: A New Story of Civilization',
 115746: 'War of the Ring: Second Edition',
 162886: 'Spirit Island',
 193738: 'Great Western Trail',
 12333: 'Twilight Struggle',
 169786: 'Scythe',
 84876: 'The Castles of Burgundy',
 173346: '7 Wonders Duel',
 120677: 'Terra Mystica',
 124361: 'Concordia',
 28720: 'Brass: Lancashire',
 167355: 'Nemesis',
 266192: 'Wingspan',
 177736: 'A Feast for Odin',
 205637: 'Arkham Horror: The Card Game',
 183394: 'Viticulture Essential Edition',
 164928: 'Orléans',
 237182: 'Root',
 96848: 'Mage Knight Board Game',
 316554: 'Dune: Imperium',
 199792: 'Everdell',
 3076: 'Puerto Rico',
 102794: 'Caverna: The Cave Farmers',
 175914: 'Food Chain Magnate',
 170216: 'Blood Rage',
 

##### User ID: Mean Lookup Table

In [25]:
# load user means
user_means = pd.read_pickle('user_means.pkl')
user_means

# send the lookup table to dict
user_mean_lookup = user_means.to_dict(orient='index')

user_mean_lookup

{'-Johnny-': {'user_mean': 5.293032786885246},
 '-LucaS-': {'user_mean': 7.717391304347826},
 '-Morphling-': {'user_mean': 7.730769230769231},
 '-mIDE-': {'user_mean': 7.017527675276753},
 '-snarf-': {'user_mean': 7.266666666666667},
 '-toni-': {'user_mean': 6.444897959183673},
 '-xXx-': {'user_mean': 7.2272727272727275},
 '...Hammer': {'user_mean': 6.512738853503185},
 '.JcK.': {'user_mean': 6.896103896103896},
 '0 1 1 2 3 5 8': {'user_mean': 6.8633027522935794},
 '00Bogey': {'user_mean': 8.043478260869565},
 '00daniel00': {'user_mean': 7.916666666666667},
 '00vito': {'user_mean': 6.983072434782608},
 '01151125': {'user_mean': 7.909433962264151},
 '015599m': {'user_mean': 7.867924528301887},
 '01lwilliams': {'user_mean': 7.773684210526315},
 '020907': {'user_mean': 7.342105263157895},
 '0447603': {'user_mean': 7.75},
 '07734': {'user_mean': 7.255102040816326},
 '0815Spieler': {'user_mean': 7.1688311688311686},
 '0CanuckEh': {'user_mean': 7.6923076923076925},
 '0Kage': {'user_mean': 7.

# Tests on Real User Sets (No Synthetic Data)

## Memory Based

Use statistical techniques on the dataset to calculate the predictions.

Steps:

* For each user_a that we want to make recommendations for:

    * Find users_b who have rated v items in common
    * for each user_b:
    
        * Calculate intersection of ratings for user_a and user_b with v minimum items
        * calculate cosine similarity between user_a and user_b
    
    * Determine k closest neighbors_k based on cosine similarity
    * for each item_i in the games dictionary,
        
        * if neighbor_k rated item_i, get their rating (weighted by their similarity)
        * predicted rating for item_i is average of all neighbor's ratings
    
    * get accuracy metrics for user_a based on predictions vs actual for user_a rated items
    * sort predicted rating items and show x top recommendations

##### User Ratings Dictionary - REAL Data only, SCALED

In [8]:
# Opening JSON file
with open('real_user_ratings_dictionary_scaled.json') as json_file:
    real_user_ratings_dictionary_scaled = json.load(json_file)

In [9]:
# preview first user in dictionary
real_user_ratings_dictionary_scaled['-Johnny-']

{'1': 1.706967213114754,
 '4098': -2.293032786885246,
 '3': -2.293032786885246,
 '22532': -0.29303278688524603,
 '5': 2.706967213114754,
 '264198': -1.293032786885246,
 '12296': -0.29303278688524603,
 '10': 0.706967213114754,
 '98315': 2.706967213114754,
 '11': 1.706967213114754,
 '8203': 1.706967213114754,
 '2060': -0.29303278688524603,
 '15': 0.706967213114754,
 '12': -1.293032786885246,
 '13': 0.706967213114754,
 '18': 0.706967213114754,
 '65556': 0.706967213114754,
 '90137': -1.293032786885246,
 '8217': -0.29303278688524603,
 '28': -0.29303278688524603,
 '73761': -0.29303278688524603,
 '2083': 0.706967213114754,
 '94246': -1.293032786885246,
 '41': 0.706967213114754,
 '71721': 0.706967213114754,
 '42': 1.706967213114754,
 '61484': -2.293032786885246,
 '45': 0.706967213114754,
 '57390': -2.293032786885246,
 '4143': 2.706967213114754,
 '20528': 0.706967213114754,
 '49': 0.706967213114754,
 '50': -2.293032786885246,
 '51': -0.29303278688524603,
 '12333': -2.293032786885246,
 '58': -0.

### Finding Optimal v and k

In [54]:
# get distance between users and store in dict
user_predictions_real_basismemory = {}

vs_and_ks = []
mae_tracker = []
rmse_tracker = []

test_set = val_users[:500]

for v in [3, 5, 10]:
    for k in [15, 30, 45, 60]:
        string = 'v'+str(v)+'_k'+str(k)
        global_mae, global_rmse, user_predictions_real_basismemory = get_user_predictions(real_user_ratings_dictionary_scaled, test_set, v=v, k=k)
        vs_and_ks.append(string)
        mae_tracker.append(global_mae)
        rmse_tracker.append(global_rmse)

Calculating Shade92008
Calculating moosh21
Calculating Threnody
Calculating gearoto
Calculating Ferrel
Calculating guingnomevere
Calculating Tsuna10
Calculating ASGupta
Calculating aruett
Calculating Cryp7o
Calculating Alferez
Calculating julcraft
Calculating jdw734
Calculating Krunk377
Calculating rkfan
Calculating DarkPadawan
Calculating Brettgillet
Calculating Jfuller
Calculating kilhk
Calculating Smythyt
Calculating iluvwatts
Calculating sharkimojo
Calculating Shiny Captain
Calculating Cheroking1
Calculating Doctor Snuggles
Calculating Ironpants
Calculating jbrodin
Calculating myood
Calculating crinklechip
Calculating Dax58rs
Calculating datdude
Calculating shelen
Calculating IluvatarIrmo
Calculating DorksGoneWild
Calculating jemfiddlesticks
Calculating edik884
Calculating raaz42
Calculating headlessdog
Calculating Nalekh
Calculating petermenkveld
Calculating badash56
Calculating pbeaver89
Calculating Stoffey
Calculating emdelaney
Calculating Ontoshkyo
Calculating gold92
Calculatin

KeyboardInterrupt: 

In [None]:
zipped = list(zip(mae_tracker, rmse_tracker))
scores = pd.DataFrame(zipped, index=vs_and_ks, columns=['MAE', 'RMSE']).sort_values('RMSE')
scores

Optimal v = 10 and k = 15

Reminders:

* user_predictions_real_basismemory: predictions based on the real data
* real_user_ratings_dictionary: the real ratings, minus the mean
* user_mean_lookup: user means lookup table
* game_id_lookup: look up game name by ID

### Get Val User Predictions

In [32]:
# get distance between users and store in dict
user_predictions_real_basismemory = {}

global_mae, global_rmse, user_predictions_real_basismemory = get_user_predictions(real_user_ratings_dictionary_scaled, val_users, v=10, k=15)

Calculating Shade92008
Calculating moosh21
Calculating Threnody
Calculating gearoto
Calculating Ferrel
Calculating guingnomevere
Calculating Tsuna10
Calculating ASGupta
Calculating aruett
Calculating Cryp7o
Calculating Alferez
Calculating julcraft
Calculating jdw734
Calculating Krunk377
Calculating rkfan
Calculating DarkPadawan
Calculating Brettgillet
Calculating Jfuller
Calculating kilhk
Calculating Smythyt
Calculating iluvwatts
Calculating sharkimojo
Calculating Shiny Captain
Insufficient neighbors found!

Calculating Cheroking1
Insufficient neighbors found!

Calculating Doctor Snuggles
Calculating Ironpants
Calculating jbrodin
Calculating myood
Calculating crinklechip
Calculating Dax58rs
Calculating datdude
Calculating shelen
Calculating IluvatarIrmo
Calculating DorksGoneWild
Calculating jemfiddlesticks
Calculating edik884
Calculating raaz42
Calculating headlessdog
Calculating Nalekh
Calculating petermenkveld
Calculating badash56
Calculating pbeaver89
Calculating Stoffey
Calculating

Calculating MrPanic
Calculating LegionGamer
Calculating plomac
Calculating manolas78
Calculating Ralzakark
Calculating aristis
Calculating Blake the Flake
Calculating ThyholyJebus77
Calculating Niveama
Calculating TradenMyr
Calculating Dashing_Pixel
Calculating WinterSpartan
Calculating Alfredas
Calculating ckhenson
Calculating zerg
Calculating krzysiekk86
Calculating Naabix
Calculating Firemaster
Calculating alyscupcakes
Insufficient neighbors found!

Calculating anomander64
Calculating Blue1389
Calculating Xae0n
Calculating rjwhite1980
Calculating chenchin
Calculating Diggah
Calculating darthpaul50
Calculating elviajedro
Calculating jc4ss
Calculating memoirfan
Calculating Borizzio
Calculating beavis
Calculating CDawgos
Calculating DanThe Man
Calculating Ksempac
Calculating mrbobthompson
Calculating code1484
Calculating JoWheely
Calculating djsyko
Calculating Nolea
Calculating Aiscool
Calculating mrraow
Calculating goddessfelice
Calculating Sherlocked85
Calculating Piewithevil
Calcula

Calculating MrDouglas
Calculating insnichts
Calculating brebee
Calculating BiscottedeLux
Calculating WGHunter
Calculating niaskywalk
Calculating mikejrhodes
Calculating aimoned
Calculating david_finlayson
Calculating Pantera
Calculating ThomasAH
Calculating Tarean
Calculating jduteau
Calculating Malst
Calculating Ugogreevy
Calculating jakoky
Calculating Unorth
Calculating Jenner30
Calculating diamondz123
Calculating Nefta777
Calculating jonathanpost
Calculating phdragon
Calculating tjy100
Calculating darkscorpio23
Calculating DutchAlex
Calculating Game Snob Mom
Calculating RichLuedeke
Calculating Catchroxheat
Calculating jmoshbdn
Calculating thegamergramps
Calculating dbb9h
Insufficient neighbors found!

Calculating Tommy_mx
Calculating Skreczi
Calculating TheArcher
Calculating Melissa0366
Calculating RVA212
Calculating Sengir86
Calculating deathbotelho
Calculating i12bnmovie
Calculating freezer50
Calculating Rayso
Calculating Luffy343
Calculating mdmccu2
Calculating Hamself
Insufficie

Calculating Schwade
Calculating derdefender
Calculating jjbidwell
Calculating Triezz
Calculating rivian16
Calculating BoesJr
Calculating PeterCSM
Calculating shackhuds0n
Calculating agichan
Calculating gimlianon
Calculating Drdrum
Calculating El Colono
Calculating Sethy295
Calculating jman122
Calculating chubaba
Calculating shadowdemon04
Calculating Jgoehle
Calculating bfeeny
Calculating JaxonE
Calculating Sh1fter
Calculating AdamCarr
Calculating bobthewarrior
Calculating shizukapy
Calculating Shade006
Calculating chrissilbereisen
Calculating rwodonnell
Calculating luna_mikichan
Calculating Dr White
Calculating Stawi12
Calculating jgross68
Calculating ConradUno
Calculating Muzzlehead
Calculating RulezNerdHerd
Insufficient neighbors found!

Calculating Husky Seahawk
Calculating thomasbefa
Calculating shasack
Calculating JessHoldcroft
Calculating jwil
Calculating Ravac
Calculating Fortheloveofgod
Calculating Tobibs
Calculating theory
Calculating bnocturnal
Calculating Mich4l
Calculating 

Calculating Wolf310
Calculating matewr84
Calculating Codex1997
Calculating zondo
Calculating Mustardman76
Calculating Ibmurai
Calculating MJYavorsky
Calculating MrAmbrosius
Calculating naughtymime
Insufficient neighbors found!

Calculating Snesko
Calculating santheo
Calculating Moongoyal
Calculating soralapio
Calculating Takao21
Calculating Teriyaki Donuts
Calculating feffi
Calculating JensOldenburg
Calculating free_the_thief
Calculating GLAWEN
Calculating bobule
Calculating Rompt
Insufficient neighbors found!

Calculating eizahn
Calculating Louis XIX
Calculating klbbeach
Calculating socpunt
Insufficient neighbors found!

Calculating pakupaku
Calculating Gauben
Calculating alexlyf
Calculating stangrossman
Calculating tiny_shelob
Calculating TomRic
Calculating Deep Chunk
Calculating manur
Calculating elchupabobra
Calculating morvais
Calculating msander
Calculating sbwilson
Calculating irondeav
Calculating Doc Beard
Calculating Cuspevet
Calculating ice2cold
Calculating pipspriller
Calcul

Calculating Tom Durin
Calculating nixdna
Calculating jbrown527
Calculating Dubyrj
Calculating gripnerd
Calculating Sooner811
Calculating Guthe
Calculating seraph77
Calculating aaleman
Calculating gbebe
Calculating johnweldy
Calculating fenrisrip25
Calculating indifisher
Calculating DuckAndCower
Calculating corruptparty
Calculating Ainulindale
Calculating Dabi
Calculating DigitalElement
Calculating mikeotis
Calculating Ezekiel
Calculating kaiji
Calculating Akindor
Calculating THORNHAVEN
Calculating Ghenov
Calculating deftonesjunkie3
Calculating computarzan
Calculating satoukazu
Insufficient neighbors found!

Calculating jerryfinegan
Calculating TheTrojanFish
Calculating TheNelsonKids
Insufficient neighbors found!

Calculating Shiresan
Calculating bk2020
Calculating bill90867
Calculating MaddMatt
Calculating Bigbrassa
Calculating JohnnyTwoSacks
Calculating FinkleKJ5
Calculating Yosamit3Sam
Calculating gxkitt
Calculating moose0003
Calculating Bugaguy
Calculating dragos_br
Calculating bran

Calculating Saragina
Calculating LordElendil
Insufficient neighbors found!

Calculating spielmann2016
Calculating bigalf82
Calculating Vampiyno
Calculating Otto W
Insufficient neighbors found!

Calculating omgfireants
Calculating Submunch
Calculating Netlimpopo
Calculating lunepremiere79
Calculating Masetheripper
Calculating montag451degrees
Calculating RBandGames
Calculating skazzz
Calculating Benjagol
Calculating citizen
Calculating Sushiboy
Calculating OleMissGirl1968
Calculating anttia
Calculating Drack80
Calculating orthros
Calculating Don Quichotte
Calculating Rao86
Calculating michacham
Calculating Red_Orc
Calculating vorakesh
Calculating wlewisiii
Calculating bolter
Calculating deMosselman
Calculating Belphegor82
Calculating lowkaseo
Calculating wamonite
Calculating DuenielSun
Calculating Gamecrafter
Calculating SlothNast
Calculating ds_ubk
Calculating TheMonkeyMan
Calculating servantoftheboll
Calculating Adamjcollege
Calculating dSword
Calculating Filometri
Calculating Postal7

Calculating Rashnu
Calculating cray
Calculating draig
Calculating luiscarlosqg
Calculating alessandra1982gdt
Calculating Vitru
Calculating Deswing
Calculating Stouthearted
Calculating Annkelia
Calculating Montpelier42
Calculating buffmeister
Calculating BlackSkye
Calculating PraetorianXVIII
Calculating Quintious
Calculating Tama
Calculating cipher43
Calculating Adika88
Calculating Radikus
Insufficient neighbors found!

Calculating Dabshire
Calculating ness
Calculating andy61
Calculating EmpressMyriam
Calculating nekobat
Calculating HatCheese1
Calculating MaksimSmelchak
Calculating nuke3D
Calculating brunswick
Calculating darkzule
Calculating Fumonkey
Calculating AaronThePaisley
Calculating PoppinfreshGWJ
Calculating shall_we_play_a_game
Calculating Hellmiu
Calculating stackey83
Calculating raitoning
Calculating Luk Van Baelen
Calculating descolado1
Calculating Never2manygames
Calculating Greecian
Calculating slateman
Calculating corinaac2012
Calculating RedladyFL
Calculating Villeneuve

Calculating Docwadewill
Calculating Sandman42
Calculating PaxCecilia
Calculating csugioka
Calculating fifteenkeys
Calculating SilverBirch
Calculating mouser2u
Insufficient neighbors found!

Calculating izack
Calculating jmreglero
Calculating Tpatts
Calculating Anzhelika
Calculating Dmoyer0219
Calculating RudyJ
Calculating Jmthomson111
Calculating breakbeatnik
Calculating AnnoDomini12
Calculating Elf__Man
Calculating akzile
Calculating SimonNScott
Calculating Seitenwahl
Calculating Johnnyboyb
Calculating Otarrec
Calculating janzgi
Calculating se6astian
Calculating TarosX
Calculating scottnjulie
Calculating funksta
Calculating worsemorebad
Calculating Crisisinlondon
Calculating MarieKurt
Calculating samrayner
Calculating SandraB80
Calculating RogaDanar
Calculating Shiftian_Be
Calculating jdgavin
Calculating matt1964
Insufficient neighbors found!

Calculating GretSeat
Calculating Coxigeno
Calculating Blitz Wing
Calculating Eleon
Calculating Smilliga
Calculating Mister_Moody
Calculating 2b

Calculating kps1ny
Calculating Biledriver
Calculating boyerling3
Calculating Guacacole
Calculating StarShipAdmiral
Calculating Trooper24
Calculating psenda3nec88
Calculating traico7
Calculating BigSolid
Calculating Koenken
Calculating TheMightyKong
Calculating vf00
Calculating sanunes
Calculating DrSmashty
Calculating Doreen Taylor
Calculating TheGingerDefender
Calculating vmartindale
Calculating grgibian
Calculating the_obvious
Calculating bigmop
Calculating iwancho
Calculating student_of_socrates
Calculating Szilgyo
Calculating Rex Dart
Calculating DerwyddNewydd
Calculating LivDanger
Calculating GeekDadGamer
Calculating nkauppila
Calculating okariot
Calculating PetulantPetunia
Calculating ridogi
Calculating JohnKean
Calculating nechasto
Calculating jspjoerg
Calculating akopoko
Calculating pawjg
Calculating fred4
Calculating jedskywalker
Calculating sgelinas
Calculating apostateant
Calculating BastianAtreyu
Calculating jwise
Calculating jp21763
Calculating Brad Sanford
Calculating Jel

Calculating bran
Calculating Forfonk
Calculating hdjensen
Calculating sdetavern
Calculating ReneSauz
Calculating mr_and_mrs_sharp
Calculating Synaesthesia
Calculating TravmacDaddy
Calculating emmersonpoole
Calculating luchau
Calculating CoralSam
Calculating mymenda
Calculating saturno3
Calculating malinmaes
Calculating topo_sepulveda
Calculating Cheshire Swift
Calculating morelju
Calculating hypeman007
Calculating jlamb
Calculating Leo_Lajs
Calculating jufinace
Calculating raydancer
Calculating baskaajis
Calculating scds
Calculating BavaroAM
Calculating MarsGodOfWar
Calculating Nikku
Calculating PierreLuc
Calculating itazzy
Calculating JamesH
Calculating bcabes
Calculating Papaver
Calculating marvelousfloyd7
Calculating Driggs
Insufficient neighbors found!

Calculating defensivearchitect
Calculating Pexstin
Calculating suzyvitale
Calculating marblemadness44
Calculating ukefish
Calculating mwelsh1118
Calculating alga
Calculating gerwalker
Calculating Phyfrou0077
Calculating lionelandrew

Calculating Finn2206
Calculating Goat1375
Calculating Soulzityr
Calculating explodinggrandma
Calculating jon_dahlberg
Calculating Narfbuster
Calculating nappuccino
Calculating ViridianJMA
Calculating ContainerJones
Calculating Gdau
Calculating raptor777777
Calculating NacheteVBGamer
Calculating Bioforge
Insufficient neighbors found!

Calculating Titus1981
Calculating Collectar
Calculating war3060
Calculating Halbwachs
Calculating benonemusic
Calculating Dennox
Calculating pmiles
Calculating GamesWeLove
Calculating EnvoyPV
Calculating Spleen
Calculating Whitefire7
Calculating quoth the raven
Calculating Ma3dhros
Calculating Bumblebeez
Calculating skluck
Calculating szazon
Calculating Mellonikus
Calculating DarthHarbison
Calculating Ibbo
Calculating Berry
Calculating hadsot
Calculating papuxo
Insufficient neighbors found!

Calculating redroadhome
Calculating gerambolosch
Calculating stijnster
Calculating Neksusu
Insufficient neighbors found!

Calculating genmaes
Calculating Crewman_6
Cal

Calculating ChapChaps21
Calculating thisisnilla
Calculating thebof
Calculating nakamura
Calculating Spicy McHagus
Calculating willyweed1
Calculating warnis
Calculating tim95030
Calculating dvdfjojo
Calculating LeZerp
Calculating coffeetable
Calculating SteadyChad
Calculating Njoltis
Calculating Prophdng
Calculating swordknight
Calculating DaMarsh
Calculating xHOLYxSNIPERx
Calculating OwenGetz
Calculating jmcraven
Calculating L2811
Calculating pauldavis92
Calculating kongkongnan
Calculating ericpetersen
Calculating GorillaGrody
Calculating ninja_nads
Calculating ChopShoey
Calculating Oliv
Calculating kpyam2000
Calculating draxx01
Calculating Faël
Calculating Ikazuchi
Calculating benwang
Calculating Xist
Calculating MeltedPriest
Calculating dogtoken
Calculating SnappyChappy
Calculating braunekristallbombe
Calculating Philip75
Calculating Merraxxess
Calculating bass1012dash
Calculating ValentineS
Calculating Rjan55
Calculating boardlessgamer
Calculating Confessor
Calculating Al Johnson
Ca

Calculating moonie
Calculating MrBlocker
Calculating statman1
Calculating MrMelton
Calculating artanis83
Calculating jonswenson
Calculating Sonodio73
Calculating stoneagewar3
Calculating dotburn
Calculating NerdCartYouTube
Calculating Onceagain
Calculating GhosterlyGus
Calculating Christophe_saeys
Calculating Master_Sgt_Bunny
Calculating faith21
Calculating Moshimon
Calculating Cid_Giraud
Calculating MadAx
Calculating ericbad
Calculating bennbatt
Calculating dgfitch
Insufficient neighbors found!

Calculating brumble666
Calculating EternalSwordsman
Calculating jp06
Calculating frodorik
Calculating chbates
Calculating ZachandWhite
Calculating ThePlayerAW
Calculating mescalin 007
Calculating JeffyJeff
Calculating sargebilko
Calculating Totallyshreded
Calculating ADHDefn
Calculating RVoisin
Calculating Meric68
Calculating skatingtortoise
Calculating MoonlightMiracle
Calculating GeekChaCha
Insufficient neighbors found!

Calculating jtrifts
Calculating utopiaxyz
Calculating khazius
Calculati

Calculating Nethog
Calculating Ton496
Calculating realdealgames
Calculating Madi29
Calculating Confusedpup
Calculating UnknownSoldier
Calculating fagentu007
Calculating Lagake
Calculating Hastur666
Calculating suntron24
Calculating Neilandlauras
Calculating Chevy Denim
Calculating Goldwing2001
Calculating theeth
Calculating gusornot
Calculating Happymoney
Calculating Lashof
Calculating mongoloid
Calculating MGBM
Calculating Rainier88
Calculating pecquereau
Calculating Actuarially
Calculating knikou
Calculating WarrenKNVB
Calculating Malicjusz
Calculating guillemxe
Calculating Dannecus
Calculating TheVest
Calculating benhackman
Insufficient neighbors found!

Calculating reyesc
Calculating Nikadim
Calculating rubiks75
Calculating Flurtis
Calculating c_polhem
Calculating mamwee
Calculating DrDirktheDaring
Calculating izimiz
Calculating kossowankenobi
Calculating illnicko
Calculating eugene256
Calculating vintage_verve
Calculating Starfighter104
Insufficient neighbors found!

Calculating D

In [33]:
global_mae, global_rmse

(0.8065224433561403, 1.0340135138998663)

In [None]:
with open('val_predictions_real_basismemory.json', 'w') as convert_file:
     convert_file.write(json.dumps(val_predictions_real_basismemory))

### Get Recommendations for a User

In [None]:
user = 'Threnody'

In [None]:
mae, rmse, this_user_predictions = get_user_predictions(real_user_ratings_dictionary_scaled, ['Threnody'], v=10, k=15)

In [None]:
predicted_items = list(this_user_predictions[user].keys())
actual_rated_items = list(real_user_ratings_dictionary_scaled[user].keys())

recommended_items = {}

recommendations_list = [item for item in predicted_items if item not in actual_rated_items]
recommendations_list

for item in recommendations_list:
    
    item_name = game_id_lookup[int(item)]
    recommended_items[item_name] = this_user_predictions[user][item]

In [None]:
recommended_items

In [None]:
pd.DataFrame(recommended_items.values(), index=recommended_items.keys(), columns=['Estimated Rating']).sort_values('Estimated Rating', ascending=False).head(30)

## Model Based

* real_user_ratings_dictionary: dictionary of user's real ratings for games
    * format is {user: {str(item): rating}}
* game_id_lookup: dictionary of game names for game ids
    * format is {id: rating}
* user_mean_lookup: dictionary of user's mean rating
    * format is {user: {'user mean':mean}}
* real_user_ratings_long: dataframe of melted user's real ratings for games MINUS means
    * format is Index : UserID : BGGId : Rating
* train_users, val_users, test_users
    * lists of users in train/val/test




Steps:

* Build trainset with all
* Fit to train set
* build trainset with VALIDATION group only
* Build anti-trainset with VALIDATION group only
* Predict on VALIIDATION trainset for metrics
* Predict on VALIIDATION anti-trainset for new
* Get accuracy metrics for actually rated
* Filter out actually rated and report on recommendations

* Hit Precision@k and recall@k ?

##### Melted User Ratings

In [86]:
# load the real user ratings melted df
real_user_ratings_long = pd.read_pickle('real_user_ratings_long.pkl')
real_user_ratings_long.head()

Unnamed: 0,UserID,BGGId,Rating
0,-Johnny-,1,1.706967
1,-Johnny-,2569,2.706967
2,-Johnny-,527,-1.293033
3,-Johnny-,55829,1.706967
4,-Johnny-,37400,-1.293033


In [87]:
real_user_ratings_long.loc[real_user_ratings_long['UserID']=='Threnody']

Unnamed: 0,UserID,BGGId,Rating
4536330,Threnody,199561,0.777515
4536331,Threnody,221194,0.777515
4536332,Threnody,116,-0.222485
4536333,Threnody,118063,0.777515
4536334,Threnody,244711,-0.722485
...,...,...,...
4536494,Threnody,232405,0.777515
4536495,Threnody,228341,-0.222485
4536496,Threnody,198994,-0.222485
4536497,Threnody,17223,0.777515


In [27]:
# check shape
real_user_ratings_long.shape

(11179216, 3)

In [72]:
#scaler = MinMaxScaler(feature_range=(1, 10))
#scaled_ratings = scaler.fit_transform(np.array(real_user_ratings_long['Rating']).reshape(-1,1))
#real_user_ratings_long['Rating'] = scaled_ratings
real_user_ratings_long.head()

Unnamed: 0,UserID,BGGId,Rating
0,-Johnny-,1,6.419345
1,-Johnny-,2569,6.926106
2,-Johnny-,527,4.89906
3,-Johnny-,55829,6.419345
4,-Johnny-,37400,4.89906


##### Make Train/Val/Test Sets

In [88]:
# make train, val, test sets
train_real_ratings = real_user_ratings_long[real_user_ratings_long['UserID'].isin(train_users)]
val_real_ratings = real_user_ratings_long[real_user_ratings_long['UserID'].isin(val_users)]
test_real_ratings = real_user_ratings_long[real_user_ratings_long['UserID'].isin(test_users)]

In [89]:
# check shape of train and test sets
train_real_ratings.shape, val_real_ratings.shape, test_real_ratings.shape

((10070160, 3), (561020, 3), (548036, 3))

In [90]:
val_real_ratings.loc[val_real_ratings['UserID']=='Threnody']

Unnamed: 0,UserID,BGGId,Rating
4536330,Threnody,199561,0.777515
4536331,Threnody,221194,0.777515
4536332,Threnody,116,-0.222485
4536333,Threnody,118063,0.777515
4536334,Threnody,244711,-0.722485
...,...,...,...
4536494,Threnody,232405,0.777515
4536495,Threnody,228341,-0.222485
4536496,Threnody,198994,-0.222485
4536497,Threnody,17223,0.777515


In [91]:
#data_reader = Reader(rating_scale=(1, 10))
data_reader = Reader()

data = Dataset.load_from_df(real_user_ratings_long[['UserID', 'BGGId', 'Rating']], data_reader)

In [92]:
del real_user_ratings_long
gc.collect()

44

### Test Different Algorithms

In [40]:
benchmark = []

In [41]:
algorithms = [SVD(), 
              SlopeOne(), 
              NormalPredictor(), 
              BaselineOnly(), 
              CoClustering(),
             ]

# Iterate over all algorithms
for algorithm in algorithms:
    
    print("New Algorithm "+str(algorithm))
    
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=True)
    
    # Get results & append algorithm name
    results_df = pd.DataFrame.from_dict(results).mean(axis=0)
    results_df = results_df.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(results_df)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')   

New Algorithm <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x0000020DA55B9A48>
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.7311  0.7315  0.7306  0.7311  0.0004  
Fit time          320.00  325.50  324.48  323.33  2.39    
Test time         41.13   40.40   45.43   42.32   2.22    
New Algorithm <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x0000020DA55B9788>
Evaluating RMSE of algorithm SlopeOne on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.7269  0.7271  0.7273  0.7271  0.0002  
Fit time          145.57  141.55  150.45  145.86  3.64    
Test time         612.99  599.61  632.18  614.93  13.37   
New Algorithm <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x0000020DA5655D08>
Evaluating RMSE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE 

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.720427,18.862897,40.254964
SlopeOne,0.727106,145.859146,614.926414
SVD,0.731064,323.326712,42.320422
CoClustering,0.893856,164.138983,44.080482
NormalPredictor,1.019308,9.756461,43.19012


### Fit and Predict

In [93]:
# First fit on the dataset
trainset = data.build_full_trainset()
algo = BaselineOnly()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x245a7710988>

In [94]:
val_data = Dataset.load_from_df(val_real_ratings[['UserID', 'BGGId', 'Rating']], data_reader)
val_set = val_data.build_full_trainset()

In [58]:
#del val_real_ratings
#gc.collect()

In [104]:
# predict on the validation dataset
prediction = algo.predict(uid='moosh21', iid='4099', verbose=True)

user: moosh21    item: 4099       r_ui = None   est = 1.00   {'was_impossible': False}


In [43]:
real_user_ratings_dictionary['Threnody']

{'246784': 2.7775147928994084,
 '9216': 1.7775147928994084,
 '3': -0.2224852071005916,
 '3076': -1.2224852071005916,
 '24068': -1.2224852071005916,
 '15364': 1.7775147928994084,
 '235014': 0.7775147928994084,
 '221194': 0.7775147928994084,
 '11': 0.2775147928994084,
 '10': -1.2224852071005916,
 '13': -1.122485207100592,
 '224783': -0.2224852071005916,
 '1041': -1.2224852071005916,
 '2582': 0.2775147928994084,
 '27162': 0.7775147928994084,
 '31260': -0.2224852071005916,
 '201248': 2.7775147928994084,
 '194594': 0.7775147928994084,
 '555': -0.7224852071005916,
 '3633': -0.2224852071005916,
 '50': -1.2224852071005916,
 '49': 0.7775147928994084,
 '4659': 0.7775147928994084,
 '38453': 0.7775147928994084,
 '254513': -0.2224852071005916,
 '180785': 0.7775147928994084,
 '170042': -0.2224852071005916,
 '162886': -0.2224852071005916,
 '104006': -1.2224852071005916,
 '74': 2.7775147928994084,
 '146508': -1.2224852071005916,
 '590': -0.2224852071005916,
 '201808': 0.2775147928994084,
 '163412': -0

In [84]:
real_user_ratings_dictionary['moosh21']

{'4099': 0.008780487804878057,
 '171011': 0.4087804878048784,
 '143884': 0.5087804878048781,
 '175117': -0.49121951219512194,
 '13': 0.5087804878048781,
 '224783': -1.491219512195122,
 '216092': 2.008780487804878,
 '31260': 0.008780487804878057,
 '177697': -1.491219512195122,
 '173090': 0.3087804878048779,
 '158243': -1.991219512195122,
 '100901': -0.6912195121951221,
 '143401': 1.008780487804878,
 '194607': -1.491219512195122,
 '172081': -1.991219512195122,
 '181304': 1.7087804878048782,
 '282171': 0.008780487804878057,
 '176189': 2.508780487804878,
 '140863': 0.3087804878048779,
 '194626': 1.3087804878048779,
 '189506': 1.2087804878048782,
 '162886': -1.991219512195122,
 '104006': 0.008780487804878057,
 '131144': -0.9912195121951219,
 '146508': -0.9912195121951219,
 '262733': 4.008780487804878,
 '211534': 1.508780487804878,
 '142924': 0.008780487804878057,
 '137297': -1.991219512195122,
 '160851': 0.008780487804878057,
 '163413': 0.7087804878048782,
 '248918': -0.49121951219512194,
 

In [41]:
# predict on the validation dataset
rated_val_predictions = algo.test(val_set)

TypeError: 'Trainset' object is not iterable

In [37]:
# make the antitrain set 
anti_val_set = val_set.build_anti_testset()
# predict on all items not predicted
unrated_val_predictions = algo.test(anti_val_set)

KeyboardInterrupt: 

### TO DO Revert Predictions Transormations

* inverse_transform with the MinMaxScaler
* Add the mean for the BGGId

# Tests with Synthetic Data

In [None]:
# Opening JSON file
with open('synth_user_ratings_dictionary.json') as json_file:
    synth_user_ratings_dictionary = json.load(json_file)

##### Melted User Ratings

In [20]:
# load the real user ratings melted df
synth_user_ratings_long = pd.read_pickle('synth_user_ratings_long.pkl')
synth_user_ratings_long.head()

Unnamed: 0,UserID,BGGId,Rating
0,-Johnny-,1,1.706967
1,-Johnny-,2569,2.706967
2,-Johnny-,527,-1.293033
3,-Johnny-,55829,1.706967
4,-Johnny-,37400,-1.293033


In [22]:
# check shape
synth_user_ratings_long.shape

(11179216, 3)

##### Make Train/Val/Test Sets

In [23]:
# make train, val, test sets
train_synth_ratings = synth_user_ratings_long[synth_user_ratings_long['UserID'].isin(train_users)]
val_synth_ratings = synth_user_ratings_long[synth_user_ratings_long['UserID'].isin(val_users)]
test_synth_ratings = synth_user_ratings_long[synth_user_ratings_long['UserID'].isin(test_users)]

In [24]:
# check shape of train and test sets
train_synth_ratings.shape, val_synth_ratings.shape, test_synth_ratings.shape

((10070365, 3), (560815, 3), (548036, 3))

## Memory Based

### Get Val User Predictions

In [None]:
# get distance between users and store in dict
user_predictions_synth_basismemory = {}

global_mae, global_rmse, user_predictions_synth_basismemory = get_user_predictions(synth_user_ratings_dictionary, val_users, v=v, k=k)

### Get Recommendations for a User

In [None]:
user = 'Threnody'

In [None]:
predicted_items = list(user_predictions_synth_basismemory[user].keys())
actual_rated_items = list(real_user_ratings_dictionary[user].keys())

recommended_items = {}

recommendations_list = [item for item in predicted_items if item not in actual_rated_items]
recommendations_list

for item in recommendations_list:
    
    item_name = game_id_lookup[int(item)]
    recommended_items[item_name] = user_predictions_synth_basismemory[user][item]

In [None]:
pd.DataFrame(recommended_items.values(), index=recommended_items.keys(), columns=['Estimated Rating']).sort_values('Estimated Rating', ascending=False).head(30)

## Model Based

Steps:

* Build trainset with all
* Fit to train set
* build trainset with VALIDATION group only
* Build anti-trainset with VALIDATION group only
* Predict on VALIIDATION trainset for metrics
* Predict on VALIIDATION anti-trainset for new
* Get accuracy metrics for actually rated
* Filter out actually rated and report on recommendations

* Hit Precision@k and recall@k ?

In [None]:
scaler = MinMaxScaler(feature_range=(1, 10))
scaled_ratings = scaler.fit_transform(np.array(synth_user_ratings_long['Rating']).reshape(-1,1))
synth_user_ratings_long['Rating'] = scaled_ratings
synth_user_ratings_long.head()

In [None]:
data_reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(synth_user_ratings_long[['UserID', 'BGGId', 'Rating']], data_reader)

In [None]:
del synth_user_ratings_long
gc.collect()

In [None]:
benchmark = []

In [None]:
algorithms = [SVD(), 
              SlopeOne(), 
              NormalPredictor(), 
              BaselineOnly(), 
              CoClustering(),
             ]

# Iterate over all algorithms
for algorithm in algorithms:
    
    print("New Algorithm "+str(algorithm))
    
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=True)
    
    # Get results & append algorithm name
    results_df = pd.DataFrame.from_dict(results).mean(axis=0)
    results_df = results_df.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(results_df)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')   

In [None]:
bsl_options = {'method': 'sgd'}

algorithms = [BaselineOnly(bsl_options=bsl_options)
             ]

# Iterate over all algorithms
for algorithm in algorithms:
    
    print("New Algorithm "+str(algorithm))
    
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    results_df = pd.DataFrame.from_dict(results).mean(axis=0)
    results_df = results_df.append(pd.Series(['Baseline_SGD'], index=['Algorithm']))
    benchmark.append(results_df)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')   

In [None]:
# First fit on the dataset
trainset = data.build_full_trainset()
algo = BaselineOnly()
algo.fit(trainset)

In [None]:
# scale transform the validation set ratings
scaled_ratings = scaler.transform(np.array(val_synth_ratings['Rating']).reshape(-1,1))
val_synth_ratings['Rating'] = scaled_ratings
val_synth_ratings.head()

In [None]:
val_data = Dataset.load_from_df(val_synth_ratings[['UserID', 'BGGId', 'Rating']], data_reader)

val_set = val_data.build_full_trainset()

# predict on the validation dataset
rated_val_predictions = algo.predict(val_set)

# make the antitrain set 
anti_val_set = val_set.build_anti_testset()
# predict on all items not predicted
unrated_val_predictions = algo.predict(anti_val_set)

# Analysis

To Do: Which hybrid system am I using ?

# Future Work

##### Deployment

GUI to allow user to enter BGG user id, and provide list of recommendations

##### What if user has no BGG id or no rated items on BGG?

Cold-start survey is needed to pick up on some user basics in order to populate initial synthetic user matrix

    * Ask user what kinds of games they like - offer checkboxes
    * ask them to rate some of the top games in that category
    


Allow for the user to specify the things that are important to them

# Appendix

In [None]:
break