# Setup

## Notebook Preparation

### Package Imports

In [None]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc

# Silence every warning (mainly the pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')

# Show all columns, but cap the number of displayed rows at 30
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from missingpy import MissForest
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfTransformer



'''
# preprocessing
from statsmodels.tsa.stattools import adfuller
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_validate, HalvingGridSearchCV, validation_curve, cross_val_score, GridSearchCV, KFold, RepeatedKFold, RandomizedSearchCV

# model tools
import statsmodels.api as sm
from statsmodels.formula.api import ols

import scipy.stats as stats
from scipy.stats import norm

from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, GammaRegressor, HuberRegressor,  Lars, Lasso, SGDRegressor
from sklearn.linear_model import LassoLars, OrthogonalMatchingPursuit, PassiveAggressiveRegressor, PoissonRegressor, RANSACRegressor, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb

# scoring and algorithm selection packages
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.inspection import permutation_importance'''

# visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from surprise import KNNWithMeans, SVD, Dataset, Reader, dump, accuracy
from surprise.model_selection.validation import cross_validate
from surprise.model_selection import KFold

### Notebook Functions

##### Processing Functions

In [None]:
def processing_pipeline(weight_groups, df):
    '''Min-max scale the core game features, each into its own weighted range.

    Every column listed below is scaled independently with a MinMaxScaler
    whose feature_range comes from weight_groups, so a larger upper bound
    lets that feature span a wider range in the output.

    Inputs:
    weight_groups: sequence of (min, max) tuples, read by position:
        0 GameWeight, 1 AvgRating, 2 BayesAvgRating, 3 BestPlayers,
        4 Playtime, 5 LanguageEase, 6 the eight 'Cat:*' columns.
        Entries past index 6 are not used here.
    df: DataFrame containing every column named above.

    Outputs:
    processed: result of ColumnTransformer.fit_transform, with the columns
        in the order listed above; columns of df not listed are dropped
        (the ColumnTransformer default).'''

    category_columns = ['Cat:Thematic', 'Cat:Strategy', 'Cat:War', 'Cat:Family',
                        'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']

    # (transformer name, position in weight_groups, columns to scale)
    scaler_specs = [
        ('games_weight_weight', 0, ['GameWeight']),
        ('rating_weight', 1, ['AvgRating']),
        ('bayes_weight', 2, ['BayesAvgRating']),
        ('players_weight', 3, ['BestPlayers']),
        ('playtime_weight', 4, ['Playtime']),
        ('language_weight', 5, ['LanguageEase']),
        ('remainder_weight', 6, category_columns),
    ]

    total_pipeline = ColumnTransformer([
        (name, MinMaxScaler(feature_range=weight_groups[position]), columns)
        for name, position, columns in scaler_specs
    ])

    # Fit and transform in one pass; there is no separate held-out set here
    return total_pipeline.fit_transform(df)


In [None]:
def tfidf_dataset(dataset, weights, transpose_toggle=True):
    '''Optionally TF-IDF weight a feature matrix, then min-max scale it.

    Inputs:
    dataset: numeric DataFrame, one column per feature; a 'BGGId' column,
        if present, is dropped before any processing
    weights: (min, max) tuple passed to MinMaxScaler as feature_range
    transpose_toggle: when True, apply TF-IDF and then scale each row into
        the weights range; when False, skip TF-IDF and scale each column

    Outputs:
    scaled_dataset: DataFrame with the same feature columns as the input
        (minus 'BGGId') and a fresh default integer index'''

    # errors='ignore' makes the drop a no-op when 'BGGId' is absent, without
    # hiding unrelated failures the way a bare except would
    dataset_pared = dataset.drop(columns='BGGId', errors='ignore')

    # column names to reapply after the transformers return plain arrays
    titles = list(dataset_pared.columns)

    # set up weighted scaler
    scaler = MinMaxScaler(feature_range=weights)

    if not transpose_toggle:
        return pd.DataFrame(scaler.fit_transform(dataset_pared), columns=titles)

    # convert matrix to tfidf (the transformer returns a sparse matrix)
    tfidf = TfidfTransformer()
    tfidf_frame = pd.DataFrame(tfidf.fit_transform(dataset_pared).toarray(), columns=titles)

    # MinMaxScaler works per column, so scale the transpose to scale by row
    transpose_scaled = scaler.fit_transform(tfidf_frame.T)

    # transpose back and rebuild the data frame
    return pd.DataFrame(transpose_scaled.T, columns=titles)

In [None]:
def scale_dataset(dataset, weights, transpose_toggle=True):
    '''Replace each positive entry with its column's share of all entries,
    then min-max scale every row into the weights range.

    Inputs:
    dataset: numeric DataFrame, one column per feature; a 'BGGId' column,
        if present, is dropped before any processing. The caller's frame
        is never modified.
    weights: (min, max) tuple passed to MinMaxScaler as feature_range
    transpose_toggle: accepted for signature parity with tfidf_dataset;
        not used by this function

    Outputs:
    scaled_dataset: DataFrame with the same feature columns as the input
        (minus 'BGGId') and a fresh default integer index'''

    # drop() always returns a new frame and astype() makes it float, so the
    # fractional values assigned below neither touch the caller's data nor
    # rely on implicit dtype upcasting (e.g. for bool/uint8 dummy columns)
    dataset_pared = dataset.drop(columns='BGGId', errors='ignore').astype(float)

    # column names to reapply after the scaler returns a plain array
    titles = list(dataset_pared.columns)

    # Total over the feature columns only, so id values are never counted.
    # The row-wise min-max scaling below is invariant to this constant
    # factor, so the output matches the old total up to float rounding.
    total_entries = dataset_pared.sum().sum()

    for item in titles:
        dataset_pared.loc[dataset_pared[item] > 0, item] = dataset_pared[item].sum() / total_entries

    # MinMaxScaler works per column, so scale the transpose to scale by row
    scaler = MinMaxScaler(feature_range=weights)
    transpose_scaled = scaler.fit_transform(dataset_pared.T)

    return pd.DataFrame(transpose_scaled.T, columns=titles)


# Content Based Filtering

## TO DO

Fix duplicate game names (for example, Coup)

## Set Weights

A weight set I am very happy with

games_weight_weight = (0, 1)
rating_weight = (0, 1)
bayes_weight = (0, 1)
players_weight = (0, .5)
playtime_weight = (0, .75) 
language_weight = (0, .25)
mechanics_weight = (0, .5)
designers_weight = (0, .5)
#publisher_weight = (0, .1)
categories_weight = (0, .5)
#artist_weights = (0, .25)
#awards_weights = (0, .25)
family_weights = (0, .75)

In [None]:
# Each tuple is the (min, max) feature_range given to a MinMaxScaler for
# that feature group.

# core game statistics
games_weight_weight = (0, 1)
rating_weight = (0, 1)
bayes_weight = (0, 1)
players_weight = (0, 0.5)
playtime_weight = (0, 0.75)
language_weight = (0, 0.25)

# descriptive groups
mechanics_weight = (0, 0.5)
designers_weight = (0, 0.5)
categories_weight = (0, 0.5)
family_weights = (0, 0.75)

# currently disabled groups
#publisher_weight = (0, .1)
#artist_weights = (0, .25)
#awards_weights = (0, .25)

## Load and Prep Data

In [None]:
games = pd.read_pickle('data_cleaned/games.pkl')

# Playtime is the midpoint of the community min and max playtimes.
# (np.mean of the already-summed scalar returned the sum, not the average.)
games['Playtime'] = (games['ComMinPlaytime'] + games['ComMaxPlaytime']) / 2

# Cap long games at 360 so a few outliers do not compress the scaled range
over_6_hours = list(games.loc[games['Playtime'] > 360].index)
games.loc[over_6_hours, 'Playtime'] = 360


# Feature tables stored alongside the games table
mechanics = pd.read_pickle('data_cleaned/mechanics.pkl')
designers = pd.read_pickle('data_cleaned/designers_reduced.pkl')
publishers = pd.read_pickle('data_cleaned/publishers_reduced.pkl')
artists = pd.read_pickle('data_cleaned/artists_reduced.pkl')
awards = pd.read_pickle('data_cleaned/awards_reduced.pkl')


games.head()

### TF-IDF and Scale Datasets

##### Clean up mechanics

In [None]:
# Clean up mechanics: fold narrow mechanic columns into broader ones

def _flag_from_sources(frame, sources, target):
    '''Set target to 1 on every row where any of the sources columns equals 1.'''
    hit_rows = (frame[sources] == 1).any(axis=1)
    frame.loc[hit_rows, target] = int(1)


auction_list = ['Auction: Dexterity','Auction: Dutch','Auction: Dutch Priority',
                'Auction: Fixed Placement','Auction: English','Auction: Once Around','Auction: Sealed Bid',
                'Auction: Turn Order Until Pass','Multiple-Lot Auction','Closed Economy Auction','Selection Order Bid',
                'Constrained Bidding']

turn_order_list = ['Turn Order: Auction','Turn Order: Claim Action','Turn Order: Pass Order',
                   'Turn Order: Progressive','Turn Order: Random','Turn Order: Role Order','Turn Order: Stat-Based']

dumb_physical_list = ['Acting','Hot Potato','Singing','Rock-Paper-Scissors']

drafting = ['Card Drafting']

legacy = ['Legacy']

worker_placement = ['Worker Placement with Dice Workers','Worker Placement, Different Worker Types'] #'Worker Placement',

# worker placement variants -> 'Worker Placement'
_flag_from_sources(mechanics, worker_placement, 'Worker Placement')
mechanics.drop(worker_placement, axis=1, inplace=True)

# auction variants -> 'Auction/Bidding'
_flag_from_sources(mechanics, auction_list, 'Auction/Bidding')
mechanics.drop(auction_list, axis=1, inplace=True)

# physical party mechanics -> new 'Physical' column, 0 unless flagged
mechanics['Physical'] = int(0)
_flag_from_sources(mechanics, dumb_physical_list, 'Physical')
mechanics.drop(dumb_physical_list, axis=1, inplace=True)

# 'Card Drafting' -> 'Drafting' and 'Legacy' -> 'Legacy Game'
_flag_from_sources(mechanics, drafting, 'Drafting')
_flag_from_sources(mechanics, legacy, 'Legacy Game')

# turn-order mechanics are discarded outright, as are the folded sources
mechanics.drop(turn_order_list, axis=1, inplace=True)
mechanics.drop(drafting + legacy, axis=1, inplace=True)

In [None]:
# TF-IDF weight the mechanics and scale each game into the mechanics range
scaled_mechanics = tfidf_dataset(mechanics, mechanics_weight)

# Games whose mechanics row sums to zero get their own indicator column
has_no_mechanics = scaled_mechanics.sum(axis=1) == 0
no_mechanics_index = list(scaled_mechanics.index[has_no_mechanics.to_numpy()])

scaled_mechanics['No Mechanics'] = 0
# NOTE(review): the flag is the reciprocal of the upper weight bound, so it
# exceeds that bound whenever the bound is below 1 - confirm this is intended
scaled_mechanics.loc[no_mechanics_index, 'No Mechanics'] = 1 / mechanics_weight[1]

scaled_mechanics.head(10)

##### TF-Scale Datasets

In [None]:
# Scale each feature group with scale_dataset (frequency share, then
# row-wise min-max). The commented-out groups are currently disabled.

# scaled awards (disabled)
#scaled_awards = scale_dataset(awards, awards_weights)

# scaled designers
scaled_designers = scale_dataset(designers, designers_weight)

# scaled publishers (disabled)
#scaled_publishers = scale_dataset(publishers, publisher_weight)

# scaled artists (disabled)
#scaled_artists = scale_dataset(artists, artist_weights)

# scaled game families: one dummy column per distinct 'Family' value
game_families = pd.get_dummies(games['Family'])
scaled_families = scale_dataset(game_families, family_weights)

scaled_designers.head(10)

### Master CBF Frame

In [None]:
# Columns of the games table that feed the content-based feature frame
games_included_columns = [
    'GameWeight', 'AvgRating', 'BayesAvgRating', 'BestPlayers', 'Playtime', 'LanguageEase',
    'Cat:Thematic', 'Cat:Strategy', 'Cat:War', 'Cat:Family',
    'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens',
]

game_names = list(games['Name'])
game_ids = list(games['BGGId'])

# name -> id; when a name occurs more than once the last id wins
game_lookup = dict(zip(game_names, game_ids))

# Impute missing values in the selected columns with MissForest
imputer = MissForest()
imputed_values = imputer.fit_transform(games[games_included_columns])
scaled_games = pd.DataFrame(imputed_values, columns=games_included_columns)

In [None]:
# Order matters: processing_pipeline reads these ranges by position
weight_groups = [
    games_weight_weight, rating_weight, bayes_weight, players_weight,
    playtime_weight, language_weight, categories_weight, family_weights,
]
weighted_values = processing_pipeline(weight_groups, scaled_games)
scaled_games = pd.DataFrame(weighted_values, columns=games_included_columns)

# Join the feature groups side by side
# (disabled groups: scaled_artists, scaled_awards, scaled_publishers)
feature_frames = (scaled_games, scaled_mechanics, scaled_families, scaled_designers)
master_games = pd.concat(feature_frames, axis=1)

game_and_id = list(zip(game_names, game_ids))

# Index the master frame by game name
master_games['Name'] = game_names
master_games.set_index('Name', inplace=True)

master_games.head(10)

In [None]:
# Spot-check the feature rows of a few named games
master_games.loc[['Pandemic', 'Pandemic: Reign of Cthulhu', 'World of Warcraft: Wrath of the Lich King', 'Pandemic: Fall of Rome', 'Pandemic Legacy: Season 1', 'Pandemic Legacy: Season 0']]

## Item Similarity via Cosine Distance

In [None]:
# Pairwise cosine similarity between the rows of master_games
cosine_sims = cosine_similarity(master_games)

In [None]:
# Similarity matrix labelled by game name on both axes
name_index = pd.Index(game_names, name='Game_Name')
sims_byname = pd.DataFrame(cosine_sims, index=name_index, columns=game_names)
sims_byname

In [None]:
# Similarity matrix labelled by BGG id on both axes
id_index = pd.Index(game_ids, name='Game_Id')
sims_byid = pd.DataFrame(cosine_sims, index=id_index, columns=game_ids)
sims_byid

In [None]:
# Rescale every similarity column into the range -1 to 1
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaled_sims = scaler.fit_transform(sims_byid)

scaled_comps = pd.DataFrame(
    rescaled_sims,
    index=pd.Index(game_ids, name='Game_Id'),
    columns=game_ids,
)
scaled_comps

In [None]:
# Uncomment to save the name-indexed similarity matrix to disk
#sims_byname.to_pickle('data_cleaned/game_cosine_similarity.pkl')

### CHECK GAME HERE

In [None]:
# Id stored in game_lookup for this title
game_lookup['Pandemic Legacy: Season 0']

In [None]:
# Benchmark titles mapped to the column label used for their similarity scores
benchmark_games = {
    'Dominion': 'D_Sim',
    'Gloomhaven': 'G_Sim',
    'Pandemic': 'Pa_Sim',
    'Splendor': 'Sp_Sim',
    'Viticulture Essential Edition': 'V_Sim',
    'Agricola': 'Ag_Sim',
    'Homesteaders': 'H_Sim',
    'Puerto Rico': 'Pu_Sim',
    'Chess': 'Ch_Sim',
    'Backgammon': 'B_Sim',
    'Sagrada': 'Sa_Sim',
    'Azul': 'Az_Sim',
    'Codenames': 'Co_Sim',
    'Secret Hitler': 'Se_Sim',
    'Monopoly': 'M_Sim',
    'Lords of Waterdeep': 'L_Sim',
    'Stone Age': 'St_Sim',
    'Century: Spice Road': 'Ce_Sim',
    'Scrabble': 'Sc_Sim',
}

test_dict = {}
for title, sim_label in benchmark_games.items():
    # Sort once per game. Position 0 is skipped (expected to be the game
    # itself), leaving the next 14 entries.
    nearest = sims_byname[title].sort_values(ascending=False)[1:15]
    test_dict[title] = list(nearest.index)
    test_dict[sim_label] = list(nearest)

pd.DataFrame(test_dict)

In [None]:
# Full similarity ranking for one game, skipping the first (top) position
game = 'Caylus'

ranked = sims_byname[game].sort_values(ascending=False)
results = pd.DataFrame({'Similarity': ranked[1:]})
results.head(30)

Dominion, Gloomhaven, Pandemic, Splendor, Viticulture Essential Edition, Agricola, Secret Hitler, Codenames, Azul, Sagrada, Homesteaders, Puerto Rico, Chess, Backgammon, Monopoly, Lords of Waterdeep, Stone Age, Century: Spice Road

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably a
# deliberate stop so a full run does not continue past here - confirm
break

## Up Next

With the similarity weights tuned, the next steps are:

* Load in the user matrix
* For each user:
    * get the user's average rating
    * for each item that the user has rated, get the full list of comps with similarities
    
            
    

## Load User Matrix

In [None]:
# Load the ratings matrix from disk.
# NOTE(review): `user_user_filter` is bound to the same object, so deleting
# `user_matrix` alone will not release the frame - confirm the alias is needed
user_matrix = user_user_filter = pd.read_pickle('data_cleaned/ratings_matrix.pkl')

In [None]:
# (rows, columns) of the full ratings matrix
user_matrix.shape

In [None]:
# make a list of items with more than 30 ratings
sums = pd.DataFrame(user_matrix.count()>=30)

# get indices for the columns with more than 30 ratings
keep_these = sums.loc[sums[0]==True].index

smaller_matrix = user_matrix[keep_these]

In [None]:
# Shape after keeping only the items with at least 30 ratings
smaller_matrix.shape

In [None]:
# Flag each row (user) that has at least 100 ratings among the kept items
sums = pd.DataFrame(smaller_matrix.count(axis=1) >= 100)

# Labels of the rows that fall short of 100 ratings
drop_these = sums.index[~sums[0].to_numpy()]

# Remove those rows in place
smaller_matrix.drop(drop_these, axis=0, inplace=True)

In [None]:
# Shape after also dropping the users with fewer than 100 ratings
smaller_matrix.shape

In [None]:
# Drop the name of the full matrix and run a collection pass.
# NOTE(review): the load cell also bound `user_user_filter` to this object,
# so the memory stays referenced until that name is removed too - confirm
del user_matrix
gc.collect()

In [None]:
# Small working sample: first 10 rows, transposed, then the first 100 rows
# of the transposed frame
smaller_matrix = smaller_matrix.iloc[:10].T.iloc[:100]
smaller_matrix

In [None]:
# Per-user summary: the user's mean rating plus a mapping from each rated
# item to its mean-centred rating
user_dictionary = {}

for user in smaller_matrix.columns:

    user_column = smaller_matrix[user]
    user_mean = user_column.mean()

    # Select the rated items once. Local names are used so the global
    # `game_ids` list (ids of the whole catalogue) is not overwritten.
    rated = user_column.dropna()

    user_dictionary[user] = {
        'Mean': user_mean,
        'Ratings': dict(zip(rated.index, rated - user_mean)),
    }

In [None]:
# Inspect the stored entry ('Mean' and 'Ratings') for one user
user_dictionary['-Johnny-']

In [None]:
# One-column frame of this user's mean-centred ratings, indexed by item
johnny_ratings = user_dictionary['-Johnny-']['Ratings']
user_df = pd.DataFrame(johnny_ratings.values(), index=johnny_ratings.keys(), columns=['Rating'])
user_df

In [None]:
# Mean-centred rating this user gave to one item (key is the string '21241')
user_dictionary['-Johnny-']['Ratings']['21241']

In [None]:
# Number of items this user has rated
len(user_dictionary['-Johnny-']['Ratings'])

In [None]:
# Highest similarity to item 21241 after skipping the first sorted position
list(sims_byid[21241].sort_values(ascending=False)[1:21])[0]

In [None]:
# For every item the user rated, walk its 20 closest comps and report the
# ones the user has not rated yet
johnny_ratings = user_dictionary['-Johnny-']['Ratings']

# sims_byid is labelled with integer ids; normalise the rating keys to
# integers so the membership test compares like with like
already_rated = {int(rated_id) for rated_id in johnny_ratings}

for item in johnny_ratings:

    item_int = int(item)

    # Sort once; position 0 is skipped (expected to be the item itself)
    top_comps = sims_byid[item_int].sort_values(ascending=False).iloc[1:21]

    # A separate loop name keeps `item` pointing at the rated source item
    for comp_id, similarity in top_comps.items():

        if comp_id in already_rated:
            print("Top comp is already rated, trying again")
            continue

        print("Computing comp")
        print(similarity)
        # The comp is unrated, so the rating shown is that of the rated
        # source item (assumed intent - TODO confirm)
        print(johnny_ratings[item])
        
        

In [None]:
# Positions 1-20 of the descending similarity ranking for item 68448
sims_byid[68448].sort_values(ascending=False)[1:21]

In [None]:
# Collect comp id -> similarity over the ten closest comps of each sampled
# item; a comp reached from several items keeps the last similarity seen.
# NOTE(review): `rated_items_sample` is not defined in the visible notebook
comps = {}

for game in rated_items_sample:

    game = int(game)

    nearest = sims_byid[game].sort_values(ascending=False)[1:11]
    this_comps = list(nearest.index)
    this_similarities = list(nearest)

    comps.update(zip(this_comps, this_similarities))

In [None]:
# Rank the collected comps by similarity, highest first.
# NOTE(review): the frame has a single column, so drop_duplicates removes
# rows with equal similarity values, not repeated game ids - confirm intent
pd.DataFrame(comps.values(), index=comps.keys()).sort_values(0, ascending=False).drop_duplicates(keep='first')

### CLEAR VARIABLES

In [None]:
# Release the large intermediate frames. Several of these names are only
# created by optional (commented-out) steps, and `master_games_list` is not
# defined in the visible notebook, so remove each one only if it exists
# instead of raising NameError on the first missing name.
for _name in (
    'mechanics',
    'designers',
    'publishers',
    'artists',
    'awards',
    'games',
    'scaled_games',
    'scaled_mechanics',
    'scaled_families',
    'scaled_designers',
    'scaled_publishers',
    'scaled_artists',
    'scaled_awards',
    'master_games_list',
    'game_families',
):
    globals().pop(_name, None)
del _name

gc.collect()