In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
# Need to upload "boardgames.csv" to files every session.
file_path = ('boardgames.csv')
boardgame_df = pd.read_csv(file_path)
# Unicode Problem on row 24
boardgame_df.head(5)

Unnamed: 0,objectid,name,average,avgweight,boardgamecategory,boardgamemechanic
0,174430,Gloomhaven,8.85292,3.8078,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Campaign / Battle Card Driven', 'Cooperative..."
1,161936,Pandemic Legacy Season 1,8.62499,2.8301,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma..."
2,167791,Terraforming Mars,8.42299,3.2313,"['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma..."
3,182028,Through the Ages A New Story of Civilization,8.49419,4.385,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:..."
4,224517,Brass Birmingham,8.62031,3.9122,"['Economic', 'Industry / Manufacturing', 'Tran...","['Hand Management', 'Income', 'Loans', 'Market..."


In [3]:
# Our decoding
decode_lambda = lambda x: bytearray(x, 'utf-8').decode('unicode-escape')
# Applying the decoding to the column, ignore errors.
boardgame_df['name'] = boardgame_df['name'].apply(lambda x: decode_lambda(x))
# Unicode replaced with proper characters
boardgame_df.head(5)

  
  
  
  
  
  
  


Unnamed: 0,objectid,name,average,avgweight,boardgamecategory,boardgamemechanic
0,174430,Gloomhaven,8.85292,3.8078,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Campaign / Battle Card Driven', 'Cooperative..."
1,161936,Pandemic Legacy Season 1,8.62499,2.8301,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma..."
2,167791,Terraforming Mars,8.42299,3.2313,"['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma..."
3,182028,Through the Ages A New Story of Civilization,8.49419,4.385,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:..."
4,224517,Brass Birmingham,8.62031,3.9122,"['Economic', 'Industry / Manufacturing', 'Tran...","['Hand Management', 'Income', 'Loans', 'Market..."


In [4]:
boardgame_df = boardgame_df.rename(columns={'objectid': 'ID', 'name': 'Name', 'average': 'Avg Rating', 'avgweight': 'Complexity', 'boardgamecategory': 'Category', 'boardgamemechanic': 'Mechanic'})

# Other languages are supported by the decoding: Row 70
boardgame_df.head(5)

Unnamed: 0,ID,Name,Avg Rating,Complexity,Category,Mechanic
0,174430,Gloomhaven,8.85292,3.8078,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Campaign / Battle Card Driven', 'Cooperative..."
1,161936,Pandemic Legacy Season 1,8.62499,2.8301,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma..."
2,167791,Terraforming Mars,8.42299,3.2313,"['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma..."
3,182028,Through the Ages A New Story of Civilization,8.49419,4.385,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:..."
4,224517,Brass Birmingham,8.62031,3.9122,"['Economic', 'Industry / Manufacturing', 'Tran...","['Hand Management', 'Income', 'Loans', 'Market..."


In [5]:
# Create new dataframe with columns that do not need dummy variables

boardgame_trunc = boardgame_df.drop(["Category","Mechanic"],axis=1)
boardgame_trunc.head()

Unnamed: 0,ID,Name,Avg Rating,Complexity
0,174430,Gloomhaven,8.85292,3.8078
1,161936,Pandemic Legacy Season 1,8.62499,2.8301
2,167791,Terraforming Mars,8.42299,3.2313
3,182028,Through the Ages A New Story of Civilization,8.49419,4.385
4,224517,Brass Birmingham,8.62031,3.9122


In [6]:
# Create new dataframes for Category and Mechanic columns
# Fix formatting in both new dfs - currently set as string, not list (as provided by source)

categories_df = boardgame_df[['ID', 'Category']].copy()
categories_df['Category'] = categories_df['Category'].apply(lambda x: x[1:len(x)-1].split(', '))

mechanics_df = boardgame_df[['ID', 'Mechanic']].copy()
mechanics_df['Mechanic'] = mechanics_df['Mechanic'].apply(lambda x: x[1:len(x)-1].split(', '))

In [7]:
pd.set_option('display.max_rows', 500)

# Explode lists of categories to single level
cat_counts = categories_df.explode('Category')
cat_counts.columns.str.replace("'","")

# Get value counts of each category
cat_vc = cat_counts.groupby("Category").size().sort_values(ascending=False)
cat_vc


Category
'Card Game'                                                                   5686
'Wargame'                                                                     3500
'Fantasy'                                                                     2361
'Party Game'                                                                  1723
'Dice'                                                                        1712
'Fighting'                                                                    1565
'Abstract Strategy'                                                           1541
'Childrens Game'                                                              1521
'Science Fiction'                                                             1495
'Economic'                                                                    1423
'Bluffing'                                                                    1136
'World War II'                                                                

In [8]:
# Identify relevant categories by dropping low count categories
categories_to_keep = cat_vc[cat_vc > 100].index

In [9]:
categories_to_keep

Index([''Card Game'', ''Wargame'', ''Fantasy'', ''Party Game'', ''Dice'',
       ''Fighting'', ''Abstract Strategy'', ''Childrens Game'',
       ''Science Fiction'', ''Economic'', ''Bluffing'', ''World War II'',
       ''Animals'', ''Humor'', ''Deduction'', ''Adventure'',
       ''Action / Dexterity'', ''Movies / TV / Radio theme'', ''Miniatures'',
       ''Medieval'', ''Exploration'', ''Racing'', ''Ancient'', ''Negotiation'',
       ''Real-time'', ''Horror'', ''Nautical'', ''Trivia'', ''Sports'',
       ''Puzzle'', ''Memory'', ''Political'', ''City Building'', ''Word Game'',
       ''Print & Play'', ''Educational'', ''Novel-based'',
       ''Territory Building'', ''Transportation'', ''Collectible Components'',
       ''Modern Warfare'', ''Murder/Mystery'', ''Civilization'', ''Trains'',
       ''Mythology'', ''Pirates'', ''Napoleonic'', ''Comic Book / Strip'',
       ''Video Game Theme'', ''Industry / Manufacturing'',
       ''Aviation / Flight'', ''Renaissance'', ''Expansion for Base-

In [10]:
# Filter to remove irrelevant data

filtered_cat = cat_counts[cat_counts.Category.isin(categories_to_keep)]
filtered_cat

Unnamed: 0,ID,Category
0,174430,'Adventure'
0,174430,'Exploration'
0,174430,'Fantasy'
0,174430,'Fighting'
0,174430,'Miniatures'
...,...,...
19997,5159,'Abstract Strategy'
19997,5159,'Childrens Game'
19998,5160,'Abstract Strategy'
19998,5160,'Childrens Game'


In [11]:
# Check to make sure binning was successful
cat_vc_check = filtered_cat.groupby("Category").size().sort_values(ascending=False)

cat_vc_check

Category
'Card Game'                    5686
'Wargame'                      3500
'Fantasy'                      2361
'Party Game'                   1723
'Dice'                         1712
'Fighting'                     1565
'Abstract Strategy'            1541
'Childrens Game'               1521
'Science Fiction'              1495
'Economic'                     1423
'Bluffing'                     1136
'World War II'                 1129
'Animals'                      1107
'Humor'                        1064
'Deduction'                    1046
'Adventure'                    1017
'Action / Dexterity'            999
'Movies / TV / Radio theme'     980
'Miniatures'                    925
'Medieval'                      888
'Exploration'                   792
'Racing'                        709
'Ancient'                       697
'Negotiation'                   636
'Real-time'                     623
'Horror'                        593
'Nautical'                      575
'Trivia'           

In [12]:
filtered_cat

Unnamed: 0,ID,Category
0,174430,'Adventure'
0,174430,'Exploration'
0,174430,'Fantasy'
0,174430,'Fighting'
0,174430,'Miniatures'
...,...,...
19997,5159,'Abstract Strategy'
19997,5159,'Childrens Game'
19998,5160,'Abstract Strategy'
19998,5160,'Childrens Game'


In [13]:
cat_dummies = pd.get_dummies(filtered_cat, prefix = "", prefix_sep="")
cat_dummies.columns = cat_dummies.columns.str.replace("'","")

cat_final = cat_dummies.groupby("ID").sum()
cat_final

Unnamed: 0_level_0,Abstract Strategy,Action / Dexterity,Adventure,Age of Reason,American Civil War,American West,Ancient,Animals,Aviation / Flight,Bluffing,...,Trains,Transportation,Travel,Trivia,Video Game Theme,Wargame,Word Game,World War I,World War II,Zombies
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292961,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
292962,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
294612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
cat_final.columns

Index(['Abstract Strategy', 'Action / Dexterity', 'Adventure', 'Age of Reason',
       'American Civil War', 'American West', 'Ancient', 'Animals',
       'Aviation / Flight', 'Bluffing', 'Book', 'Card Game', 'Childrens Game',
       'City Building', 'Civilization', 'Collectible Components',
       'Comic Book / Strip', 'Deduction', 'Dice', 'Economic', 'Educational',
       'Electronic', 'Environmental', 'Expansion for Base-game', 'Exploration',
       'Fantasy', 'Farming', 'Fighting', 'Horror', 'Humor',
       'Industry / Manufacturing', 'Mafia', 'Math', 'Mature / Adult', 'Maze',
       'Medieval', 'Memory', 'Miniatures', 'Modern Warfare',
       'Movies / TV / Radio theme', 'Murder/Mystery', 'Mythology',
       'Napoleonic', 'Nautical', 'Negotiation', 'Novel-based', 'Number',
       'Party Game', 'Pirates', 'Political', 'Prehistoric', 'Print & Play',
       'Puzzle', 'Racing', 'Real-time', 'Renaissance', 'Science Fiction',
       'Space Exploration', 'Spies/Secret Agents', 'Sports',


In [15]:
pd.set_option('display.max_rows', 500)

mech_counts = mechanics_df.explode('Mechanic')
mech_vc = mech_counts.groupby("Mechanic").size().sort_values(ascending=False)
mech_vc


Mechanic
'Dice Rolling'                                                       4461
'Hand Management'                                                    3810
'Set Collection'                                                     2532
'Hexagon Grid'                                                       2132
'Variable Player Powers'                                             2071
'Card Drafting'                                                      1571
'Tile Placement'                                                     1559
'Roll / Spin and Move'                                               1526
'Modular Board'                                                      1499
'Area Majority / Influence'                                          1364
'Cooperative Game'                                                   1201
'Simulation'                                                         1088
'Simultaneous Action Selection'                                      1053
'Auction/Bidding'            

In [16]:
mech_types_to_keep = mech_vc[mech_vc > 175].index

# Filter to remove irrelevant data
filtered_mech = mech_counts[mech_counts.Mechanic.isin(mech_types_to_keep)]
filtered_mech

# # Check to make sure binning was successful
mech_vc_check = filtered_mech.groupby("Mechanic").size().sort_values(ascending=False)

mech_dummies = pd.get_dummies(filtered_mech, prefix = "", prefix_sep="")
mech_dummies.columns = mech_dummies.columns.str.replace("'","")

mech_final = mech_dummies.groupby("ID").sum()

In [17]:
mech_vc_check

Mechanic
'Dice Rolling'                     4461
'Hand Management'                  3810
'Set Collection'                   2532
'Hexagon Grid'                     2132
'Variable Player Powers'           2071
'Card Drafting'                    1571
'Tile Placement'                   1559
'Roll / Spin and Move'             1526
'Modular Board'                    1499
'Area Majority / Influence'        1364
'Cooperative Game'                 1201
'Simulation'                       1088
'Simultaneous Action Selection'    1053
'Auction/Bidding'                  1044
'Action Points'                    1011
'Area Movement'                    1001
'Memory'                            848
'Grid Movement'                     811
'Team-Based Game'                   811
'Take That'                         783
'Point to Point Movement'           768
'Push Your Luck'                    720
'Pattern Building'                  701
'Deck'                              631
'Worker Placement'             

In [18]:
# Merge all 3 dataframes with get_dummies columns, add suffixes as needed

dfs_to_merge = [boardgame_trunc, cat_final, mech_final]

boardgame_inter = pd.merge(boardgame_trunc, cat_final, on="ID", how='outer')
boardgame_final = pd.merge(boardgame_inter, mech_final, on="ID", how='outer', suffixes=("_Category","_Mechanic"))

In [19]:
# Drop any rows from final table that don't have any categories or mechanics remaining

boardgame_final.dropna()

Unnamed: 0,ID,Name,Avg Rating,Complexity,Abstract Strategy_category,Action / Dexterity_category,Adventure,Age of Reason,American Civil War,American West,...,Storytelling,Take That,Team-Based Game,Tile Placement,Trading,Trick-taking,Variable Phase Order,Variable Player Powers,Voting,Worker Placement
0,174430,Gloomhaven,8.85292,3.8078,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,161936,Pandemic Legacy Season 1,8.62499,2.8301,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,167791,Terraforming Mars,8.42299,3.2313,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,182028,Through the Ages A New Story of Civilization,8.49419,4.3850,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,224517,Brass Birmingham,8.62031,3.9122,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,5154,Franchise,6.00000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19996,5158,Punto y Raya,4.00000,0.0000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,5159,3 Up,6.00000,0.0000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,5160,Dino-Checkers,5.77500,0.0000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
my_list = boardgame_final.columns.values.tolist()
my_list


['ID',
 'Name',
 'Avg Rating',
 'Complexity',
 'Abstract Strategy_category',
 'Action / Dexterity_category',
 'Adventure',
 'Age of Reason',
 'American Civil War',
 'American West',
 'Ancient',
 'Animals',
 'Aviation / Flight',
 'Bluffing',
 'Book',
 'Card Game_category',
 'Childrens Game_category',
 'City Building',
 'Civilization',
 'Collectible Components',
 'Comic Book / Strip',
 'Deduction',
 'Dice',
 'Economic',
 'Educational',
 'Electronic',
 'Environmental',
 'Expansion for Base-game',
 'Exploration',
 'Fantasy',
 'Farming',
 'Fighting',
 'Horror',
 'Humor',
 'Industry / Manufacturing',
 'Mafia',
 'Math',
 'Mature / Adult',
 'Maze',
 'Medieval',
 'Memory_category',
 'Miniatures',
 'Modern Warfare',
 'Movies / TV / Radio theme',
 'Murder/Mystery',
 'Mythology',
 'Napoleonic',
 'Nautical',
 'Negotiation',
 'Novel-based',
 'Number',
 'Party Game_category',
 'Pirates',
 'Political',
 'Prehistoric',
 'Print & Play',
 'Puzzle',
 'Racing',
 'Real-time',
 'Renaissance',
 'Science Ficti

In [34]:
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

# Load the dataset
reader = Reader(name = "ID", rating_scale=(1,10), )
data = Dataset.load_from_df('boardgame_final')


param_grid = {'n_epochs': [25, 100], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# # define a cross-validation iterator
# kf = KFold(n_splits=3)

# algo = SVD()

# for trainset, testset in kf.split(data):

#     # train and test algorithm.
#     algo.fit(trainset)
#     predictions = algo.test(testset)

#     # Compute and print Root Mean Squared Error
#     accuracy.rmse(predictions, verbose=True)

# # Retrieve the trainset.
# trainset = data.build_full_trainset()

# # Build an algorithm, and train it.
# algo = KNNBasic()
# algo.fit(trainset)

# from surprise import SVD
# from surprise.model_selection import cross_validate
# svd = SVD(verbose=True, n_epochs=10)
# cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

TypeError: load_from_df() missing 1 required positional argument: 'reader'

In [30]:
# benchmark = []
# # Iterate over all algorithms
# for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
#     # Perform cross validation
#     results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
#     # Get results & append algorithm name
#     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#     benchmark.append(tmp)
    
# pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

NameError: name 'SVDpp' is not defined