# User Based Recommender Notebook
After downloading all of the user rating CSVs from Google Cloud, I will merge them into one data frame and make sure everything lines up correctly. Once everything is ready, I will build the initial version of the recommender and then improve on it from there.

In [14]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

## Generating the User Ratings data frame

In [2]:
# Load the ranked game table; the CSV's 'rank' column becomes the index.
top_games_csv = '../data/bg_top1000.csv'
games = pd.read_csv(top_games_csv, index_col='rank')
# Game IDs in file order; each ID names one per-game ratings CSV.
game_list = list(games['id'])
game_list

[174430,
 161936,
 224517,
 167791,
 291457,
 233078,
 220308,
 187645,
 162886,
 182028,
 115746,
 193738,
 12333,
 169786,
 316554,
 84876,
 167355,
 173346,
 124361,
 28720,
 120677,
 177736,
 266192,
 205637,
 183394,
 237182,
 164928,
 199792,
 266507,
 96848,
 312484,
 246900,
 175914,
 3076,
 102794,
 170216,
 285774,
 192135,
 31260,
 251247,
 276025,
 221107,
 247763,
 205059,
 256960,
 284083,
 185343,
 126163,
 2651,
 55690,
 216132,
 164153,
 184267,
 209010,
 35677,
 180263,
 244521,
 125153,
 521,
 161533,
 230802,
 72125,
 25613,
 266810,
 191189,
 124742,
 342942,
 28143,
 314040,
 201808,
 159675,
 121921,
 229853,
 171623,
 157354,
 68448,
 200680,
 110327,
 62219,
 182874,
 236457,
 264220,
 122515,
 93,
 18602,
 37111,
 324856,
 12493,
 73439,
 40834,
 269385,
 146021,
 170042,
 172386,
 203993,
 205896,
 281259,
 163412,
 144733,
 42,
 225694,
 102680,
 295947,
 155821,
 284378,
 178900,
 132531,
 36218,
 233371,
 172287,
 263918,
 30549,
 218417,
 196340,
 198928,

In [41]:
# Read each game's ratings file, drop the saved index column ('Unnamed: 0'),
# and stack everything into one long table with a fresh 0..n-1 index.
dfs = [
    pd.read_csv(f'../data/board_game_individual_reviews/{game_id}_ratings.csv')
      .drop(columns='Unnamed: 0')
    for game_id in game_list[:1000]
]
all_users = pd.concat(dfs, ignore_index=True)

In [42]:
all_users

Unnamed: 0,user_id,rating,game_id
0,clarkkent22,10.0,174430
1,Garroc,10.0,174430
2,Neva Kee,10.0,174430
3,Chris Coyote,10.0,174430
4,Beaushek,10.0,174430
...,...,...,...
11170192,realjw,1.0,11971
11170193,Watchtowerman1994,1.0,11971
11170194,cryptosha,1.0,11971
11170195,Noldor1998,1.0,11971


In [43]:
all_users.isna().sum()

user_id    30
rating      0
game_id     0
dtype: int64

In [44]:
# Show the rows whose user_id is missing, ordered by game_id.
missing_user = all_users['user_id'].isna()
all_users.loc[missing_user].sort_values('game_id')

Unnamed: 0,user_id,rating,game_id
5167951,,4.0,3
8073837,,5.0,11
7671573,,6.0,13
10004618,,4.0,41
9875721,,2.0,45
6450364,,4.0,50
5738888,,4.0,54
9147549,,6.0,432
3940159,,4.0,463
7500965,,6.0,478


In [45]:
all_users.shape

(11170197, 3)

There are only 30 null values, all in user_id. No board game has more than one null, so I checked whether 'null', 'na', 'nan', or a blank appears in the actual CSVs. There is a literal 'null' listed, meaning it is possible that the person's username is simply 'null', which pandas reads as NaN. Since we can't know for sure, and 30 out of 11,170,197 is a drop in the bucket, I am just going to drop the rows with a null user_id.

In [46]:
# Discard every row that contains a missing value.
all_users = all_users.dropna()

In [47]:
all_users.shape

(11170167, 3)

Now that I have the data frame, I am going to convert the usernames to numeric IDs using a dictionary. This makes the data anonymous and replaces long usernames with short integers to save memory.

In [48]:
# Distinct usernames, in order of first appearance in all_users.
user_list = all_users['user_id'].unique().tolist()
user_list

['clarkkent22',
 'Garroc',
 'Neva Kee',
 'Chris Coyote',
 'Beaushek',
 'ravenpolar',
 'brenmcgovern',
 'grovermerc',
 'Ronnyknox',
 'webs05',
 'Iguloy',
 'Slyght',
 'Leonce',
 'rlphay',
 'testicleez',
 'crwills',
 'WeeGee',
 'Azil3',
 'KimchiTurtle',
 'robbin 1',
 'mcscowl',
 'kittenhoarder',
 'Kitarja',
 'Shampoo4you',
 'Blood Demon',
 'stevelabny',
 'vardamir',
 'Zottelmonster',
 'EdmundBlackadder',
 'gimmster',
 'Clinton',
 'nancynm',
 'mergryphon',
 'darcypennell',
 'snorman',
 'Quotho',
 'Kha Od Dro',
 'davidcoleman',
 'valpop',
 'camidon',
 'repairmanjack',
 'Violet_Iris',
 'tilde72',
 'bwarner34',
 'olafpkyou',
 'thesumo5',
 'Tarkan',
 'Electric421',
 'Husky Seahawk',
 'j41c',
 'adjacentbeastman',
 'geobaldi5',
 'Firetracker',
 'Sassycat',
 'MountainRoot',
 'dpbernath',
 'Kyellan',
 'Suit Sage',
 'bwt2q',
 'oskarkapsel',
 'Morthai',
 'Akrimas',
 'Vex85',
 'blakdeth',
 'opticode',
 'ReinhartTR',
 'bandersnatch1110',
 'Anemelos',
 'Ghanaka',
 'sonofachung',
 'Murdockus',
 'sighlan

In [49]:
# Map each username to a 1-based integer ID, numbered in the order the names
# appear in user_list (the unique usernames collected from all_users).
user_dict = {username: user_num for user_num, username in enumerate(user_list, start=1)}
user_dict
# If I need to look up a username, I can flip the keys and values and look it up by number

{'clarkkent22': 1,
 'Garroc': 2,
 'Neva Kee': 3,
 'Chris Coyote': 4,
 'Beaushek': 5,
 'ravenpolar': 6,
 'brenmcgovern': 7,
 'grovermerc': 8,
 'Ronnyknox': 9,
 'webs05': 10,
 'Iguloy': 11,
 'Slyght': 12,
 'Leonce': 13,
 'rlphay': 14,
 'testicleez': 15,
 'crwills': 16,
 'WeeGee': 17,
 'Azil3': 18,
 'KimchiTurtle': 19,
 'robbin 1': 20,
 'mcscowl': 21,
 'kittenhoarder': 22,
 'Kitarja': 23,
 'Shampoo4you': 24,
 'Blood Demon': 25,
 'stevelabny': 26,
 'vardamir': 27,
 'Zottelmonster': 28,
 'EdmundBlackadder': 29,
 'gimmster': 30,
 'Clinton': 31,
 'nancynm': 32,
 'mergryphon': 33,
 'darcypennell': 34,
 'snorman': 35,
 'Quotho': 36,
 'Kha Od Dro': 37,
 'davidcoleman': 38,
 'valpop': 39,
 'camidon': 40,
 'repairmanjack': 41,
 'Violet_Iris': 42,
 'tilde72': 43,
 'bwarner34': 44,
 'olafpkyou': 45,
 'thesumo5': 46,
 'Tarkan': 47,
 'Electric421': 48,
 'Husky Seahawk': 49,
 'j41c': 50,
 'adjacentbeastman': 51,
 'geobaldi5': 52,
 'Firetracker': 53,
 'Sassycat': 54,
 'MountainRoot': 55,
 'dpbernath': 5

In [50]:
# Replace each username with its integer ID from user_dict.
all_users['user_id'] = all_users['user_id'].map(user_dict)

In [51]:
all_users

Unnamed: 0,user_id,rating,game_id
0,1,10.0,174430
1,2,10.0,174430
2,3,10.0,174430
3,4,10.0,174430
4,5,10.0,174430
...,...,...,...
11170192,202730,1.0,11971
11170193,21095,1.0,11971
11170194,45184,1.0,11971
11170195,77129,1.0,11971


In [53]:
games

Unnamed: 0_level_0,title,id
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Gloomhaven,174430
2,Pandemic Legacy: Season 1,161936
3,Brass: Birmingham,224517
4,Terraforming Mars,167791
5,Gloomhaven: Jaws of the Lion,291457
...,...,...
996,1812: The Invasion of Canada,94246
997,Chimera Station,163642
998,Sons of Anarchy: Men of Mayhem,156091
999,Shadow Hunters,24068


In [56]:
# Attach each rating to its game's title by matching games.id to
# all_users.game_id, then drop game_id since it duplicates id after the join.
user_reviews = (
    games
    .merge(all_users, left_on='id', right_on='game_id')
    .drop(columns='game_id')
)

In [60]:
user_reviews

Unnamed: 0,title,id,user_id,rating
0,Gloomhaven,174430,1,10.0
1,Gloomhaven,174430,2,10.0
2,Gloomhaven,174430,3,10.0
3,Gloomhaven,174430,4,10.0
4,Gloomhaven,174430,5,10.0
...,...,...,...,...
11170162,Cockroach Poker,11971,202730,1.0
11170163,Cockroach Poker,11971,21095,1.0
11170164,Cockroach Poker,11971,45184,1.0
11170165,Cockroach Poker,11971,77129,1.0


In [57]:
# Persist the merged table; index=False keeps the row index out of the file.
user_reviews.to_csv('../data/all_user_reviews.csv', index=False)

Confirming the CSV reloads with all of the data

In [58]:
# Rebind user_reviews to the copy parsed back from disk.
user_reviews = pd.read_csv('../data/all_user_reviews.csv')

In [61]:
user_reviews

Unnamed: 0,title,id,user_id,rating
0,Gloomhaven,174430,1,10.0
1,Gloomhaven,174430,2,10.0
2,Gloomhaven,174430,3,10.0
3,Gloomhaven,174430,4,10.0
4,Gloomhaven,174430,5,10.0
...,...,...,...,...
11170162,Cockroach Poker,11971,202730,1.0
11170163,Cockroach Poker,11971,21095,1.0
11170164,Cockroach Poker,11971,45184,1.0
11170165,Cockroach Poker,11971,77129,1.0


## Creating the Sparse Matrix

In [62]:
# One row per game title, one column per user_id. Each cell holds that user's
# rating for the game (pivot_table's default aggregation averages repeated
# title/user pairs); pairs with no rating are NaN.
pivot = pd.pivot_table(user_reviews, index='title', columns='user_id', values='rating')
pivot.head(3)

user_id,1,2,3,4,5,6,7,8,9,10,...,369309,369310,369311,369312,369313,369314,369315,369316,369317,369318
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13 Days: The Cuban Missile Crisis,,,,,,,,,,8.0,...,,,,,,,,,,
1775: Rebellion,,,,,,,,,,,...,,,,,,,,,,
1812: The Invasion of Canada,,,,,,,,,,,...,,,,,,,,,,


In [63]:
# Unrated cells become 0, then the table is stored in compressed sparse row form.
pivot_sparse = sparse.csr_matrix(pivot.fillna(0))

In [79]:
# Pairwise cosine similarity between the rows of pivot_sparse, i.e. between
# game titles (the pivot's index), not between users.
similarities = cosine_similarity(pivot_sparse)
similarities

array([[1.        , 0.18433832, 0.12715531, ..., 0.04591301, 0.10277826,
        0.07798308],
       [0.18433832, 1.        , 0.32914166, ..., 0.03577458, 0.11504694,
        0.07572326],
       [0.12715531, 0.32914166, 1.        , ..., 0.02658886, 0.12387175,
        0.0912043 ],
       ...,
       [0.04591301, 0.03577458, 0.02658886, ..., 1.        , 0.06988396,
        0.03474348],
       [0.10277826, 0.11504694, 0.12387175, ..., 0.06988396, 1.        ,
        0.18231853],
       [0.07798308, 0.07572326, 0.0912043 , ..., 0.03474348, 0.18231853,
        1.        ]])

In [80]:
# Label both axes of the similarity matrix with the game titles from the pivot,
# so a title can be looked up by name on either axis.
titles = pivot.index
recommender = pd.DataFrame(data=similarities, index=titles, columns=titles)
recommender.head()

title,13 Days: The Cuban Missile Crisis,1775: Rebellion,1812: The Invasion of Canada,1830: Railways & Robber Barons,1846: The Race for the Midwest,1960: The Making of the President,1989: Dawn of Freedom,5-Minute Dungeon,51st State: Master Set,6 nimmt!,...,ZhanGuo,Zombicide,Zombicide Season 2: Prison Outbreak,Zombicide Season 3: Rue Morgue,Zombicide: Black Plague,Zombicide: Green Horde,Zombicide: Invader,Zombie Kidz Evolution,Zooloretto,ZÈRTZ
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13 Days: The Cuban Missile Crisis,1.0,0.184338,0.127155,0.100436,0.122951,0.193931,0.164322,0.054591,0.111366,0.136089,...,0.11754,0.078694,0.042705,0.033217,0.070582,0.032711,0.025349,0.045913,0.102778,0.077983
1775: Rebellion,0.184338,1.0,0.329142,0.103055,0.112302,0.183761,0.148974,0.038763,0.090142,0.104827,...,0.103244,0.091013,0.047707,0.034226,0.06643,0.029193,0.024627,0.035775,0.115047,0.075723
1812: The Invasion of Canada,0.127155,0.329142,1.0,0.102559,0.088857,0.166918,0.160656,0.030154,0.052901,0.098127,...,0.080917,0.081118,0.044655,0.02954,0.050581,0.022436,0.022589,0.026589,0.123872,0.091204
1830: Railways & Robber Barons,0.100436,0.103055,0.102559,1.0,0.383197,0.178309,0.13128,0.027402,0.056155,0.143993,...,0.132375,0.063204,0.03165,0.022634,0.041803,0.018255,0.015949,0.022901,0.132606,0.117823
1846: The Race for the Midwest,0.122951,0.112302,0.088857,0.383197,1.0,0.133292,0.100791,0.028395,0.059622,0.110412,...,0.127282,0.043289,0.019702,0.017936,0.037497,0.016936,0.013754,0.023662,0.084733,0.083904
