In [18]:
import torch, torchtext, numpy as np
import pandas as pd, csv
from torch import nn, optim
from tqdm.auto import tqdm
import random
import pickle

In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the cells below
# read and write their CSV / pickle files under drive/MyDrive.
# NOTE(review): those paths are relative, so they presumably rely on the
# working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# Load the user ratings table (the cells below use its gameid / username / rating columns).
# NOTE(review): this reads the same 'user_ratings_aug.csv' path that the
# augmentation cell further down writes to, so re-running the augmentation
# cells starts from already-augmented data — confirm this is intended.
df = pd.read_csv('drive/MyDrive/user_ratings_aug.csv')

First we find the list of unique gameids in the user_ratings dataset and use it to select the corresponding entries of the games_info dataset

In [5]:
# Distinct game ids from the ratings table as a plain Python list.
# The position of an id in this list is later used as its index into the
# model's game embedding (see get_recs_user / get_recs_game).
# NOTE(review): that only lines up with GameDataset's own numbering if both
# are derived from the same CSV in the same row order — confirm.
unique_list = df['gameid'].unique().tolist()
len(unique_list)

953

In [6]:
# Load the game metadata and keep only the rows selected by the rated game ids.
# NOTE(review): .iloc selects by row *position*, so this assumes the row
# position in games_info.csv equals the gameid — TODO confirm; if it does not,
# filtering on the id column (e.g. with .isin) would be the safe form.
df_g = pd.read_csv('drive/MyDrive/games_info.csv')
df_g = df_g.iloc[unique_list]

The following three code blocks are used for data augmentation:

Users who have at most two ratings recorded in our dataset are assigned additional rating records: the rating of their first recorded game is copied to a few randomly chosen similar games.

In [17]:
def find_similar(gameid, df, num_sims):
  """Pick ``num_sims`` random games that share an attribute with ``gameid``.

  One genre, one developer and one publisher of the source game are chosen at
  random (each of those columns holds a '|'-separated list). Every other game
  that lists the chosen genre, developer or publisher is a candidate.

  Tokens are compared for exact equality against the '|'-separated entries of
  each candidate, rather than used as a regular expression anchored at the
  start of the string. That way a game matches regardless of the position of
  the shared entry, and names containing regex metacharacters are safe.

  Args:
    gameid: id of the source game, looked up in the 'index' column of ``df``.
    df: game metadata with the columns 'index', 'genres', 'developer' and
      'publisher'.
    num_sims: number of similar games to return.

  Returns:
    A list with ``num_sims`` values from the 'index' column, or an empty list
    when ``gameid`` is not present in ``df`` or fewer than ``num_sims``
    candidates exist.
  """
  row_ = df[df['index'] == gameid]
  if row_.empty:
    return []

  # Exclude the source game through the same 'index' column that was used to
  # find it, so the function does not depend on the DataFrame's index labels.
  others = df[df['index'] != gameid]

  mask = np.zeros(len(others), dtype=bool)
  for col in ('genres', 'developer', 'publisher'):
    value = row_[col].values[0]
    # Missing metadata (NaN / None / empty string) contributes no candidates.
    if not isinstance(value, str) or not value:
      continue
    token = random.choice(value.split('|'))
    shares_token = [isinstance(v, str) and token in v.split('|') for v in others[col]]
    mask |= np.array(shares_token, dtype=bool)

  candidates = others[mask]
  if len(candidates) < num_sims:
    return []
  picks = random.sample(range(len(candidates)), num_sims)
  return candidates['index'].iloc[picks].tolist()

In [9]:
# Users with at most two recorded ratings; these are the users that receive
# augmented ratings in the next cell. value_counts() is sorted by descending
# count and boolean filtering keeps that order.
# Filtering on the count directly also behaves correctly when no user has
# exactly two ratings: scanning the sorted counts for the first 2 would leave
# the cut-off at position 0 in that case and select every user.
user_counts = df['username'].value_counts()
low_user = user_counts[user_counts <= 2].index.to_list()

In [44]:
# Start from the existing rating rows ([gameid, username, rating]) and, for
# every low-activity user, copy the rating of their first recorded game onto
# two randomly chosen similar games.
ratings = df.values.tolist()
for user in low_user:
  first_record = df[df['username'] == user].values[0]
  gameid, rating = first_record[0], first_record[2]
  ratings.extend([sim_id, user, rating] for sim_id in find_similar(gameid, df_g, 2))
len(ratings)

203117

In [None]:
# Rebuild the ratings table from the augmented rows and persist it.
# NOTE: this rebinds `df` and overwrites the same CSV path that the load cell
# at the top of the notebook reads from.
df = pd.DataFrame(ratings, columns=['gameid', 'username', 'rating'])
df.to_csv('drive/MyDrive/user_ratings_aug.csv', index=False)

The following code blocks build and train the model. Parts of this section are borrowed from the "recommender-system" notebook used in the lecture.

In [10]:
class GameDataset(torch.utils.data.Dataset):
  """Ratings dataset yielding ``((user_idx, game_idx), rating)`` samples.

  Usernames and game ids from the CSV are renumbered to consecutive integers
  (in order of first appearance) so they can index embedding tables.

  Attributes:
    u2n: dict mapping each raw username to its user index.
    g2n: dict mapping each raw gameid to its game index.
    coords: int64 tensor of shape (n_samples, 2) holding [user_idx, game_idx].
    ratings: float32 tensor of shape (n_samples,) holding the ratings.
    n_users: number of distinct users.
    n_games: number of distinct games.
  """

  def __init__(self, fn):
    """Read the CSV at ``fn`` (columns 'username', 'gameid', 'rating')."""
    df = pd.read_csv(fn)
    # Keep the id -> index mappings on the instance: they are needed to
    # translate raw ids into embedding rows after training.
    self.u2n = {u: n for n, u in enumerate(df['username'].unique().tolist())}
    self.g2n = {g: n for n, g in enumerate(df['gameid'].unique().tolist())}
    user_idx = torch.as_tensor(df['username'].map(self.u2n).to_numpy(), dtype=torch.long)
    game_idx = torch.as_tensor(df['gameid'].map(self.g2n).to_numpy(), dtype=torch.long)
    self.coords = torch.stack([user_idx, game_idx], dim=1)
    self.ratings = torch.as_tensor(df['rating'].to_numpy(), dtype=torch.float32)
    self.n_users = len(self.u2n)
    self.n_games = len(self.g2n)

  def __len__(self):
    return len(self.coords)

  def __getitem__(self, i):
    return (self.coords[i], self.ratings[i])

In [11]:
ds_full = GameDataset('drive/MyDrive/user_ratings_aug.csv')
# Hold out 1% of the rating rows for testing.
n_train = int(0.99 * len(ds_full))
n_test = len(ds_full) - n_train
# Seed the split so the same rows are held out on every run and the reported
# test loss can be reproduced.
# NOTE(review): the CSV is the augmented one, so the held-out rows presumably
# include synthetic ratings as well — keep that in mind when reading the loss.
split_rng = torch.Generator().manual_seed(0)
ds_train, ds_test = torch.utils.data.random_split(ds_full, [n_train, n_test], generator=split_rng)

In [14]:
class GameRecs(nn.Module):
  """Matrix-factorisation recommender without bias terms.

  Each user and each game gets an ``emb_dim``-dimensional embedding; the
  predicted rating is the dot product of the two.
  """

  def __init__(self, n_users, n_games, emb_dim):
    super().__init__()
    self.user_emb = nn.Embedding(n_users, emb_dim)
    self.game_emb = nn.Embedding(n_games, emb_dim)
    # Xavier-uniform initialisation for both tables (users first, then games).
    for table in (self.user_emb, self.game_emb):
      nn.init.xavier_uniform_(table.weight)

  def forward(self, samples):
    """Score a batch; column 0 of ``samples`` is the user index, column 1 the game index."""
    user_idx, game_idx = samples[:, 0], samples[:, 1]
    return torch.sum(self.user_emb(user_idx) * self.game_emb(game_idx), dim=1)

In [15]:
class GameRecsBias(nn.Module):
  """Matrix-factorisation recommender with per-user and per-game bias terms.

  The predicted rating is the dot product of the user and game embeddings
  plus one scalar bias for the user and one for the game.
  """

  def __init__(self, n_users, n_games, emb_dim):
    super().__init__()
    self.user_emb = nn.Embedding(n_users, emb_dim)
    self.user_bias = nn.Embedding(n_users, 1)
    self.game_emb = nn.Embedding(n_games, emb_dim)
    self.game_bias = nn.Embedding(n_games, 1)
    # Embeddings start Xavier-uniform, biases start at zero.
    for table in (self.user_emb, self.game_emb):
      nn.init.xavier_uniform_(table.weight)
    for table in (self.user_bias, self.game_bias):
      nn.init.zeros_(table.weight)

  def forward(self, samples):
    """Score a batch; column 0 of ``samples`` is the user index, column 1 the game index."""
    user_idx, game_idx = samples[:, 0], samples[:, 1]
    interaction = torch.sum(self.user_emb(user_idx) * self.game_emb(game_idx), dim=1)
    return interaction + self.user_bias(user_idx).squeeze() + self.game_bias(game_idx).squeeze()

In [16]:
# Device that the training / evaluation helpers below move the model and every batch to.
device = torch.device('cpu')

def run_test(model, ldr, crit):
  """Evaluate ``model`` on every batch of ``ldr`` without tracking gradients.

  Puts the model into eval mode and returns the sample-weighted mean of the
  per-batch loss computed by ``crit``.
  """
  model.eval()
  loss_sum = 0
  n_seen = 0
  progress = tqdm(ldr, leave=False, desc='test iter')
  with torch.no_grad():
    for batch_coords, batch_labels in progress:
      batch_coords = batch_coords.to(device)
      batch_labels = batch_labels.to(device)
      batch_loss = crit(model(batch_coords), batch_labels)
      batch_size = batch_labels.size(0)
      # crit returns a per-batch mean, so weight it by the batch size.
      loss_sum += batch_loss.item() * batch_size
      n_seen += batch_size
      progress.set_postfix({'loss': loss_sum/n_seen}, refresh=True)
  return loss_sum / n_seen

def run_train(model, ldr, crit, opt, sched):
  """Train ``model`` for one pass over ``ldr``.

  For every batch: reset gradients, forward, backward, optimizer step and
  scheduler step (the scheduler is advanced once per batch). Returns the
  sample-weighted mean training loss of the pass.
  """
  model.train()
  loss_sum = 0
  n_seen = 0
  progress = tqdm(ldr, leave=False, desc='train iter')
  for batch_coords, batch_labels in progress:
    opt.zero_grad()
    batch_coords = batch_coords.to(device)
    batch_labels = batch_labels.to(device)
    batch_loss = crit(model(batch_coords), batch_labels)
    batch_loss.backward()
    opt.step()
    sched.step()
    batch_size = batch_labels.size(0)
    # crit returns a per-batch mean, so weight it by the batch size.
    loss_sum += batch_loss.item() * batch_size
    n_seen += batch_size
    progress.set_postfix({'loss': loss_sum/n_seen}, refresh=True)
  return loss_sum / n_seen

def run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs=10):
  """Alternate one training pass and one evaluation pass for ``n_epochs`` epochs.

  Prints the train / test loss after every epoch and shows the best test loss
  so far (and its epoch) on the epoch progress bar. Nothing is returned.
  """
  lowest = np.inf
  epoch_bar = tqdm(range(n_epochs), desc='epochs', unit='ep')
  for ep in epoch_bar:
    loss_tr = run_train(model, ldr_train, crit, opt, sched)
    loss_te = run_test(model, ldr_test, crit)
    tqdm.write(f'epoch {ep}   train loss {loss_tr:.6f}    test loss {loss_te:.6f}')
    if not (loss_te < lowest):
      continue
    # New best test loss: remember it and surface it on the progress bar.
    lowest = loss_te
    epoch_bar.set_postfix({'bE': ep, 'bL': lowest}, refresh=True)

This is the record of our best run. Initially, we trained the model without data augmentation and got a test loss of around 30. We then tried data augmentation and tuned the parameters, and finally achieved a test loss of around 2. This suggests that data augmentation is very effective in our case, and further augmentation of the data will be explored carefully in the future to improve accuracy. The model with bias was also tested, but its result was worse than that of the model without bias, so we use the model without bias in the following steps.

In [17]:
# Hyper-parameters of the recorded run.
emb_dim, batch_size = 30, 32
n_epochs = 10

model = GameRecs(ds_full.n_users, ds_full.n_games, emb_dim).to(device)

# Only the training loader is shuffled.
ldr_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True)
ldr_test = torch.utils.data.DataLoader(ds_test, batch_size=batch_size)

crit = nn.MSELoss().to(device)
opt = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
# One-cycle schedule over all training batches; run_train steps it once per batch.
sched = optim.lr_scheduler.OneCycleLR(
    opt, max_lr=0.1, steps_per_epoch=len(ldr_train), epochs=n_epochs)

run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs)

HBox(children=(FloatProgress(value=0.0, description='epochs', max=10.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 0   train loss 56.179903    test loss 53.493100


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 1   train loss 24.267217    test loss 5.068507


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 2   train loss 2.950438    test loss 3.153293


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 3   train loss 1.503987    test loss 2.554057


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 4   train loss 0.798146    test loss 2.327393


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 5   train loss 0.405337    test loss 2.173054


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 6   train loss 0.199122    test loss 2.118407


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 7   train loss 0.094347    test loss 2.089153


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 8   train loss 0.048283    test loss 2.081060


HBox(children=(FloatProgress(value=0.0, description='train iter', max=6284.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=64.0, style=ProgressStyle(description_wid…

epoch 9   train loss 0.032263    test loss 2.078532



Save the model from best run to a binary file:

In [19]:
# Write the trained model inside a context manager so the file handle is
# flushed and closed even if pickling raises.
# NOTE(review): this pickles the whole module object, so loading it needs the
# GameRecs class to be importable and must only be done with trusted files;
# saving model.state_dict() with torch.save would be the more portable format.
with open("drive/MyDrive/model_param.pickle", "wb") as model_file:
  pickle.dump(model, model_file)

The following code blocks test the model in the way we are actually going to integrate it into our project.

Our app will get up to three games as inputs from the user. So we first test the model with a list of three gameids:

In [26]:
class GameDatasetTest(torch.utils.data.Dataset):
  """Dataset over an in-memory ratings frame whose ids are already embedding indices.

  ``df`` must provide the columns 'username', 'gameid' and 'rating'; no
  renumbering is applied.
  """

  def __init__(self, df):
    pairs = df[['username', 'gameid']].values
    targets = df['rating'].values
    self.coords = torch.LongTensor(pairs)
    self.ratings = torch.FloatTensor(targets)
    self.n_users = df['username'].nunique()
    self.n_games = df['gameid'].nunique()

  def __len__(self):
    return self.coords.shape[0]

  def __getitem__(self, i):
    return self.coords[i], self.ratings[i]

In [52]:
def get_recs_user(gameids, model, games_list, num_recs):
  """Recommend games for a new user who likes the games in ``gameids``.

  A fresh single-user embedding is fitted against the trained (frozen) game
  embeddings so that the given games are predicted a rating of 10. All other
  games are then ranked by their predicted rating for that user.

  The fit runs on a private copy of ``model``, so the caller's trained model
  keeps its user embeddings and its ``requires_grad`` flags. It also runs
  without progress bars.

  Args:
    gameids: raw ids of the games the user likes; each must be in ``games_list``.
    model: trained recommender exposing ``user_emb`` / ``game_emb`` embeddings
      and a forward pass over rows of ``[user_idx, game_idx]``.
    games_list: raw game ids; the position of an id is its game-embedding index.
    num_recs: maximum number of recommendations to return.

  Returns:
    Up to ``num_recs`` raw game ids, best first, excluding the input games.

  Raises:
    ValueError: if ``gameids`` is empty or contains an id not in ``games_list``.
  """
  import copy  # needed only here, to leave the caller's model untouched

  index = [games_list.index(g) for g in gameids]
  if not index:
    raise ValueError('gameids must contain at least one game')

  n_epochs = 50        # passes over the liked games
  liked_rating = 10.0  # target rating assumed for every liked game

  work = copy.deepcopy(model)
  dev = work.game_emb.weight.device
  # The new user's embedding takes its width from the trained game table
  # instead of a hard-coded constant.
  work.user_emb = nn.Embedding(1, work.game_emb.embedding_dim).to(dev)
  nn.init.xavier_uniform_(work.user_emb.weight)
  work.game_emb.weight.requires_grad = False

  coords = torch.tensor([[0, i] for i in index], dtype=torch.long, device=dev)
  labels = torch.full((len(index),), liked_rating, device=dev)

  opt = optim.SGD([p for p in work.parameters() if p.requires_grad], lr=1e-6, momentum=0.9)
  crit = nn.MSELoss()
  sched = optim.lr_scheduler.OneCycleLR(opt, max_lr=1e-4, steps_per_epoch=len(index), epochs=n_epochs)

  # One sample per step (batch size 1), in the order the games were given.
  work.train()
  for _ in range(n_epochs):
    for k in range(len(index)):
      opt.zero_grad()
      loss = crit(work(coords[k:k + 1]), labels[k:k + 1])
      loss.backward()
      opt.step()
      sched.step()

  liked = set(index)
  candidates = [i for i in range(len(games_list)) if i not in liked]
  if not candidates:
    return []

  # Score every remaining game in a single forward pass, without autograd.
  work.eval()
  with torch.no_grad():
    cand_coords = torch.tensor([[0, i] for i in candidates], dtype=torch.long, device=dev)
    scores = work(cand_coords).reshape(-1).tolist()

  ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
  # Slicing (rather than indexing) tolerates num_recs > number of candidates.
  return [games_list[i] for i, _ in ranked[:num_recs]]

In [47]:
# Ask for 5 recommendations for a new user who likes games 136, 137 and 141
# (the three Assassin's Creed titles named in the discussion below).
get_recs_user([136,137,141], model, unique_list, 5)

HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



HBox(children=(FloatProgress(value=0.0, description='train iter', max=3.0, style=ProgressStyle(description_wid…



[1338, 1128, 134, 146, 451]

The inputs are: Assassin's Creed III Remastered, Assassin's Creed IV: Black Flag, and Assassin's Creed Rogue Remastered

The outputs of our model are: The Inpatient, Saints Row: Gat Out of Hell, Assassin's Creed Freedom Cry, Assassin's Creed: Revelations, Dynasty Warriors 9

The results include two other games from the Assassin's Creed series, and the remaining games share some attributes with the input games, such as the open-world genre. This suggests that our model produces reasonable results for some inputs.

The model is also tested with item-to-item recommending by comparing the cosine similarity of all the entries in the games embedding, given a game as input:

In [25]:
def get_recs_game(gameid, model, games_list, num_recs):
  """Recommend the games whose embeddings are most similar to that of ``gameid``.

  Similarity is the cosine similarity between rows of ``model.game_emb``. It
  is computed for all games in one vectorised call with autograd disabled.

  Args:
    gameid: raw id of the query game; must be in ``games_list``.
    model: trained recommender exposing a ``game_emb`` embedding table.
    games_list: raw game ids; the position of an id is its game-embedding index.
    num_recs: maximum number of recommendations to return.

  Returns:
    Up to ``num_recs`` raw game ids, most similar first, excluding ``gameid``.

  Raises:
    ValueError: if ``gameid`` is not in ``games_list``.
  """
  index = games_list.index(gameid)
  with torch.no_grad():
    weights = model.game_emb.weight
    sims = nn.functional.cosine_similarity(weights, weights[index].unsqueeze(0), dim=1).tolist()

  candidates = [i for i in range(len(games_list)) if i != index]
  candidates.sort(key=lambda i: sims[i], reverse=True)
  # Slicing (rather than indexing) tolerates num_recs > number of candidates.
  return [games_list[i] for i in candidates[:num_recs]]

In [60]:
# Ask for the 5 games whose embeddings are closest to game 1432
# (Tom Clancy's The Division, per the discussion below).
get_recs_game(1432, model, unique_list, 5)

[601, 141, 1433, 747, 598]

The input is Tom Clancy's The Division, and the outputs are Grow Home, Assassin's Creed Rogue Remastered, Tom Clancy's The Division 2, Knot, Grim Legends: The Forsaken Bride

The first three recommendations are reasonable, since they are all from Ubisoft and even another game in the same series is included. However the last two are not so similar to the input. We'll look into solving this problem by further augmenting the dataset or exploring fancier methods for deciding what to actually recommend