In [1]:
import torch, torchtext, numpy as np
import pandas as pd, csv
from torch import nn, optim
from tqdm.auto import tqdm
import random
import pickle

In [None]:
torch.manual_seed(0)
np.random.seed(0)
torch.set_deterministic(True)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [122]:
df = pd.read_csv('drive/MyDrive/user_ratings_aug.csv')
len(df)

1501544

First we find the list of unique gameids in the user_ratings dataset and mask the entries in the games_info dataset

In [123]:
unique_list = df['gameid'].unique().tolist()
len(unique_list)

986

In [None]:
input_file = open('list.txt', 'r')
input_list = input_file.readlines()
input_list = [int(s[:-1]) for s in input_list]
len(input_list)

361

In [23]:
df_g = pd.read_csv('drive/MyDrive/games_info.csv')
df_g = df_g.iloc[unique_list]
for i in range(len(df_g)):
  if df_g.iloc[i]['size'][-2:] == 'GB':
    df_g.at[df_g.iloc[i]['index'], 'size'] = float(df_g.iloc[i]['size'][:-2])
  else:
    df_g.at[df_g.iloc[i]['index'], 'size'] = float(df_g.iloc[i]['size'][:-2]) / 1000

The following three code bolcks are used for data augmentation:

The users who have less than two ratings recorded in our dataset is assigned new ratings record by giving the same ratings to some random similar games.

In [91]:
def find_similar(gameid, df, num_sims):
  row_ = df[df['index'] == gameid]
  genre = row_['genres'].values[0]
  genre = genre.split('|')
  random.shuffle(genre)
  developer = row_['developer'].values[0]
  developer = developer.split('|')
  random.shuffle(developer)
  publisher = row_['publisher'].values[0]
  publisher = publisher.split('|')
  random.shuffle(publisher)
  size = row_['size'].values[0]
  df = df.drop(gameid, axis=0)
  df = df[df['genres'].str.contains(genre[0]) | df['developer'].str.contains(developer[0]) | df['publisher'].str.contains(publisher[0])]
  df = df[df['size'] < 5 * size]
  df = df[df['size'] > size / 5]
  ids = []
  if len(df) >= num_sims:
    rand = random.sample(range(0, len(df)), num_sims)
    ids = [df.iloc[i]['index'] for i in rand]
  else:
    for i in range(len(df)):
      ids.append(df.iloc[i]['index'])
  return ids

In [None]:
counts = df['username'].value_counts().to_list()
index = 0
for i in range(len(counts)):
  if counts[i] == 10:
    index = i
    break
counts = df['username'].value_counts().index.to_list()
low_user = counts[index:]

In [None]:
ratings = df.values.tolist()
for s in low_user:
  row_ = df[df['username'] == s]
  randint = random.randrange(len(row_))
  gameid = row_.values[randint][0]
  rating = row_.values[randint][2]
  similar = find_similar(gameid, df_g, 20)
  for i in similar:
    ratings.append([i, s, rating])
len(ratings)

1310889

In [46]:
ratings = []
all_users = df['username'].unique().tolist()
for s in all_users:
  row_ = df[df['username'] == s]
  randint = random.randrange(len(row_))
  gameid = row_.values[randint][0]
  rating = row_.values[randint][2]
  ratings.append([gameid, s, rating])
  similar = find_similar(gameid, df_g, 20)
  for i in similar:
    ratings.append([i, s, rating])
len(ratings)

1287852

In [89]:
counts = df['gameid'].value_counts().to_list()
index = 0
for i in range(len(counts)):
  if counts[i] <= 200:
    index = i
    break
counts = df['gameid'].value_counts().index.to_list()
low_games = counts[index:]
len(low_games)

59

In [87]:
for id in low_games:
  sims = []
  for i in range(10):
    sims.extend(find_similar(id, df_g, 20))
  sims = list(set(sims))
  for sim_id in sims:
    sim_ratings = df[df['gameid'] == sim_id]
    for i in range(min(len(sim_ratings),10)):
      ratings.append([id, sim_ratings.values[i][1], sim_ratings.values[i][2]])
len(ratings)

1506446

In [95]:
df.to_csv('drive/MyDrive/user_ratings_aug.csv', index=False)

The following code blocks are for building and training the model. Part of this section are borrowed from the "recommender-system" notebook used in the lecture

In [124]:
class GameDataset(torch.utils.data.Dataset):
  def __init__(self, fn):
    df = pd.read_csv(fn)
    u2n = { u: n for n, u in enumerate(df['username'].unique()) }
    g2n = { g: n for n, g in enumerate(df['gameid'].unique()) }
    df['username'] = df['username'].apply(lambda u: u2n[u])
    df['gameid'] = df['gameid'].apply(lambda g: g2n[g])
    self.coords = torch.LongTensor(df[['username','gameid']].values)
    self.ratings = torch.FloatTensor(df['rating'].values)
    self.n_users = df['username'].nunique()
    self.n_games = df['gameid'].nunique()

  def __len__(self):
      return len(self.coords)

  def __getitem__(self, i):
      return (self.coords[i], self.ratings[i])

In [125]:
ds_full = GameDataset('drive/MyDrive/user_ratings_aug.csv')
n_train = int(0.95 * len(ds_full))
n_test = len(ds_full) - n_train
ds_train, ds_test = torch.utils.data.random_split(ds_full, [n_train, n_test])

In [None]:
class GameRecs(nn.Module):
  def __init__(self, n_users, n_games, emb_dim):
    super(GameRecs, self).__init__()
    self.user_emb = nn.Embedding(n_users, emb_dim)
    self.game_emb = nn.Embedding(n_games, emb_dim)
    nn.init.xavier_uniform_(self.user_emb.weight)
    nn.init.xavier_uniform_(self.game_emb.weight)
  
  def forward(self, samples):
    users = self.user_emb(samples[:,0])
    games = self.game_emb(samples[:,1])
    return (users * games).sum(1)

In [128]:
class GameRecsBias(nn.Module):
  def __init__(self, n_users, n_games, emb_dim):
    super(GameRecsBias, self).__init__()
    self.user_emb = nn.Embedding(n_users, emb_dim)
    self.user_bias = nn.Embedding(n_users, 1)
    self.game_emb = nn.Embedding(n_games, emb_dim)
    self.game_bias = nn.Embedding(n_games, 1)
    nn.init.xavier_uniform_(self.user_emb.weight)
    nn.init.xavier_uniform_(self.game_emb.weight)
    nn.init.zeros_(self.user_bias.weight)
    nn.init.zeros_(self.game_bias.weight)
    
  def forward(self, samples):
    users = self.user_emb(samples[:,0])
    games = self.game_emb(samples[:,1])
    dot = (users * games).sum(1)
    user_b = self.user_bias(samples[:,0]).squeeze()
    game_b = self.game_bias(samples[:,1]).squeeze()
    return dot + user_b + game_b

In [129]:
device = torch.device('cuda:0')

def run_test(model, ldr, crit):
  total_loss, total_count = 0, 0
  model.eval()
  tq_iters = tqdm(ldr, leave=False, desc='test iter')
  with torch.no_grad():
    for coords, labels in tq_iters:
      coords, labels = coords.to(device), labels.to(device)
      preds = model(coords)
      loss = crit(preds, labels)
      total_loss += loss.item() * labels.size(0)
      total_count += labels.size(0)
      tq_iters.set_postfix({'loss': total_loss/total_count}, refresh=True)
  return total_loss / total_count

def run_train(model, ldr, crit, opt, sched):
  model.train()
  total_loss, total_count = 0, 0
  tq_iters = tqdm(ldr, leave=False, desc='train iter')
  for (coords, labels) in tq_iters:
    opt.zero_grad()
    coords, labels = coords.to(device), labels.to(device)
    preds = model(coords)
    loss = crit(preds, labels)
    loss.backward()
    opt.step()
    sched.step()
    total_loss += loss.item() * labels.size(0)
    total_count += labels.size(0)
    tq_iters.set_postfix({'loss': total_loss/total_count}, refresh=True)
  return total_loss / total_count

def run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs=10):
  best_loss = np.inf
  tq_epochs = tqdm(range(n_epochs), desc='epochs', unit='ep')
  for epoch in tq_epochs:
    train_loss = run_train(model, ldr_train, crit, opt, sched)
    test_loss = run_test(model, ldr_test, crit)
    tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f}    test loss {test_loss:.6f}')
    if test_loss < best_loss:
      best_loss = test_loss
      tq_epochs.set_postfix({'bE': epoch, 'bL': best_loss}, refresh=True)

This is the record for our best run. Initially, we tried to train the model without data augmentaion, and got a result of around 30 for test loss. Then we tried data augmentation and changing parameters. Finially a test loss of around 2 is achieved. This has proved that data augmentation is very effective for our case, and further augmentation on the data will be explored carefully in the future to improve accuracy. The model with bias is also tested, but the result is worse than the one without bias, so we just used the model without bias in further steps

In [130]:
model = GameRecsBias(ds_full.n_users, ds_full.n_games, 30)
model.to(device)

ldr_train = torch.utils.data.DataLoader(ds_train, batch_size=32, shuffle=True)
ldr_test = torch.utils.data.DataLoader(ds_test, batch_size=32)

n_epochs = 10

crit = nn.MSELoss().to(device)
opt = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
sched = optim.lr_scheduler.OneCycleLR(opt, max_lr=0.1, steps_per_epoch=len(ldr_train), epochs=n_epochs)

run_all(model, ldr_train, ldr_test, crit, opt, sched, n_epochs)

HBox(children=(FloatProgress(value=0.0, description='epochs', max=10.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 0   train loss 10.949713    test loss 0.561472


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 1   train loss 0.141272    test loss 0.029963


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 2   train loss 0.013138    test loss 0.011166


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 3   train loss 0.005150    test loss 0.006608


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 4   train loss 0.002846    test loss 0.004703


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 5   train loss 0.001752    test loss 0.003772


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 6   train loss 0.001134    test loss 0.003283


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 7   train loss 0.000766    test loss 0.002971


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 8   train loss 0.000558    test loss 0.002817


HBox(children=(FloatProgress(value=0.0, description='train iter', max=44578.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='test iter', max=2347.0, style=ProgressStyle(description_w…

epoch 9   train loss 0.000464    test loss 0.002789



Save the model from best run to a binary file:

In [138]:
pickle.dump(model, open("drive/MyDrive/model_param.pickle", "wb"))

In [112]:
model = pickle.load(open('drive/MyDrive/model_param.pickle', 'rb'))

The following code blocks are all for testing the model in the way that we are actully going to integrate it in our project.

Our app will get up to three games as inputs from the user. So we first test the model with a list of three gameids:

In [100]:
class GameDatasetTest(torch.utils.data.Dataset):
  def __init__(self, df):
    self.coords = torch.LongTensor(df[['username','gameid']].values)
    self.ratings = torch.FloatTensor(df['rating'].values)
    self.n_users = df['username'].nunique()
    self.n_games = df['gameid'].nunique()

  def __len__(self):
      return len(self.coords)

  def __getitem__(self, i):
      return (self.coords[i], self.ratings[i])

In [None]:
def get_recs_user(gameids, model, games_list, num_recs):
  index = [games_list.index(i) for i in gameids]
  model.user_emb = nn.Embedding(1, 30)
  nn.init.xavier_uniform_(model.user_emb.weight)
  model.game_emb.weight.requires_grad = False
  data = []
  for i in index:
    data.append([i, 0, 10])
  df1 = pd.DataFrame(data, columns=['gameid','username','rating'])
  ds = GameDatasetTest(df1)
  ldr = torch.utils.data.DataLoader(ds, batch_size=1, shuffle=False)
  opt = optim.SGD([param for param in model.parameters() if param.requires_grad == True], lr=1e-6, momentum=0.9)
  crit = nn.MSELoss().to(device)
  sched = optim.lr_scheduler.OneCycleLR(opt, max_lr=1e-4, steps_per_epoch=len(ldr), epochs=50)
  model.to(device)
  for i in range(50):
    run_train(model, ldr, crit, opt, sched)
  preds = []
  for i in range(len(games_list)):
    if i not in index:
      product = model(torch.LongTensor([[0,i]]))
      preds.append([i, product])
  def sortFunc(p):
    return p[1]
  preds.sort(reverse=True, key=sortFunc)
  gameids = []
  for i in range(num_recs):
    gameid = games_list[preds[i][0]]
    gameids.append(gameid)
  return gameids

In [None]:
get_recs_user([136,137,141], model, unique_list, 5)

The inputs are: Assassin's Creed III Remastered, Assassin's Creed IV: Black Flag, and Assassin's Creed Rogue Remastered

The ouput of our model are: The Inpatient, Saints Row: Gat Out of Hell, Assassin's Creed Freedom Cry, Assassin's Creed: Revelations, Dynasty Warriors 9

The results include 2 other games from the assasin's creed series, and all the other games have some similar attributes as the input games, same as open-world genre. This suggests that our model is producing reasonable results for some inputs.

The model is also tested with item-to-item recommending by comparing the cosine similarity of all the entries in the games embedding, given a game as input:

In [134]:
def get_recs_game(gameid, model, games_list, num_recs):
  if gameid in [477,526,620,626,842,1000,1127,1130,1181,1194,1224,1403]:
    gameid = gameid - 1
  if gameid == 1404:
    gameid = 1402
  index = games_list.index(gameid)
  user_pref = model.game_emb.weight[index]
  cos = nn.CosineSimilarity(dim = 0)
  sims = []
  for i in range(len(games_list)):
    if i != index:
      game_param = model.game_emb.weight[i]
      similarity = cos(user_pref, game_param)
      sims.append([i, similarity])
  def sortFunc(p):
    return p[1]
  sims.sort(reverse=True, key=sortFunc)
  gameids = []
  for i in range(num_recs):
    gameid = games_list[sims[i][0]]
    gameids.append(gameid)
  return gameids

In [141]:
get_recs_game(319, model, unique_list, 10)

[520, 737, 1570, 283, 1292, 922, 735, 1355, 522, 732]

The input is Tom Clancy's The Division, and the outputs are Grow Home, Assassin's Creed Rogue Remastered, Tom Clancy's The Division 2, Knot, Grim Legends: The Forsaken Bride

The first three recommendations are reasonable, since they are all from Ubisoft and even another game in the same series is included. However the last two are not so similar to the input. We'll look into solving this problem by further augmenting the dataset or exploring fancier methods for deciding what to actually recommend