## Set-up

In [1]:
# Install Cornac

!pip install --quiet cornac==1.5.2 adjustText

[K     |████████████████████████████████| 10.0 MB 4.4 MB/s 
[?25h  Building wheel for adjustText (setup.py) ... [?25l[?25hdone


In [17]:
# Set seed

USE_SEED = True

if USE_SEED:
    SEED = 42
else:
    SEED = None

VERBOSE = True

In [7]:
# Install libraries

from collections import defaultdict
import cornac
from cornac.utils import cache
from datetime import datetime
import json
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import sys

In [4]:
# Set-up Google Drive

COLAB = True

if COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  path =  "/content/drive/MyDrive/mitb/recommender-systems-cs608/project/"
else:
  path = ""

Mounted at /content/drive


In [5]:
!git clone https://github.com/srendle/libfm.git
!make all -C libfm

Cloning into 'libfm'...
remote: Enumerating objects: 233, done.[K
remote: Total 233 (delta 0), reused 0 (delta 0), pack-reused 233[K
Receiving objects: 100% (233/233), 129.46 KiB | 1.75 MiB/s, done.
Resolving deltas: 100% (112/112), done.
make: Entering directory '/content/libfm'
cd src/libfm; make all
make[1]: Entering directory '/content/libfm/src/libfm'
g++ -O3 -Wall -c libfm.cpp -o libfm.o
mkdir -p ../../bin/
g++ -O3 -Wall libfm.o -o ../../bin/libFM
g++ -O3 -Wall -c tools/transpose.cpp -o tools/transpose.o
mkdir -p ../../bin/
g++ -O3 tools/transpose.o -o ../../bin/transpose
g++ -O3 -Wall -c tools/convert.cpp -o tools/convert.o
mkdir -p ../../bin/
g++ -O3 tools/convert.o -o ../../bin/convert
make[1]: Leaving directory '/content/libfm/src/libfm'
make: Leaving directory '/content/libfm'


## Load data

In [8]:
with open(path + "/data/original/wine-info.json") as f:
  wine_info = json.load(f)

In [9]:
wine_info = pd.DataFrame(wine_info)
wine_info

Unnamed: 0,wine_id,wine_url,wine_name,producer,region_name,country_name,average_rating,nr_ratings,wine_image_url,food_pairings,wine_style
0,1239886,/SG/en/clos-canarelli-corse-figari/w/1239886?y...,Corse Figari,Clos Canarelli,Corse Figari,France,4.2,216,//images.vivino.com/thumbs/5s4Unq0rQkumOx9etvh...,[],
1,5047087,/SG/en/abel-mendoza-monge-malvasia/w/5047087?y...,Malvasía,Abel Mendoza Monge,Rioja,Spain,3.9,74,//images.vivino.com/thumbs/wV951FvQTcKDD56rH4f...,"[Shellfish, Appetizers and snacks, Lean fish, ...",Spanish Rioja White
2,79489,/SG/en/clonakilla-viognier-nouveau/w/79489?yea...,Viognier Nouveau,Clonakilla,Canberra District,Australia,3.9,50,//images.vivino.com/thumbs/022fsiry7i5bo_150x2...,"[Pork, Rich fish (salmon, tuna etc), Spicy foo...",Australian Viognier
3,6857046,/SG/en/nepenthe-the-luminary-shiraz/w/6857046?...,The Luminary Shiraz,Nepenthe,Mount Lofty Ranges,Australia,0.0,1,//images.vivino.com/thumbs/GPm08f-uSRGDk7Pta52...,"[Beef, Lamb, Game (deer, venison), Poultry]",Australian Shiraz
4,5422196,/SG/en/basket-range-vineyard-blend/w/5422196?y...,Vineyard Blend,Basket Range,Adelaide Hills,Australia,3.8,16,//images.vivino.com/thumbs/kSRloTiESjiAgFdFCvS...,"[Beef, Pasta, Lamb, Game (deer, venison), Poul...",Australian Bordeaux Blend
...,...,...,...,...,...,...,...,...,...,...,...
44720,1371807,/SG/en/pasos-de-tango-cabernet-sauvignon/w/137...,Cabernet Sauvignon,Pasos de Tango,Mendoza,Argentina,0.0,9,//images.vivino.com/thumbs/Ulo2dsz2QTqFqPuhjAr...,[],
44721,15814,/SG/en/sella-mosca-tanca-farra-alghero/w/15814...,Tanca Farra Alghero,Sella & Mosca,Alghero,Italy,3.9,1441,//images.vivino.com/thumbs/BIzMemGOTE-XrrMKRzi...,[],
44722,1605551,/SG/en/chateau-haut-brisson-la-reserve-saint-e...,La Réserve Saint-Émilion Grand Cru,Château Haut-Brisson,Saint-Émilion Grand Cru,France,3.9,261,//images.vivino.com/thumbs/7i9vVv9fRbSIPnnOdf0...,[],
44723,9640053,/SG/en/witches-falls-wild-ferment-aglianico/w/...,Wild Ferment Aglianico,Witches Falls,South Australia,Australia,3.8,12,//images.vivino.com/thumbs/Vv-JVWAuRuqSJt5psm1...,[],


In [11]:
food_info = wine_info.copy()
food_info.drop(['wine_url', 'wine_name', 'producer', 'region_name', 'country_name', 'average_rating', 'nr_ratings', 'wine_image_url', 'wine_style'], axis=1, inplace=True)
food_info.head()

Unnamed: 0,wine_id,food_pairings
0,1239886,[]
1,5047087,"[Shellfish, Appetizers and snacks, Lean fish, ..."
2,79489,"[Pork, Rich fish (salmon, tuna etc), Spicy foo..."
3,6857046,"[Beef, Lamb, Game (deer, venison), Poultry]"
4,5422196,"[Beef, Pasta, Lamb, Game (deer, venison), Poul..."


In [38]:
foods = []
for i in range(len(food_info)):
  for j in range(len(food_info.food_pairings[i])):
    if food_info.food_pairings[i][j] not in foods:
      foods.append(food_info.food_pairings[i][j])
foods_df = pd.DataFrame(foods, columns=["food"])
foods_df
ids = pd.DataFrame(list(range(1,len(foods_df)+1)), columns=["id"])
ids
foods_ids = pd.concat([ids, foods_df], axis=1)
foods_ids

Unnamed: 0,id,food
0,1,Shellfish
1,2,Appetizers and snacks
2,3,Lean fish
3,4,Cured Meat
4,5,Pork
5,6,"Rich fish (salmon, tuna etc)"
6,7,Spicy food
7,8,Poultry
8,9,Beef
9,10,Lamb


In [None]:
wine_food_pairings = defaultdict(set)
for wine_id, food_pairings, *_ in food_info.itertuples(index=False):
  for i in range(len(food_pairings)):
    food = foods_ids.index[foods_ids["food"]==food_pairings[i]][0]
    wine_food_pairings[wine_id].add(foods_ids["id"][food])

In [64]:
wine_food_pairings

defaultdict(set,
            {'5047087': {1, 2, 3, 4},
             '79489': {5, 6, 7, 8},
             '6857046': {8, 9, 10, 11},
             '5422196': {8, 9, 10, 11, 12},
             '1238219': {9, 10, 11, 12},
             '1246050': {8, 9, 10, 13},
             '1776233': {4, 5, 6, 8, 14},
             '2051324': {1, 4, 6, 12, 15},
             '1725707': {8, 9, 10, 11},
             '6200146': {1, 4, 5, 16},
             '1633327': {9, 12, 13},
             '1173229': {9, 12, 13},
             '1222238': {4, 5, 6, 7, 16},
             '1148247': {1, 16, 17},
             '93655': {9, 12, 13},
             '2166006': {8, 9, 11, 13},
             '1129317': {5, 8, 11, 12, 13},
             '1689940': {1, 4, 5, 7, 8},
             '1400098': {8, 9, 10},
             '76031': {8, 9, 10, 11},
             '1212457': {8, 9, 12, 13},
             '63524': {1, 16, 17},
             '2150072': {8, 9, 10, 11},
             '1540587': {8, 9, 11, 13},
             '1174123': {9, 10, 13},
 

In [51]:
# Load training data

train_ratings_seen = pd.read_csv(path + 'train_test_split/train_ratings_seen.csv')
train_ratings_seen_df = pd.DataFrame(train_ratings_seen, columns=["user_id", "item_id", "rating"])
print(train_ratings_seen_df.shape)
train_ratings_seen_df.head()

(119922, 3)


Unnamed: 0,user_id,item_id,rating
0,34297304,1260259,4.0
1,34297304,1137544,3.0
2,34297304,1372673,1.0
3,34297304,2511537,3.0
4,34297304,92537,4.0


In [52]:
# Load test data

test_ratings_unseen = pd.read_csv(path + 'train_test_split/test_ratings_unseen.csv')
test_ratings_unseen_df = pd.DataFrame(test_ratings_unseen, columns=["user_id", "item_id", "rating"])
print(test_ratings_unseen_df.shape)
test_ratings_unseen_df.head()

(22492, 3)


Unnamed: 0,user_id,item_id,rating
0,34297304,1205151,1.0
1,34297304,19481,3.0
2,34297304,1251093,4.0
3,34297304,24169,2.0
4,34297304,1221087,2.0


In [53]:
# Concatenate the data sets
both_df = pd.concat([train_ratings_seen_df, test_ratings_unseen_df], ignore_index=True, axis=0)
print(both_df.shape)
both_df.head()

(142414, 3)


Unnamed: 0,user_id,item_id,rating
0,34297304,1260259,4.0
1,34297304,1137544,3.0
2,34297304,1372673,1.0
3,34297304,2511537,3.0
4,34297304,92537,4.0


## Data Statistics

In [55]:
n_users = train_ratings_seen.user_id.nunique()
n_wines = train_ratings_seen.item_id.nunique()
n_foods = foods_ids.id.nunique()

print("Number of users:", n_users)
print("Number of wines:", n_wines)
print("Number of ratings:", len(train_ratings_seen))
print("Number of foods:", n_foods)

Number of users: 5066
Number of wines: 39520
Number of ratings: 119922
Number of foods: 22


## Traditional Matrix Factorization

In [56]:
eval_method = cornac.eval_methods.BaseMethod.from_splits(
  train_data=list(train_ratings_seen_df.itertuples(index=False)), 
  test_data=list(test_ratings_unseen_df.itertuples(index=False)),
  exclude_unknowns=False, 
  verbose=VERBOSE,
  seed=SEED,
)

mf = cornac.models.MF(
  k=10, 
  max_iter=20, 
  learning_rate=0.01, 
  lambda_reg=0.02, 
  use_bias=True,
  verbose=VERBOSE, seed=SEED,
)

test_result, _ = eval_method.evaluate(
  model=mf, metrics=[cornac.metrics.RMSE()], user_based=False
)
print(test_result)

rating_threshold = 1.0
exclude_unknowns = False
---
Training data:
Number of users = 5066
Number of items = 39520
Number of ratings = 119922
Max rating = 5.0
Min rating = 1.0
Global mean = 3.9
---
Test data:
Number of users = 4855
Number of items = 9900
Number of ratings = 22492
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5066
Total items = 39520

[MF] Training started!


  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!

[MF] Evaluation started!


Rating:   0%|          | 0/22492 [00:00<?, ?it/s]

   |   RMSE | Train (s) | Test (s)
-- + ------ + --------- + --------
MF | 0.5663 |    0.3026 |   0.5627



## Factorization Machines with Contextual Information

In [60]:
user_id2idx = eval_method.global_uid_map
wine_id2idx = eval_method.global_iid_map

# create mapping for foods
food_id2idx = defaultdict()
for food_id, _ in foods_ids.itertuples(index=False):
  food_id2idx.setdefault(food_id, len(food_id2idx))
assert len(food_id2idx) == n_foods

In [66]:
def to_fm_sparse_fmt(rating, user_id, wine_id, foods):
  # order of features: user, item, food
  user_start_idx = 0
  wine_start_idx = n_users
  food_start_idx = wine_start_idx + n_wines
  return "{} {}:1 {}:1 {}\n".format(
    rating,
    user_id2idx[user_id],
    wine_id2idx[item_id] + wine_start_idx,
    " ".join("{}:1".format(food_id2idx[t] + food_start_idx) for food in foods)
  )

# save training data to file
with open("train.libfm", "w") as f:
  for user_id, item_id, rating, *_ in train_ratings_seen_df.itertuples(index=False):
    f.write(to_fm_sparse_fmt(rating, user_id, item_id, wine_food_pairings[(item_id)]))

# save test data to file
with open("test.libfm", "w") as f:
  for user_id, item_id, rating, *_ in test_ratings_unseen_df.itertuples(index=False):
    f.write(to_fm_sparse_fmt(rating, user_id, item_id, wine_food_pairings[(item_id)]))

In [67]:
with open("train.libfm", "r") as f:
  print(f)

<_io.TextIOWrapper name='train.libfm' mode='r' encoding='UTF-8'>


In [68]:
!head train.libfm

4.0 0:1 5066:1 
3.0 0:1 5067:1 
1.0 0:1 5068:1 
3.0 0:1 5069:1 
4.0 0:1 5070:1 
3.0 0:1 5071:1 
4.0 0:1 5072:1 
4.0 0:1 5073:1 
3.0 0:1 5074:1 
3.0 0:1 5075:1 


In [69]:
!head test.libfm

1.0 0:1 30835:1 
3.0 0:1 7055:1 
4.0 0:1 5978:1 
2.0 0:1 11559:1 
2.0 0:1 9328:1 
2.0 0:1 8549:1 
3.0 0:1 36345:1 
3.0 0:1 5197:1 
4.0 1:1 5212:1 
3.0 1:1 8516:1 


In [70]:
!./libfm/bin/libFM

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
-cache_size     cache size for data storage (only applicable if data is
                in binary format), default=infty
-dim            'k0,k1,k2': k0=use bias, k1=use 1-way interactions,
                k2=dim of 2-way interactions; default=1,1,8
-help           this screen
-init_stdev     stdev for initialization of 2-way factors; default=0.1
-iter           number of iterations; default=100
-learn_rate     learn_rate for SGD; default=0.1
-load_model     filename for reading the FM model
-meta           filename for meta information about dat

In [72]:
!./libfm/bin/libFM -task r -train train.libfm -test test.libfm -seed $SEED -dim "1,1,10" -iter 500

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=119922	num_values=239844	num_features=44586	min_target=1	max_target=5
Loading test... 	
has x = 0
has xt = 1
num_rows=22492	num_values=44984	num_features=44578	min_target=1	max_target=5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.691464	Test=0.674616
#Iter=  1	Train=0.613796	Test=0.632259
#Iter=  2	Train=0.601766	Test=0.617064
#Iter=  3	Train=0.59441	Test=0.608336
#Iter=  4	Train=0.588267	Test=0.602389
#Iter=  5	Train=0.581311	Test=0.597442
#Iter=  6	Train=0.575853	Test=0.593897
#Iter=  7