## 1. Setup


In [1]:
import os
import sys
from collections import defaultdict

import numpy as np
import pandas as pd; pd.set_option("max_colwidth", 0);
import re
import cornac
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

#%tensorflow_version 1.x
import tensorflow as tf

print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")
print(f"Tensorflow version: {tf.__version__}")

SEED = 42
VERBOSE = True

  from .autonotebook import tqdm as notebook_tqdm


FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.
System version: 3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]
Cornac version: 1.14.2
Tensorflow version: 1.15.0


In [3]:
#import cornac
from cornac.models import  WMF, MostPop, CTR
from cornac.eval_methods import RatioSplit,StratifiedSplit, BaseMethod

# Rating-prediction metric; created here but not included in `metrics` below.
rmse = cornac.metrics.RMSE()
# Ranking metrics that take no cutoff argument here.
auc=cornac.metrics.AUC()
mrr=cornac.metrics.MRR()
# Cutoff passed as k to the top-K ranking metrics below.
K=20

ncrr_20 = cornac.metrics.NCRR(k=K)
ndcg_20 = cornac.metrics.NDCG(k=K)
f1_20=cornac.metrics.FMeasure(k=K)
rec_20 = cornac.metrics.Recall(k=K)


metrics=[auc,mrr,ncrr_20,ndcg_20,f1_20,rec_20] # standardized metrics for model comparison

In [4]:
def value_counts_df(df, col):
    """Tabulate how often each distinct value of ``df[col]`` occurs.

    Returns a DataFrame with a single column named ``count``; its index holds
    the distinct values (index name set to ``col``), in the order produced by
    ``Series.value_counts``.
    """
    counts = df[col].value_counts().to_frame()
    counts.columns = ['count']
    counts.index.name = col
    return counts
def sparsity(df):
    """Return the filled fraction of the matrix spanned by the first two columns.

    The result is ``len(df)`` divided by the number of distinct values in the
    first column and then by the number of distinct values in the second
    column. Despite the name, this is the share of cells that are present,
    not the share that are missing.
    """
    first_col, second_col = df.columns[0], df.columns[1]
    n_first = df[first_col].nunique()
    n_second = df[second_col].nunique()
    return len(df) / n_first / n_second
def to_obj(df):
    """Build a cornac ``Dataset`` from the rows of ``df``.

    Each row is handed to ``Dataset.from_uir`` as a tuple without the index;
    presumably the columns are (user, item, rating) in that order.
    """
    row_tuples = df.itertuples(index=False)
    return cornac.data.Dataset.from_uir(row_tuples)


## 2. Data Preparation - using preprocessed reviews

In [5]:

# Paths of the pre-split rating files (one for training, one for testing).
url,url2='data/train_ratings_seen.csv','data/test_ratings_unseen.csv'

# header=0 with names=... drops each file's own header row and applies the
# same three column names to both frames.
train_df = pd.read_csv(url,sep=",",header=0, names=["UserID", "ItemID", "Rating"])
# errors='coerce' turns ratings that cannot be parsed as numbers into NaN.
train_df["Rating"]= pd.to_numeric(train_df["Rating"], errors='coerce')

test_df = pd.read_csv(url2,sep=",",header=0, names=["UserID", "ItemID", "Rating"])
test_df["Rating"]= pd.to_numeric(test_df["Rating"], errors='coerce')

# Per-user and per-item rating counts; their lengths are the numbers of
# distinct users and items in the training data.
UC=value_counts_df(train_df ,"UserID")
IC=value_counts_df(train_df,"ItemID")
NUSERS=len(UC)
NITEMS=len(IC)
print(f"Training data contains {NUSERS} users and {NITEMS} items")
# sparsity() returns ratings / (users * items), i.e. the filled fraction of
# the rating matrix, so the value is reported as a density.
percentage="{:.4%}".format(sparsity(train_df))
print(f"Density of training rating matrix is {percentage}")

Training data contains 5066 users and 39520 items
Spasity of training rating matrix is 0.0599%


In [6]:
# One placeholder review ("This wine") for every item that occurs in the
# training ratings, as a two-column (ItemID, Review) frame.
default = IC.reset_index()[["ItemID"]].assign(Review="This wine")

In [8]:
# Source file reportedly holds up to 10 reviews per item (user_id, item_id, review);
# only the columns at positions 1 and 2 are read.
review_df=pd.read_csv("data/reviews_10_processed.csv", header=0,usecols=[1,2])
# Reuse the placeholder frame's column names (ItemID, Review) so the two frames can be concatenated.
review_df.columns=default.columns

# Wine metadata table; only its 'Wine' column is touched here (cast to str).
name_df=pd.read_csv("data/wine_info_all.csv", header=0)
name_df['Wine']=name_df['Wine'].astype(str)
# Cast reviews to str so that joining them per item later cannot fail on non-string values.
review_df['Review']=review_df['Review'].astype(str)
# Prepend the placeholder rows so every item in `default` has at least one review text.
review_df=pd.concat([default,review_df])
#review_df_v2=pd.concat([default,review_df_v2])

In [10]:
review_df

Unnamed: 0,ItemID,Review
0,1652,This wine
1,1135843,This wine
2,7103,This wine
3,18978,This wine
4,14362,This wine
...,...,...
423838,2204182,caramel apples and butter a really nice wine typical aged bourgundy chardonnay sad to say that my boss have no taste at all thought that he had a palate worth his income i was wrong
423839,2204182,this 1er cru seizes the senses by confirming its terroir provenance but with in a modern kind of approachindeedeven though producers affirm the use of 20 new oaktypical vergelesses aromas of berry fruitplum geraniumvioletliquorice and earthiness are rapped up by the wines toast and vanillin notesbut in a subtle waymed bodythe wine seats on a fleshy structure with dry tanninss and a limey acidity that prolongs the fresh and mineral finishhold for couple of years to allow tannins integration
423840,2204182,excellent the makings of bankruptcy court beautiful perfumed nose that is intense and very aromatic with definite hints of oak very long length with excellent structure and a tingle of acidity minerally palate with hints of stone fruit and salted butter extremely well balanced wine a work of art
423841,2204182,8992100 light but intense the wine shows lichee and pear a nice fresh fruit touch but the minerality and the touch of oak is what it makes special a light mix inbetween dagueneau and coche not that niveau but the art of wine great value wwwrullivinsch


In [11]:

import pandas as pd

def selected_items(df):
    '''
    Map each user to the list of items they have already selected.

    df: DataFrame in (user, item, rating) column order. It must have exactly
        three columns because every row is unpacked into three values.

    Returns a dict {user: [item, ...]}. Users appear in order of first
    occurrence and each list keeps the row order (repeated items are kept).
    '''
    selected={}
    # Single pass: setdefault creates the user's list the first time the user is seen.
    for user, item, _ in df.itertuples(index=False):
        selected.setdefault(user, []).append(item)
    return selected
train_seleted=selected_items(train_df)
test_seleted=selected_items(test_df)

ERROR! Session/line number was not unique in database. History logging moved to new session 2079



### Preprocessing: combine all the reviews of each item; remove items with blank reviews


In [12]:

# One document per item: all review texts that share an ItemID are joined
# with single spaces, then ItemID is turned back into a regular column.
reviews = (
    review_df
    .groupby(["ItemID"])["Review"]
    .apply(" ".join)
    .reset_index()
)


In [13]:

# Combined ratings frame and flag, kept exactly as this cell has always defined them.
all_df = pd.concat([train_df, test_df])
combine_name = False

# Plain (user, item, rating) tuples, the form handed to BaseMethod.from_splits.
ratings = list(train_df.itertuples(index=False, name=None))
ratings_test = list(test_df.itertuples(index=False, name=None))

# (item id, merged review text) pairs, then split into two parallel lists.
review_data = list(reviews.itertuples(index=False, name=None))
item_ids = [item_id for item_id, _ in review_data]
docs = [doc for _, doc in review_data]

print(docs[100])

pleasant without being outstanding suitable bbq wine especially with atmosphere of a warm evening on the beach in stlucia oak blackberry rich very good pencil shavings taste full body but not fruit oaky and slightly tart good value easy tasting seams pure alcahol appears purple but actually deep ruby black fruit on the nose lovely soft inky texture full body medium tanninss high dryness med acidity bold fruit forward blackberry chilli black pepper a great wine and great qpr if you are looking for a big wine at a relatively small price point i can’t agree with the site ranking because this wine is very good complex harmonic and well structured 40  cheers   smooth  with some structure  good choice for the price excellent  perfect timing for this 2013 vintage  purple red color woody and fruit nicely structured but lacks some character


In [14]:
reviews.to_csv("data/reviews_10_dataframe_UID.csv")

In [15]:

# Text side-information for items: one merged review document per raw item id.
# Tokenizer and vocabulary settings are passed to cornac as-is; see cornac's
# TextModality documentation for their exact semantics.
item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=3000,
    max_doc_freq=0.5,
    use_idf=False
)

# Evaluate on the pre-made train/test tuples instead of letting cornac split the data.
# NOTE(review): rating_threshold presumably marks ratings >= 3.5 as positives for
# the ranking metrics — confirm against the cornac documentation.
evaluation_method=BaseMethod.from_splits(train_data=ratings, test_data=ratings_test,
                                         rating_threshold=3.5,
                                         exclude_unknowns=True,
                                         item_text=item_text_modality,
                                         verbose=VERBOSE,
                                         seed=SEED,
                                         )


rating_threshold = 3.5
exclude_unknowns = True
---
Training data:
Number of users = 5066
Number of items = 39520
Number of ratings = 119922
Max rating = 5.0
Min rating = 1.0
Global mean = 3.9
---
Test data:
Number of users = 4855
Number of items = 9900
Number of ratings = 22492
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5066
Total items = 39520


In [51]:

# Popularity baseline plus three CTR models that differ only in the number of
# latent factors k. The factor counts live in their own constant so that the
# module-level K (the top-K cutoff used when the metrics were created) is no
# longer overwritten and then deleted by this cell.
CTR_LATENT_DIMS = (20, 50, 80)

most_pop = cornac.models.MostPop()
ctr1, ctr2, ctr3 = [
    CTR(k=latent_dim, max_iter=60, a=1.0, b=0.01, lambda_u=0.001, lambda_v=0.001,
        verbose=VERBOSE, seed=SEED)
    for latent_dim in CTR_LATENT_DIMS
]
models= [most_pop,ctr1,ctr2,ctr3]

# Fit every model and report the shared metric list on the fixed split.
cornac.Experiment(eval_method=evaluation_method, models=models, metrics=metrics).run()

### Output and evaluation

In [18]:
def get_top_N(model,N, userList=None,dic_seleted=None, verbose=False):
    '''
    Top-N recommended raw item ids for each requested user.

    model:       fitted model exposing ``train_set.user_ids``,
                 ``train_set.item_ids`` and ``rank(user_index)``. Element 0 of
                 the value returned by ``rank`` is read as item indices in
                 recommendation order.
    N:           number of items to return per user.
    userList:    raw user ids to score; None means every user of the training set.
    dic_seleted: dict {raw user id: already selected raw item ids}, e.g.
                 selected_items(train_df). Those items are left out of the
                 result. None, or a user missing from the dict, means nothing
                 is excluded for that user.
    verbose:     if True, print every user id that is a multiple of 100
                 (this requires numeric user ids).

    Returns a dict {raw user id: list of at most N raw item ids}.
    Raises ValueError for a user id that is not in the training set.
    '''
    output={}
    users=list(model.train_set.user_ids)
    items=list(model.train_set.item_ids)
    # Raw id -> position lookup built once instead of a linear list.index per
    # user. setdefault keeps the first position of a repeated id, as list.index would.
    user_index={}
    for position, raw_id in enumerate(users):
        user_index.setdefault(raw_id, position)
    if userList is None:
        userList=users
    if dic_seleted is None:
        dic_seleted={}
    for user in userList:
        if verbose:
            if user%100==0: print(user)
        if user not in user_index:
            raise ValueError(f"{user!r} is not in the training set")
        uid=user_index[user]
        seen=set(dic_seleted.get(user, ()))
        # At most len(seen) of the leading candidates can be filtered out, so a
        # window of N + len(seen) always leaves N unseen items when the
        # catalogue has that many (a fixed N+50 window did not).
        candidates=(model.rank(uid))[0][:N+len(seen)]
        output[user]=[items[a] for a in candidates if items[a] not in seen][:N]

    assert (len(output)==len(userList))
    return output

def print_top_N(model,N, user, dic_seleted=None):
    '''
    Return (it does not print) the top-N recommended raw item ids for one user.

    model:       fitted model exposing ``train_set.user_ids``,
                 ``train_set.item_ids`` and ``rank(user_index)``. Element 0 of
                 the value returned by ``rank`` is read as item indices in
                 recommendation order.
    N:           number of items to return.
    user:        a single raw user id.
    dic_seleted: dict {raw user id: already selected raw item ids}; those items
                 are left out. None, or a user missing from the dict, means
                 nothing is excluded.

    Returns a list of at most N raw item ids.
    Raises ValueError if ``user`` is not in the training set.
    '''
    users=list(model.train_set.user_ids)
    items=list(model.train_set.item_ids)
    uid=users.index(user)
    seen=set(dic_seleted.get(user, ())) if dic_seleted is not None else set()
    # A window of N + len(seen) always leaves N unseen candidates when the
    # catalogue has that many; a fixed N+50 window could come up short.
    ranked=(model.rank(uid))[0][:N+len(seen)]
    return [items[a] for a in ranked if items[a] not in seen][:N]

def evaluate_model(model, N,train_seleted=None, test_seleted=None,test_df=None):
    '''
    Recall@N per test user when items selected in training are not recommended.

    model:         fitted model, forwarded to get_top_N.
    N:             number of recommendations per user.
    train_seleted: dict {user: items selected in training}, e.g.
                   selected_items(train_df); forwarded to get_top_N as the
                   exclusion dict.
    test_seleted:  dict {user: items selected in the test data}; built with
                   selected_items(test_df) when None.
    test_df:       test ratings frame, only read when test_seleted is None.

    Returns (recalls, mean): one recall per user of test_seleted, in its
    iteration order, and their numpy mean. A user's recall is the number of
    recommended items found in that user's test items divided by the length
    of the user's test list.
    '''
    # Ask for exactly N recommendations. Requesting N+50 here, as was done
    # before, made the reported figure recall@(N+50) instead of recall@N.
    to_select=get_top_N(model,N,dic_seleted=train_seleted)
    if test_seleted is None:
        test_seleted=selected_items(test_df)
    recalls=[]
    for user, relevant in test_seleted.items():
        relevant_set=set(relevant)  # constant-time membership checks
        hits=sum(1 for item in to_select[user] if item in relevant_set)
        recalls.append(hits/len(relevant))

    return recalls, np.mean(np.array(recalls))
    

In [19]:
#A dictionary of already selected items
train_seleted=selected_items(train_df)
test_seleted=selected_items(test_df)


In [20]:
# ctr1=CTR.load("CTR1_20_60_001_001/CTR/2022-06-21_11-00-23-909216.pkl", trainable=False)
# ctr2=CTR.load("CTR2_50_60_01_001/CTR/2022-06-21_11-00-23-927217.pkl", trainable=False)
# ctr3=CTR.load("CTR3_80_60_001_001/CTR/2022-06-21_11-00-23-959217.pkl", trainable=False)


# model_list=[ctr1,ctr2,ctr3]
# for m in model_list:
#     m.fit(train_set=to_obj(train_df), val_set=None)

### Print the wine information to assess the models' strengths
### What if we recommend only unseen items?

In [22]:
wine_info=pd.read_csv("data/wine_info_all.csv", index_col="Wine ID")

### User's current selection

In [44]:
my_id=58534725
topN=10

wine_info.loc[train_seleted[my_id]] 

Unnamed: 0_level_0,Wine,region,country,winery
Wine ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24056,Bin 389 Cabernet - Shiraz,South Australia,Australia,Penfolds
10241561,Le Tradition Châteauneuf-du-Pape,Châteauneuf-du-Pape,France,Clos du Caillou
8584824,Neumagener Rosengartchen Riesling Auslese,Mosel,Germany,Ansgar Clüsserath
18978,Sauvignon Blanc,Marlborough,New Zealand,Cloudy Bay
3071535,Private Selection Old Vines Malbec,Mendoza,Argentina,La Linda


### Model recommendations

#### Recommend most popular items

In [45]:
wine_info.loc[print_top_N(most_pop,topN,my_id,dic_seleted=train_seleted)] 

Unnamed: 0_level_0,Wine,region,country,winery
Wine ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1652,Tignanello,Toscana,Italy,Antinori
1135843,Réserve (de la Comtesse) Pauillac,Pauillac,France,Château Pichon Longueville Comtesse de Lalande
7103,Amarone della Valpolicella Classico,Amarone della Valpolicella Classico,Italy,Tommasi
14362,Pauillac (Grand Cru Classé),Pauillac,France,Château Pontet-Canet
75626,Amarone della Valpolicella,Amarone della Valpolicella,Italy,Montresor
7972,Rioja Gran Reserva 904,Rioja,Spain,La Rioja Alta
5078,Sassicaia,Bolgheri Sassicaia,Italy,Tenuta San Guido
75190,Rouge (Gaston Hochar),Bekaa Valley,Lebanon,Château Musar
86684,Brut Champagne,Champagne,France,Dom Pérignon
3347644,Neropasso Veneto Rosso,Veneto,Italy,Biscardo


#### CTR model recommendation

In [46]:
# Use ctr1 (the first CTR model defined in the training cell) directly:
# model_list is only assigned in commented-out code, so referencing it here
# raised NameError when the notebook was run top to bottom on a fresh kernel.
wine_info.loc[print_top_N(ctr1,topN,my_id,dic_seleted=train_seleted)]

Unnamed: 0_level_0,Wine,region,country,winery
Wine ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
73660,Knights Valley Cabernet Sauvignon,Knights Valley,United States,Beringer
2316699,Red Blend,Victoria,Australia,19 Crimes
1136381,Cabernet Sauvignon,Napa Valley,United States,Stags' Leap
79160,Grand Brut Champagne,Champagne,France,Perrier-Jouët
1136029,Amarone della Valpolicella Classico,Amarone della Valpolicella Classico,Italy,Cesari
1163903,Rioja Reserva,Rioja,Spain,Marqués de Riscal
1562589,Q Red Blend,Napa Valley,United States,Beringer
3736213,Cabernet Sauvignon,Victoria,Australia,19 Crimes
86684,Brut Champagne,Champagne,France,Dom Pérignon
79385,Sauvignon Blanc,Marlborough,New Zealand,Dog Point
