In [None]:
import pandas as pd
import numpy as np
from modules import data_sampling, SimRank, evaluation, BenchMark
from sklearn.model_selection import train_test_split

In [None]:
import random

# Seed every global RNG the notebook can draw from, so that Restart & Run All
# reproduces the same samples. `random.seed` only covers the stdlib generator;
# NumPy's global generator (the default source for pandas sampling when no
# random_state is given) has to be seeded separately.
# NOTE(review): presumably data_sampling.sample_df draws from NumPy's global
# RNG -- confirm in the module; if it takes a random_state, pass one there too.
random.seed(0)
np.random.seed(0)

In [None]:
# Load the two raw input tables from the working directory.
# `ratings_raw` feeds the rating samples (later cells read userId, movieId and
# timestamp from them); `tags_raw` provides the `relevance` column used for the
# tag cut-offs.
# NOTE(review): these look like the MovieLens ratings / tag-genome files -- confirm.
ratings_raw, tags_raw = (
    pd.read_csv(csv_path) for csv_path in ('ratings.csv', 'genome-scores.csv')
)

In [None]:
# Constants forwarded as `C_user` / `C_item` to the SimRank fits and `_cal_S`
# (presumably the SimRank decay factors -- confirm in the SimRank module).
C_user = 0.9
C_item = C_user

# Tag rows are kept only when `relevance` is strictly above the cut-off.
tag_relevance_cut_off = [0.1, 0.3, 0.5]

# Values tried for the `lbd` argument of `_cal_S`.
lambdas = [0.3, 0.5, 0.7, 0.9]

# Values tried for `k` in cf_recommendation / predict: 20, 40, 60, 80, 100.
k_neighbors = list(range(20, 101, 20))

# Values tried for `k` in evaluation.evaluation_at_k ("matric" presumably means "metric").
k_matric = [5, 10]

In [None]:
# Two independent instances of the same tag-aware SimRank class: `tag_cos` keeps
# the tag similarity produced by `fit`, while `tag_jac` gets its tag similarity
# overwritten with the Jaccard variant (how = 'jac') inside the experiment grids.
tag_cos, tag_jac = SimRank.tag_simrank(), SimRank.tag_simrank()

# Models compared against the tag variants: SimRank without tags, and the CF benchmark.
wbs, cf = SimRank.weighted_bipartite_simrank(), BenchMark.cf_recommendation()

In [None]:
# Accumulator for the first experiment: one dict per
# (lbd, cutoff, k_neighbors, k_matric) combination. Despite the name it receives
# rows for every value in `k_matric`, not only k = 5.
records_k5 = list()

In [None]:
# Experiment 1: sample requested with user_sample_n = 2000 / item_sample_n = 200,
# then a random 85/15 train/test split (fixed random_state).
ratings, tags = data_sampling.align_items(ratings_raw, tags_raw)
rating_sample = data_sampling.sample_df(ratings, user_sample_n = 2000, item_sample_n = 200)
rating_train, rating_test = train_test_split(rating_sample, test_size = 0.15, random_state = 42)

# Grouping of the test ratings, computed once and reused by every evaluation call.
user_grouped = evaluation.group(rating_test, 'user')

# Fit all four recommenders on the same training ratings.
cf.fit(rating_train)
tag_cos.fit(rating_train, tags, C_item = C_item, C_user = C_user, lbd = 0.3)
# tag_jac is fitted on tags above the lowest cut-off; its tag similarity is
# replaced for every cut-off inside the grid below.
tag_jac.fit(rating_train, tags[tags.relevance > 0.1], C_item = C_item, C_user = C_user, lbd = 0.3)
wbs.fit(rating_train, C_item = C_item, C_user = C_user)

# Grid: lbd -> tag relevance cut-off -> neighbourhood size -> metric cut-off.
for lbd in lambdas:
    # Recompute tag_cos similarities for this lbd and store them as labelled frames.
    # NOTE(review): the positional 100 and 1e-4 are presumably an iteration cap
    # and a convergence tolerance -- confirm against SimRank._cal_S.
    user_sim, item_sim = tag_cos._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
    tag_cos.S_item = pd.DataFrame(item_sim, index = tag_cos.items, columns = tag_cos.items)
    tag_cos.S_user = pd.DataFrame(user_sim, index = tag_cos.users, columns = tag_cos.users)

    for cutoff in tag_relevance_cut_off:
        # Jaccard tag similarity on the tags strictly above the cut-off, then the
        # tag_jac similarities that depend on it.
        relevant_tags = tags[tags.relevance > cutoff]
        tag_jac.S_tag_based = tag_jac._cal_tab_based_S(relevant_tags, how = 'jac', GPU = False)

        user_sim, item_sim = tag_jac._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
        tag_jac.S_item = pd.DataFrame(item_sim, index = tag_jac.items, columns = tag_jac.items)
        tag_jac.S_user = pd.DataFrame(user_sim, index = tag_jac.users, columns = tag_jac.users)

        for n_neighbors in k_neighbors:
            # Insertion order (tag_cos, tag_jac, wbs, cf) fixes both the call order
            # and the column order of the stored record.
            # NOTE(review): the wbs and cf predictions depend only on the
            # neighbourhood size here; if those calls are deterministic they could
            # be computed once per k outside the lbd/cutoff loops -- confirm first.
            predictions = {
                'tag_cos': tag_cos.cf_recommendation(rating_test, k = n_neighbors),
                'tag_jac': tag_jac.cf_recommendation(rating_test, k = n_neighbors),
                'wbs': wbs.cf_recommendation(rating_test, k = n_neighbors),
                'cf': cf.predict(rating_test, k = n_neighbors),
            }

            for top_n in k_matric:
                record = {
                    'lbd': lbd,
                    'cutoff': cutoff,
                    'k_neighbors': n_neighbors,
                    'k_matric': top_n,
                }
                for model_name, prediction in predictions.items():
                    precision, recall, ndcg = evaluation.evaluation_at_k(
                        rating_test,
                        prediction,
                        user_grouped = user_grouped,
                        k = top_n
                    )
                    record[f'precision_{model_name}'] = precision
                    record[f'recall_{model_name}'] = recall
                    record[f'ndcg_{model_name}'] = ndcg

                records_k5.append(record)

In [None]:
# Tabulate the first experiment's results. The grid above appends to
# `records_k5`; no variable called `records` is defined anywhere in the
# notebook, so referencing it raises NameError on a fresh kernel.
rst = pd.DataFrame(records_k5)

In [None]:
# Display the `rst` results frame (one row per grid combination).
rst

In [None]:
# Accumulator for the larger (4000-user / 1000-item) experiment: one dict per
# (lbd, cutoff, k_neighbors, k_matric) combination.
records_big = list()

In [None]:
# Experiment 2: sample requested with item_thresh = 50, user_sample_n = 4000 and
# item_sample_n = 1000, then a random 85/15 train/test split (fixed random_state).
ratings, tags = data_sampling.align_items(ratings_raw, tags_raw)
rating_sample = data_sampling.sample_df(ratings, item_thresh = 50, user_sample_n = 4000, item_sample_n = 1000)
rating_train, rating_test = train_test_split(rating_sample, test_size = 0.15, random_state = 42)

# Grouping of the test ratings, computed once and reused by every evaluation call.
user_grouped = evaluation.group(rating_test, 'user')

# Re-fit all four recommenders on the new training ratings.
cf.fit(rating_train)
tag_cos.fit(rating_train, tags, C_item = C_item, C_user = C_user, lbd = 0.3)
# tag_jac is fitted on tags above the lowest cut-off; its tag similarity is
# replaced for every cut-off inside the grid below.
tag_jac.fit(rating_train, tags[tags.relevance > 0.1], C_item = C_item, C_user = C_user, lbd = 0.3)
wbs.fit(rating_train, C_item = C_item, C_user = C_user)

# Grid: lbd -> tag relevance cut-off -> neighbourhood size -> metric cut-off.
for lbd in lambdas:
    # Recompute tag_cos similarities for this lbd and store them as labelled frames.
    # NOTE(review): the positional 100 and 1e-4 are presumably an iteration cap
    # and a convergence tolerance -- confirm against SimRank._cal_S.
    user_sim, item_sim = tag_cos._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
    tag_cos.S_item = pd.DataFrame(item_sim, index = tag_cos.items, columns = tag_cos.items)
    tag_cos.S_user = pd.DataFrame(user_sim, index = tag_cos.users, columns = tag_cos.users)

    for cutoff in tag_relevance_cut_off:
        # Jaccard tag similarity on the tags strictly above the cut-off, then the
        # tag_jac similarities that depend on it.
        relevant_tags = tags[tags.relevance > cutoff]
        tag_jac.S_tag_based = tag_jac._cal_tab_based_S(relevant_tags, how = 'jac', GPU = False)

        user_sim, item_sim = tag_jac._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
        tag_jac.S_item = pd.DataFrame(item_sim, index = tag_jac.items, columns = tag_jac.items)
        tag_jac.S_user = pd.DataFrame(user_sim, index = tag_jac.users, columns = tag_jac.users)

        for n_neighbors in k_neighbors:
            # Insertion order (tag_cos, tag_jac, wbs, cf) fixes both the call order
            # and the column order of the stored record.
            # NOTE(review): the wbs and cf predictions depend only on the
            # neighbourhood size here; if those calls are deterministic they could
            # be computed once per k outside the lbd/cutoff loops -- confirm first.
            predictions = {
                'tag_cos': tag_cos.cf_recommendation(rating_test, k = n_neighbors),
                'tag_jac': tag_jac.cf_recommendation(rating_test, k = n_neighbors),
                'wbs': wbs.cf_recommendation(rating_test, k = n_neighbors),
                'cf': cf.predict(rating_test, k = n_neighbors),
            }

            for top_n in k_matric:
                record = {
                    'lbd': lbd,
                    'cutoff': cutoff,
                    'k_neighbors': n_neighbors,
                    'k_matric': top_n,
                }
                for model_name, prediction in predictions.items():
                    precision, recall, ndcg = evaluation.evaluation_at_k(
                        rating_test,
                        prediction,
                        user_grouped = user_grouped,
                        k = top_n
                    )
                    record[f'precision_{model_name}'] = precision
                    record[f'recall_{model_name}'] = recall
                    record[f'ndcg_{model_name}'] = ndcg

                records_big.append(record)

In [None]:
# Results table of the first experiment (rows collected in `records_k5`).
rst_2000 = pd.DataFrame(data = records_k5)

In [None]:
# Results table built from everything collected in `records_big` so far.
rst_4000 = pd.DataFrame(data = records_big)

In [None]:
# Grid points of the first experiment where precision_tag_cos is strictly above precision_wbs.
rst_2000.loc[rst_2000['precision_tag_cos'] > rst_2000['precision_wbs']]

In [None]:
# Grid points of the larger experiment where precision_tag_jac is strictly above precision_wbs.
rst_4000.loc[rst_4000['precision_tag_jac'] > rst_4000['precision_wbs']]

In [None]:
# Display the current `rating_sample` frame. When cells run top to bottom this is
# the sample assigned by the most recent experiment cell above.
rating_sample

In [None]:
# Size summary of the current `rating_sample`: distinct users, distinct items.
n_user, n_item = (rating_sample[column].nunique() for column in ('userId', 'movieId'))

# Fraction of the user x item grid without a rating row, rounded to 5 decimals.
sparsity = round(1.0 - len(rating_sample) / float(n_user * n_item), 5)

print(
    f'number of users: {n_user}',
    f'number of items: {n_item}',
    f'sparsity: {sparsity * 100}%',
    sep = '\n'
)


In [None]:
import math

In [None]:
def train_test_split_by_time(df, test_size = 0.2):
    """Split every user's ratings chronologically into a train and a test frame.

    For each ``userId`` the rows are ordered by ``timestamp``. The latest
    ``floor(n * test_size)`` rows of that user go to the test frame and all
    earlier rows go to the train frame, where ``n`` is the user's row count.

    The split is a true partition: every input row ends up in exactly one of
    the two outputs, even when several rows of a user share a timestamp.
    Picking the two sides independently with ``nsmallest`` / ``nlargest`` does
    not guarantee that: a timestamp tie across the boundary puts the same row
    in both frames (test leakage) and silently drops another one.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with at least the columns ``userId`` and ``timestamp``.
    test_size : float, default 0.2
        Share of each user's rows assigned to the test frame (rounded down).

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Both carry all columns of ``df`` and a fresh 0..n-1 index.
        ``train_df`` is ordered by user, then ascending timestamp;
        ``test_df`` is ordered by user, then descending timestamp.
    """
    # One chronological ordering per user; every row receives a unique position.
    ordered = df.sort_values(['userId', 'timestamp'], kind = 'mergesort')
    by_user = ordered.groupby('userId', sort = False)

    position = by_user.cumcount().to_numpy()
    n_ratings = by_user['timestamp'].transform('size').to_numpy()
    n_test = np.floor(n_ratings * test_size)

    # The last `n_test` positions of each user form the test set. Plain arrays are
    # used for the mask so duplicate index labels in `df` cannot confuse alignment.
    is_test = position >= (n_ratings - n_test)

    train_df = ordered[~is_test].reset_index(drop = True)
    test_df = (
        ordered[is_test]
        .sort_values(['userId', 'timestamp'], ascending = [True, False], kind = 'mergesort')
        .reset_index(drop = True)
    )
    return train_df, test_df

In [None]:
# Sample drawn with sample_df's default sizes (20000 users, 1000 items per the
# counts printed further down), split per user by timestamp instead of at random.
rating_sample_time = data_sampling.sample_df(ratings)
rating_train_time, rating_test_time = train_test_split_by_time(df = rating_sample_time)

In [41]:
# Row counts of the time-based split: train first, then test, one per line.
print(len(rating_train_time), len(rating_test_time), sep = '\n')

1009114
242268


In [42]:
# Size summary of the time-split sample: distinct users, distinct items.
n_user, n_item = (rating_sample_time[column].nunique() for column in ('userId', 'movieId'))

# Fraction of the user x item grid without a rating row, rounded to 5 decimals.
sparsity = round(1.0 - len(rating_sample_time) / float(n_user * n_item), 5)

print(
    f'number of users: {n_user}',
    f'number of items: {n_item}',
    f'sparsity: {sparsity * 100}%',
    sep = '\n'
)

number of users: 20000
number of items: 1000
sparsity: 93.743%


In [None]:
# Experiment 3: same grid as the random-split experiments, run on the per-user
# time-based split (`rating_train_time` / `rating_test_time`).

# Results of this run get their own list. It is deliberately kept separate from
# `records_big` (the 4000-user random-split experiment) so the two result sets
# are never mixed in one table, and it is reset here so re-running the cell does
# not duplicate rows. Build the results frame with pd.DataFrame(records_time).
records_time = []

# Grouping of the test ratings, computed once and reused by every evaluation call.
user_grouped_time = evaluation.group(rating_test_time, 'user')

# Re-fit all four recommenders on the time-based training ratings.
cf.fit(rating_train_time)
tag_cos.fit(rating_train_time, tags, C_item = C_item, C_user = C_user, lbd = 0.3)
# tag_jac is fitted on tags above the lowest cut-off; its tag similarity is
# replaced for every cut-off inside the grid below.
tag_jac.fit(rating_train_time, tags[tags.relevance > 0.1], C_item = C_item, C_user = C_user, lbd = 0.3)
wbs.fit(rating_train_time, C_item = C_item, C_user = C_user)

# Grid: lbd -> tag relevance cut-off -> neighbourhood size -> metric cut-off.
for lbd in lambdas:
    # Recompute tag_cos similarities for this lbd and store them as labelled frames.
    # NOTE(review): the positional 100 and 1e-4 are presumably an iteration cap
    # and a convergence tolerance -- confirm against SimRank._cal_S.
    S_user, S_item = tag_cos._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
    tag_cos.S_item = pd.DataFrame(S_item, index = tag_cos.items, columns = tag_cos.items)
    tag_cos.S_user = pd.DataFrame(S_user, index = tag_cos.users, columns = tag_cos.users)

    for cutoff in tag_relevance_cut_off:
        # Jaccard tag similarity on the tags strictly above the cut-off, then the
        # tag_jac similarities that depend on it.
        tag_jac.S_tag_based = tag_jac._cal_tab_based_S(tags[tags.relevance > cutoff], how = 'jac', GPU = False)

        S_user, S_item = tag_jac._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
        tag_jac.S_item = pd.DataFrame(S_item, index = tag_jac.items, columns = tag_jac.items)
        tag_jac.S_user = pd.DataFrame(S_user, index = tag_jac.users, columns = tag_jac.users)

        for k in k_neighbors:
            # Insertion order (tag_cos, tag_jac, wbs, cf) fixes both the call order
            # and the column order of the stored record.
            # NOTE(review): the wbs and cf predictions depend only on k here; if
            # those calls are deterministic they could be computed once per k
            # outside the lbd/cutoff loops -- confirm before hoisting.
            predictions = {
                'tag_cos': tag_cos.cf_recommendation(rating_test_time, k = k),
                'tag_jac': tag_jac.cf_recommendation(rating_test_time, k = k),
                'wbs': wbs.cf_recommendation(rating_test_time, k = k),
                'cf': cf.predict(rating_test_time, k = k),
            }

            for n in k_matric:
                record = {
                    'lbd': lbd,
                    'cutoff': cutoff,
                    'k_neighbors': k,
                    'k_matric': n,
                }
                for model_name, prediction in predictions.items():
                    precision, recall, ndcg = evaluation.evaluation_at_k(
                        rating_test_time,
                        prediction,
                        user_grouped = user_grouped_time,
                        k = n
                    )
                    record[f'precision_{model_name}'] = precision
                    record[f'recall_{model_name}'] = recall
                    record[f'ndcg_{model_name}'] = ndcg

                records_time.append(record)