In [1]:
import pandas as pd
import numpy as np
from modules import data_sampling, SimRank, evaluation, BenchMark
from sklearn.model_selection import train_test_split

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
import random 
random.seed(0)

In [4]:
import pickle as pkl

In [5]:
# Load data
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('genome-scores.csv')

In [6]:
ratings, tags = data_sampling.align_items(ratings, tags)

In [7]:
# sampling
rating_sample = data_sampling.sample_df(ratings, user_thresh = 0, item_thresh = 20, user_sample_n = 6000, item_sample_n = 3100, random_seed = 42)


number of users: 6000
number of items: 3014
number of ratings: 250592


In [8]:
# Train test split
rating_train, rating_test = train_test_split(rating_sample, test_size=0.20, random_state=42)

In [17]:
tag_cos = SimRank.tag_simrank()
tag_jac = SimRank.tag_simrank()
wbs = SimRank.weighted_bipartite_simrank()
cf = BenchMark.cf_recommendation()

In [9]:
C_item = C_user = 0.9
tag_relevance_cut_off = 0.3
lbd = 0.3
k_neighbors = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
k_matric = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [10]:
user_grouped = evaluation.group(rating_test, 'user')

In [21]:
cf.fit(
    rating_train
)
tag_cos.fit(
    rating_train, 
    tags, 
    C_item = C_item, 
    C_user = C_user, 
    lbd = 0.3
)
tag_jac.fit(
    rating_train, 
    tags[tags.relevance > 0.3], 
    C_item = C_item, 
    C_user = C_user, 
    lbd = 0.3
)
wbs.fit(
    rating_train, 
    C_item = C_item, 
    C_user = C_user
)

S_user, S_item = tag_cos._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
tag_cos.S_item = pd.DataFrame(S_item, index = tag_cos.items, columns = tag_cos.items)
tag_cos.S_user = pd.DataFrame(S_user, index = tag_cos.users, columns = tag_cos.users)

tag_jac.S_tag_based = tag_jac._cal_tab_based_S(tags[tags.relevance > tag_relevance_cut_off], how = 'jac', GPU = False)

S_user, S_item = tag_jac._cal_S(C_user, C_item, lbd, 100, 1e-4, GPU = False)
tag_jac.S_item = pd.DataFrame(S_item, index = tag_jac.items, columns = tag_jac.items)
tag_jac.S_user = pd.DataFrame(S_user, index = tag_jac.users, columns = tag_jac.users)

User count: 5998, item count: 2969
Initializing tab-based item similarity matrix...
Finished in 1277.3666360378265s!
Initializing user-W matrix...
Finished in 5.604381084442139s!
Initializing item-W matrix...
Finished in 2.7789549827575684s!
Initializing user evidence matrix...
Finished in 2021.7368178367615s!
Initializing item evidence matrix...
Finished in 1570.5498390197754s!
Iteration 1 / 100 start:
GPU failed, trying with CPU...
S_user updated in 5.6566009521484375, S_item updated in 6.135362148284912!
Iteration 2 / 100 start:
GPU failed, trying with CPU...
S_user updated in 5.294049024581909, S_item updated in 5.573145151138306!
Iteration 3 / 100 start:
GPU failed, trying with CPU...
S_user updated in 5.373220205307007, S_item updated in 5.411027908325195!
Iteration 4 / 100 start:
GPU failed, trying with CPU...
S_user updated in 6.1851770877838135, S_item updated in 8.202399969100952!
Converged at iteration 4, break!
User count: 5998, item count: 2969
Initializing tab-based item 

In [24]:
records = []

In [23]:
pred_tag_cos = tag_cos.cf_recommendation(rating_test, k = 100)
pred_tag_jac = tag_jac.cf_recommendation(rating_test, k = 100)
pred_wbs = wbs.cf_recommendation(rating_test, k = 100)
pred_cf = cf.predict(rating_test, k = 100)



Count of cold start user: 2
Count of cold start item: 45
557/5567 completed
1114/5567 completed
1671/5567 completed
2227/5567 completed
2784/5567 completed
3341/5567 completed
3897/5567 completed
4454/5567 completed
5011/5567 completed
5567/5567 completed
Count of cold start user: 2
Count of cold start item: 45
557/5567 completed
1114/5567 completed
1671/5567 completed
2227/5567 completed
2784/5567 completed
3341/5567 completed
3897/5567 completed
4454/5567 completed
5011/5567 completed
5567/5567 completed
Count of cold start user: 2
Count of cold start item: 45
557/5567 completed
1114/5567 completed
1671/5567 completed
2227/5567 completed
2784/5567 completed
3341/5567 completed
3897/5567 completed
4454/5567 completed
5011/5567 completed
5567/5567 completed
Count of cold start user: 2
Count of cold start item: 45
557/5567 completed
1114/5567 completed
1671/5567 completed
2227/5567 completed
2784/5567 completed
3341/5567 completed
3897/5567 completed
4454/5567 completed
5011/5567 comple

NameError: name 'records' is not defined

In [30]:
for n in k_matric:
    precision_tag_cos, recall_tag_cos, ndcg_tag_cos = evaluation.evaluation_at_k(
        rating_test,
        pred_tag_cos,
        user_grouped = user_grouped,
        k = n
    )
    
    group_ndcg_tag_cos = evaluation.ndcg_over_user_group(
        rating_test,
        pred_tag_cos,
        user_grouped,
        k = n,
        plot = False
    )
    
    precision_tag_jac, recall_tag_jac, ndcg_tag_jac = evaluation.evaluation_at_k(
        rating_test,
        pred_tag_jac,
        user_grouped = user_grouped,
        k = n
    )
    
    group_ndcg_tag_jac = evaluation.ndcg_over_user_group(
        rating_test,
        pred_tag_jac,
        user_grouped,
        k = n,
        plot = False
    )
        
    precision_wbs, recall_wbs, ndcg_wbs = evaluation.evaluation_at_k(
        rating_test,
        pred_wbs,
        user_grouped = user_grouped,
        k = n
    )
    
    group_ndcg_wbs = evaluation.ndcg_over_user_group(
        rating_test,
        pred_wbs,
        user_grouped,
        k = n,
        plot = False
    )
        
    precision_cf, recall_cf, ndcg_cf = evaluation.evaluation_at_k(
        rating_test,
        pred_cf,
        user_grouped = user_grouped,
        k = n
    )
    
    group_ndcg_cf = evaluation.ndcg_over_user_group(
        rating_test,
        pred_cf,
        user_grouped,
        k = n,
        plot = False
    )

    records.append({
        'lbd': lbd,
        'cutoff': tag_relevance_cut_off, 
        'k_neighbors': 100,
        'k_matric': n,
        'precision_tag_cos': precision_tag_cos,
        'recall_tag_cos': recall_tag_cos,
        'ndcg_tag_cos': ndcg_tag_cos,
        'precision_tag_jac': precision_tag_jac,
        'recall_tag_jac': recall_tag_jac,
        'ndcg_tag_jac': ndcg_tag_jac,
        'precision_wbs': precision_wbs,
        'recall_wbs': recall_wbs,
        'ndcg_wbs': ndcg_wbs,
        'precision_cf': precision_cf,
        'recall_cf': recall_cf,
        'ndcg_cf': ndcg_cf,
        'group_ndcg_tag_cos': group_ndcg_tag_cos,
        'group_ndcg_tag_jac': group_ndcg_tag_jac,
        'group_ndcg_wbs': group_ndcg_wbs,
        'group_ndcg_cf': group_ndcg_cf
    })
    with open('ndcg_rst.pkl', 'wb') as f:
        pkl.dump(records, f)

nDCG for each user group:  {'0-10': 0.22061953180420904, '10-50': 0.498793242156074, '50-100': 0.6551724137931034, '100-500': 0.7}
nDCG for each user group:  {'0-10': 0.2281863324663041, '10-50': 0.498793242156074, '50-100': 0.632183908045977, '100-500': 0.7}
nDCG for each user group:  {'0-10': 0.21896429415937574, '10-50': 0.4770716009654063, '50-100': 0.4942528735632184, '100-500': 0.6}
nDCG for each user group:  {'0-10': 0.23835422085599434, '10-50': 0.6025744167337088, '50-100': 0.8160919540229885, '100-500': 1.0}
nDCG for each user group:  {'0-10': 0.19676707181149464, '10-50': 0.43810434641131485, '50-100': 0.601813405898684, '100-500': 0.7773705614469083}
nDCG for each user group:  {'0-10': 0.20281687783477037, '10-50': 0.44650742428366624, '50-100': 0.5877180681339607, '100-500': 0.7386852807234542}
nDCG for each user group:  {'0-10': 0.19217478885879571, '10-50': 0.40611227670999556, '50-100': 0.4720199536072103, '100-500': 0.5226294385530916}
nDCG for each user group:  {'0-10

# Pearson

In [11]:
pearson_cf = BenchMark.cf_recommendation()

In [17]:
pearson_cf.fit(
    rating_train,
    how = 'pearson'
)

In [14]:
pearson_cf_records = []

In [18]:
for k in k_neighbors:
    pred_pearson_cf = pearson_cf.predict(rating_test, k = k)
            
    for n in k_matric:    
        precision_pearson_cf, recall_pearson_cf, ndcg_pearson_cf = evaluation.evaluation_at_k(
            rating_test,
            pred_pearson_cf,
            user_grouped = user_grouped,
            k = n
        )
        
        group_precision_cf = evaluation.precision_over_user_group(
            rating_test,
            pred_pearson_cf,
            user_grouped,
            k = n,
            plot = False
    )
        group_recall_cf = evaluation.recall_over_user_group(
            rating_test,
            pred_pearson_cf,
            user_grouped,
            k = n,
            plot = False
    )
                
        group_ndcg_cf = evaluation.ndcg_over_user_group(
            rating_test,
            pred_pearson_cf,
            user_grouped,
            k = n,
            plot = False
        )
            
        pearson_cf_records.append({
        'k_neighbors': k,
        'k_matric': n,
        'precision_cf': precision_pearson_cf,
        'recall_cf': recall_pearson_cf,
        'ndcg_cf': ndcg_pearson_cf,
        'group_precision_cf': group_precision_cf,
        'group_recall_cf': group_recall_cf,
        'group_ndcg_cf': group_ndcg_cf
    })
    with open('pearson_rst.pkl', 'wb') as f:
        pkl.dump(pearson_cf_records, f)

Count of cold start user: 2
Count of cold start item: 45
557/5567 completed
1114/5567 completed
1671/5567 completed
2227/5567 completed
2784/5567 completed
3341/5567 completed
3897/5567 completed
4454/5567 completed
5011/5567 completed
5567/5567 completed
Precision for each user group:  {'0-10': 0.1797, '10-50': 0.5302, '50-100': 0.8276, '100-500': 0.9}
Recall for each user group:  {'0-10': 0.0477, '10-50': 0.0282, '50-100': 0.0128, '100-500': 0.0073}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"])


nDCG for each user group:  {'0-10': 0.17971151572475763, '10-50': 0.5301689460981497, '50-100': 0.8275862068965517, '100-500': 0.9}
Precision for each user group:  {'0-10': 0.1485, '10-50': 0.4831, '50-100': 0.7874, '100-500': 0.85}
Recall for each user group:  {'0-10': 0.0777, '10-50': 0.0512, '50-100': 0.0244, '100-500': 0.0141}
nDCG for each user group:  {'0-10': 0.15959543283931096, '10-50': 0.4937556086512942, '50-100': 0.7964601189581404, '100-500': 0.8613147192765458}
Precision for each user group:  {'0-10': 0.129, '10-50': 0.4363, '50-100': 0.7586, '100-500': 0.8333}
Recall for each user group:  {'0-10': 0.1001, '10-50': 0.0685, '50-100': 0.0354, '100-500': 0.0209}
nDCG for each user group:  {'0-10': 0.15189208251604325, '10-50': 0.4583165284274679, '50-100': 0.7740964785070551, '100-500': 0.8469278726022755}
Precision for each user group:  {'0-10': 0.114, '10-50': 0.4105, '50-100': 0.7443, '100-500': 0.85}
Recall for each user group:  {'0-10': 0.117, '10-50': 0.0853, '50-100':

Precision for each user group:  {'0-10': 0.1345, '10-50': 0.4505, '50-100': 0.75, '100-500': 0.875}
Recall for each user group:  {'0-10': 0.1391, '10-50': 0.0935, '50-100': 0.0462, '100-500': 0.0292}
nDCG for each user group:  {'0-10': 0.17716411388240275, '10-50': 0.4802430704322981, '50-100': 0.7724970170093886, '100-500': 0.8831872463728884}
Precision for each user group:  {'0-10': 0.1209, '10-50': 0.4198, '50-100': 0.7287, '100-500': 0.86}
Recall for each user group:  {'0-10': 0.1558, '10-50': 0.1084, '50-100': 0.056, '100-500': 0.0356}
nDCG for each user group:  {'0-10': 0.17635984466194712, '10-50': 0.4561825992383935, '50-100': 0.7555953290110687, '100-500': 0.8722726572644952}
Precision for each user group:  {'0-10': 0.1105, '10-50': 0.3978, '50-100': 0.7088, '100-500': 0.8333}
Recall for each user group:  {'0-10': 0.1696, '10-50': 0.1225, '50-100': 0.0655, '100-500': 0.0415}
nDCG for each user group:  {'0-10': 0.17735629337954287, '10-50': 0.4380557265317679, '50-100': 0.73981

Recall for each user group:  {'0-10': 0.1788, '10-50': 0.1245, '50-100': 0.0659, '100-500': 0.0409}
nDCG for each user group:  {'0-10': 0.18709199455217432, '10-50': 0.443125984976609, '50-100': 0.7435243675722891, '100-500': 0.8540927724698012}
Precision for each user group:  {'0-10': 0.1082, '10-50': 0.3798, '50-100': 0.6995, '100-500': 0.8143}
Recall for each user group:  {'0-10': 0.1924, '10-50': 0.1364, '50-100': 0.0756, '100-500': 0.0473}
nDCG for each user group:  {'0-10': 0.18956642925226141, '10-50': 0.42493316704005113, '50-100': 0.7333227484990942, '100-500': 0.8491364981585792}
Precision for each user group:  {'0-10': 0.1014, '10-50': 0.3648, '50-100': 0.6753, '100-500': 0.7875}
Recall for each user group:  {'0-10': 0.205, '10-50': 0.1497, '50-100': 0.0836, '100-500': 0.0523}
nDCG for each user group:  {'0-10': 0.19306423638210024, '10-50': 0.41176084596020956, '50-100': 0.7151634567485698, '100-500': 0.8292567662489511}
Precision for each user group:  {'0-10': 0.0953, '10-

nDCG for each user group:  {'0-10': 0.19664837726423495, '10-50': 0.415452232023509, '50-100': 0.7125184258277848, '100-500': 0.8276298882426483}
Precision for each user group:  {'0-10': 0.0965, '10-50': 0.3512, '50-100': 0.6628, '100-500': 0.7889}
Recall for each user group:  {'0-10': 0.2208, '10-50': 0.1617, '50-100': 0.0921, '100-500': 0.0594}
nDCG for each user group:  {'0-10': 0.19999254401876923, '10-50': 0.4010843654919628, '50-100': 0.7003280190620562, '100-500': 0.8327504905535094}
Precision for each user group:  {'0-10': 0.0907, '10-50': 0.3368, '50-100': 0.6494, '100-500': 0.79}
Recall for each user group:  {'0-10': 0.2296, '10-50': 0.1719, '50-100': 0.1002, '100-500': 0.066}
nDCG for each user group:  {'0-10': 0.20333700800876792, '10-50': 0.3887211590219747, '50-100': 0.689411176159929, '100-500': 0.8306668785305928}
Count of cold start user: 2
Count of cold start item: 45
557/5567 completed
1114/5567 completed
1671/5567 completed
2227/5567 completed
2784/5567 completed
33

557/5567 completed
1114/5567 completed
1671/5567 completed
2227/5567 completed
2784/5567 completed
3341/5567 completed
3897/5567 completed
4454/5567 completed
5011/5567 completed
5567/5567 completed
Precision for each user group:  {'0-10': 0.236, '10-50': 0.6026, '50-100': 0.8276, '100-500': 1.0}
Recall for each user group:  {'0-10': 0.0644, '10-50': 0.0323, '50-100': 0.0128, '100-500': 0.0082}
nDCG for each user group:  {'0-10': 0.2359895956490896, '10-50': 0.6025744167337088, '50-100': 0.8275862068965517, '100-500': 1.0}
Precision for each user group:  {'0-10': 0.1927, '10-50': 0.5394, '50-100': 0.8218, '100-500': 0.95}
Recall for each user group:  {'0-10': 0.104, '10-50': 0.057, '50-100': 0.0255, '100-500': 0.0155}
nDCG for each user group:  {'0-10': 0.20867518711891211, '10-50': 0.5537120750315181, '50-100': 0.82313962290535, '100-500': 0.9613147192765459}
Precision for each user group:  {'0-10': 0.1651, '10-50': 0.4883, '50-100': 0.8008, '100-500': 0.8667}
Recall for each user gro

In [19]:
pearson_cf_records

[{'k_neighbors': 10,
  'k_matric': 1,
  'precision_cf': 0.2694449434165619,
  'recall_cf': 0.04276694585791592,
  'ndcg_cf': 0.2694449434165619,
  'group_precision_cf': {'0-10': 0.1797,
   '10-50': 0.5302,
   '50-100': 0.8276,
   '100-500': 0.9},
  'group_recall_cf': {'0-10': 0.0477,
   '10-50': 0.0282,
   '50-100': 0.0128,
   '100-500': 0.0073},
  'group_ndcg_cf': {'0-10': 0.17971151572475763,
   '10-50': 0.5301689460981497,
   '50-100': 0.8275862068965517,
   '100-500': 0.9}},
 {'k_neighbors': 10,
  'k_matric': 2,
  'precision_cf': 0.23450691575354768,
  'recall_cf': 0.07084710831766425,
  'ndcg_cf': 0.24547736385362465,
  'group_precision_cf': {'0-10': 0.1485,
   '10-50': 0.4831,
   '50-100': 0.7874,
   '100-500': 0.85},
  'group_recall_cf': {'0-10': 0.0777,
   '10-50': 0.0512,
   '50-100': 0.0244,
   '100-500': 0.0141},
  'group_ndcg_cf': {'0-10': 0.15959543283931096,
   '10-50': 0.4937556086512942,
   '50-100': 0.7964601189581404,
   '100-500': 0.8613147192765458}},
 {'k_neighbors