In [42]:
import numpy as np
seed = 123
np.random.seed(seed)
import collections
from importlib import reload
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [43]:
from surprise import Reader, Dataset, SVD, NormalPredictor
# from surprise import evaluate
import surprise.model_selection

from sklearn.model_selection import cross_val_score
import sklearn.metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [44]:
import util.plot
import util.data
from util import clustering

In [45]:
# Re-execute the project helper modules so that edits made to them on disk
# are picked up without restarting the kernel.
for _module in (util.plot, util.data):
    reload(_module)

In [46]:
# Read only the first 50,000 rows of the ';'-separated training set.
data_all = pd.read_csv(
    'data/training_set_VU_DM_clean.csv',
    sep=';',
    nrows=50 * 1000,
)
# The return value is not used: rm_na presumably cleans `data_all`
# in place -- TODO confirm against util.data.
util.data.rm_na(data_all)
# Last expression of the cell: display (n_rows, n_columns).
data_all.shape

(50000, 85)

In [47]:
# Split into `data` and `data_test` with the project helper
# (util.data.train_test_split, not sklearn's function of the same name).
data, data_test = util.data.train_test_split(data_all)

# Build the cross-validation folds from `data` only; the folds are later
# unpacked as (i_train, i_test) pairs.
folds = util.data.cv_folds_for_sklearn(
    data,
    n_cv_folds=3,
    resampling_ratio=0,
)

In [48]:
def predict(model, scores: pd.DataFrame):
    """Predict a score for every (user, item) pair listed in `scores`.

    Parameters
    ----------
    model
        Fitted model exposing a Surprise-style ``predict(uid, iid, ...)``
        method whose return value has an ``est`` attribute.
    scores : pd.DataFrame
        Frame with at least the columns ``'user'`` and ``'item'``; one
        prediction is made per row.

    Returns
    -------
    collections.defaultdict
        Nested mapping ``{user: {item: predicted score}}``. If a pair occurs
        more than once, the last prediction wins.
    """
    results = collections.defaultdict(dict)
    # Iterate over the two id columns directly instead of `iterrows()`:
    # `iterrows()` coerces each row to a single dtype, which turns integer ids
    # into floats (and therefore into '1.0' after `str()`), and it is slow.
    for user, item in zip(scores['user'], scores['item']):
        # Surprise expects the user id first and the item id second.
        # NOTE(review): ids are passed as strings as before; raw ids loaded
        # through `Dataset.load_from_df` keep the frame's own type -- confirm
        # that the training ids are strings as well.
        result = model.predict(str(user), str(item), verbose=False)
        results[user][item] = result.est
    return results

In [49]:
# suppress warning to improve speed
pd.options.mode.chained_assignment = None  # default='warn'
reload(util.data)
reload(util.clustering)
# cluster_id_items_KMeans

# Squared errors per '<user model>-<item model>' combination, accumulated
# over all cross-validation folds.
cv_results = collections.defaultdict(list)
for i_train, i_test in folds:
    # Cluster users & items (fresh models for every fold)
    keys_search, keys_property, models_user, models_item = clustering.init(data_all)
#     clustering.init_df_columns(data_all, models_user, models_item)
    # Fit the clustering models on the training rows of this fold only.
    xy_train = data_all.loc[i_train]
    clustering.fit(xy_train, models_user, keys_search,'srch_id')
    clustering.fit(xy_train, models_item, keys_property,'prop_id')

    # predict train+test data
    users = clustering.predict(data_all, models_user, keys_search,
                 'srch_id', clustering.USER_KEY_PREFIX)
    items = clustering.predict(data_all, models_item, keys_property,
                 'prop_id', clustering.ITEM_KEY_PREFIX)

    # Write the predicted cluster columns back into `data_all`
    # (this mutates `data_all` in place on every fold).
    for k in users.columns:
        util.data.replace_extremely_uncommon(users, k)
        data_all.loc[users.index, k] = users[k]
    for k in items.columns:
        util.data.replace_extremely_uncommon(items, k)
        data_all.loc[items.index, k] = items[k]

    assert not items.isna().any().any()
    # Re-select the fold rows so that they include the new cluster columns.
    xy_train = data_all.loc[i_train]
    xy_test = data_all.loc[i_test]

    # train SVD's

    # check all combinations (of all user/item models)
    for k_user in users.columns:
        for k_item in items.columns:
            print(k_user, k_item)
            assert not data_all[k_user].isna().any()
            assert not data_all[k_item].isna().any()

            scores_train = util.data.scores_df(xy_train, k_user, k_item)
            scores_test = util.data.scores_df(xy_test, k_user, k_item)

            scores_train_ = Dataset.load_from_df(scores_train, Reader(rating_scale=(0,5)))
            model = SVD() # SVDpp NMF
            # The split is only used to obtain a Surprise trainset; the 1%
            # test part is discarded.
            trainset, _ = surprise.model_selection.train_test_split(scores_train_, test_size=0.01, random_state=seed)
            model.fit(trainset)
            scores_pred = clustering.svd_predict(model, scores_test)

            # Add the squared error of every test row. Only three columns are
            # needed, so iterate over them directly instead of materialising
            # a full row Series per test row with `iterrows()`.
            result_key = k_user + '-' + k_item
            for user_key, item_key, score in zip(xy_test[k_user], xy_test[k_item], xy_test['score']):
                score_pred = scores_pred[user_key][item_key]
                cv_results[result_key].append((score_pred - score)**2)

	extract_data(k: srch_id)
	extract_data(k: prop_id)
	KMeans (k: `cluster_id_users_KMeans`)
	FeatureAgglomeration (k: `cluster_id_users_FeatureAgglomeration`)
	KMeans (k: `cluster_id_items_KMeans`)
	FeatureAgglomeration (k: `cluster_id_items_FeatureAgglomeration`)
cluster_id_users_KMeans cluster_id_items_KMeans
cluster_id_users_KMeans cluster_id_items_FeatureAgglomeration
cluster_id_users_FeatureAgglomeration cluster_id_items_KMeans
cluster_id_users_FeatureAgglomeration cluster_id_items_FeatureAgglomeration
	extract_data(k: srch_id)
	extract_data(k: prop_id)
	KMeans (k: `cluster_id_users_KMeans`)
	FeatureAgglomeration (k: `cluster_id_users_FeatureAgglomeration`)
	KMeans (k: `cluster_id_items_KMeans`)
	FeatureAgglomeration (k: `cluster_id_items_FeatureAgglomeration`)
cluster_id_users_KMeans cluster_id_items_KMeans
cluster_id_users_KMeans cluster_id_items_FeatureAgglomeration
cluster_id_users_FeatureAgglomeration cluster_id_items_KMeans
cluster_id_users_FeatureAgglomeration cluster_id_ite

In [50]:
# Print one LaTeX-table-style row ("name & value & std") per model
# combination: first the mean of the squared errors, then their median.
# The last column is the standard deviation of the squared errors in both
# sections.
for label, aggregate in (('mse', np.mean), ('median', np.median)):
    print(label)
    for k, values in cv_results.items():
        print('%s & %0.4f & %0.4f' % (k, aggregate(values), np.std(values)))

mse
cluster_id_users_KMeans-cluster_id_items_KMeans & 5.8631 & 7.6999
cluster_id_users_KMeans-cluster_id_items_FeatureAgglomeration & 6.5585 & 8.9001
cluster_id_users_FeatureAgglomeration-cluster_id_items_KMeans & 5.6798 & 7.3421
cluster_id_users_FeatureAgglomeration-cluster_id_items_FeatureAgglomeration & 6.4911 & 8.7952
median
cluster_id_users_KMeans-cluster_id_items_KMeans & 0.8239 & 7.6999
cluster_id_users_KMeans-cluster_id_items_FeatureAgglomeration & 0.3787 & 8.9001
cluster_id_users_FeatureAgglomeration-cluster_id_items_KMeans & 1.0851 & 7.3421
cluster_id_users_FeatureAgglomeration-cluster_id_items_FeatureAgglomeration & 0.3787 & 8.7952


In [None]:
# Netflix-style use of SVD: predict the score for a known, individual user.
# Here: SVD is applied to similarity groups (clusters) of users and items instead of individual ids.