### Data: HetRec 2011 MovieLens + IMDb/Rotten Tomatoes
https://grouplens.org/datasets/hetrec-2011/
readme: http://files.grouplens.org/datasets/hetrec2011/hetrec2011-movielens-readme.txt

In [1]:
from freshgraph import data_loader
from freshgraph.data_loader import user_item_tuple_train, movie_meta, movies_validate, movie_genres, movie_tags

## load data into graph structure

In [None]:
%%time
movie_feature_graph = data_loader.get_graph()

### Step 1: candidating

In [2]:
import warnings
warnings.filterwarnings('ignore')
## tensorflow warning supression
from freshgraph import candidating

#### create metapath2vec based model for item to item similarity

##### Step 1: training similarity model

In [None]:
%%time
# training will take about 20+- mins, once trained this step can be skipped
metapath2vec_model = candidating.train_candidates_model(movie_feature_graph)
metapath2vec_model.save("movie.model")

##### step 1: load trained model

In [3]:
%%time
from gensim.models import Word2Vec
metapath2vec_model = Word2Vec.load("movie.model")

CPU times: user 245 ms, sys: 38.7 ms, total: 284 ms
Wall time: 281 ms


In [None]:
movies_validate[:10]

In [4]:
validate_id = 3884 #, Star Trek, 25940	The Lady from Shanghai	

similar_items = candidating.get_similar_items(metapath2vec_model, validate_id)
ordered_item_list = candidating.sort_similar_item_by_score(similar_items)
for x in ordered_item_list[:10]:
    print(movie_meta[movie_meta['id'] == int(x[0])]['title'].values)
    print(x[1])

['Star Trek IV: The Voyage Home']
0.5979886651039124
['Star Trek III: The Search for Spock']
0.5717388987541199
['Star Trek V: The Final Frontier']
0.5665578842163086
['Mission: Impossible III']
0.5640816688537598
['Titanica']
0.5558382272720337
['Star Trek: The Wrath of Khan']
0.5479453206062317
['Star Trek VI: The Undiscovered Country']
0.5397538542747498
['Mission: Impossible III']
0.5361419916152954
['Destiny in Space']
0.5297540426254272
['Bottle Shock']
0.5221686363220215


In [5]:
ranked_users = candidating.get_ranked_candidates(user_item_tuple_train, similar_items)
candidate_ids = [x[0] for x in ranked_users]

In [6]:
user_item_tuple_train[:5]

Unnamed: 0,userID,movieID
1,75,32
2,75,110
4,75,163
5,75,165
6,75,173


### Step 2: drift detection

#### create user-items and item-users mapping

In [7]:
%%time
from collections import defaultdict

# user-items tuples based on candidate
candidate_items_tuple = user_item_tuple_train[user_item_tuple_train['userID'].isin([int(x) for x in candidate_ids])]

# user-items tuples based on all positive pairs
# candidate_items_tuple = user_item_tuple_train

user_items_dict = defaultdict(list)
candidate_items_subgraph = [user_items_dict[x['userID']].append(x['movieID']) for _, x in candidate_items_tuple.iterrows()]

item_users_dict = defaultdict(list)
item_candidates_subgraph = [item_users_dict[x['movieID']].append(x['userID']) for _, x in candidate_items_tuple.iterrows()]


def partition_vertices(user_id, partition_user_dict, partition_item_dict, partition_index):
    partition_user_dict[user_id] = partition_index
    grouped_candidate_list = []
    for item_id in user_items_dict[user_id]:
        if item_id not in partition_item_dict:
            partition_item_dict[item_id]=partition_index
            grouped_candidate_list += item_users_dict[item_id]
#     print(grouped_candidate_list)
    for related_user_id in set(grouped_candidate_list):
        if related_user_id not in partition_user_dict:
            partition_vertices(related_user_id, partition_user_dict, partition_item_dict, partition_index)

CPU times: user 1min 6s, sys: 24.1 ms, total: 1min 6s
Wall time: 1min 6s


In [8]:
print(len(user_items_dict))
print(len(item_users_dict))

1067
9096


#### create partition on user and item nodes

In [11]:
import itertools
partition_user_dict={}
partition_item_dict={}
partition_count = 0

    
for user_id in candidate_ids:
    if user_id not in partition_user_dict:
        partition_vertices(user_id, partition_user_dict, partition_item_dict, partition_count)
        partition_count += 1



In [12]:
partition_count

1