In [179]:
import os
import networkx as nx
import community as community_louvain
import random
import math
random.seed(42)
os.getcwd()

'/home/tduricic/Development/workspace/original-repos/GraphRec-WWW19'

In [73]:
datasets = ["epinions", "ciao", "lastfm"]
data = {}

def get_user_item_ratings(train_ratings_path, test_ratings_path):
    user_ids_train = []
    item_ids_train = []
    ratings_train = []
    user_ids_test = []
    item_ids_test = []
    ratings_test = []
    user_ids_all = []
    item_ids_all = []
    ratings_all = []
    with open(train_ratings_path, "r") as fp:
        for line in fp.readlines():
            tokens = line.split()
            user_id = int(tokens[0])
            item_id = int(tokens[1])
            rating = int(tokens[2])

            user_ids_train.append(user_id)
            item_ids_train.append(item_id)
            ratings_train.append(rating)

            user_ids_all.append(user_id)
            item_ids_all.append(item_id)
            ratings_all.append(rating)

    with open(test_ratings_path, "r") as fp:
        for line in fp.readlines():
            tokens = line.split()
            user_id = int(tokens[0])
            item_id = int(tokens[1])
            rating = int(tokens[2])

            user_ids_test.append(user_id)
            item_ids_test.append(item_id)
            ratings_test.append(rating)

            user_ids_all.append(user_id)
            item_ids_all.append(item_id)
            ratings_all.append(rating)

    return user_ids_train, item_ids_train, ratings_train, user_ids_test, item_ids_test, ratings_test, user_ids_all, item_ids_all, ratings_all

def get_social_connections_graph(social_connections_path):
    source_ids = []
    target_ids = []
    G = nx.Graph()
    with open(social_connections_path, "r") as fp:
        for line in fp.readlines():
            tokens = line.split()
            source_id = int(tokens[0])
            target_id = int(tokens[1])
            source_ids.append(source_id)
            target_ids.append(target_id)
            G.add_edge(source_id, target_id)
    return source_ids, target_ids, G

def get_users_in_train_but_not_in_test(user_ids_train, user_ids_test):
    return list(set(user_ids_train).difference(user_ids_test))

def get_users_in_test_but_not_in_train(user_ids_train, user_ids_test):
    return list(set(user_ids_test).difference(user_ids_train))

def get_items_in_train_but_not_in_test(item_ids_train, item_ids_test):
    return list(set(item_ids_train).difference(item_ids_test))

def get_items_in_test_but_not_in_train(item_ids_train, item_ids_test):
    return list(set(item_ids_test).difference(item_ids_train))

def get_users_in_ratings_but_not_in_social(user_ids_all, G):
    return list(set(user_ids_all).difference(G.nodes()))

def get_users_in_social_but_not_in_ratings(user_ids_all, G):
    return list(set(G.nodes()).difference(user_ids_all))

def get_component_sizes(G):
    return [len(c) for c in sorted(nx.connected_components(G), key=len, reverse=True)]

def get_communities(G):
    communities = {}
    graph_communities = community_louvain.best_partition(G)
    for user_id in graph_communities:
        community_id = graph_communities[user_id]
        if community_id not in communities:
            communities[community_id] = []
        communities[community_id].append(user_id)
    return communities

def get_item_ratings_dict(user_ids, item_ids, ratings):
    item_ratings_dict = {}
    for i in range(len(item_ids)):
        user_id = user_ids[i]
        item_id = item_ids[i]
        rating = ratings[i]
        if item_id not in item_ratings_dict:
            item_ratings_dict[item_id] = {"user_ids": [], "ratings": []}
        item_ratings_dict[item_id]["user_ids"].append(user_id)
        item_ratings_dict[item_id]["ratings"].append(rating)
    return item_ratings_dict

def get_user_ratings_dict(user_ids, item_ids, ratings):
    user_ratings_dict = {}
    for i in range(len(user_ids)):
        user_id = user_ids[i]
        item_id = item_ids[i]
        rating = ratings[i]
        if user_id not in user_ratings_dict:
            user_ratings_dict[user_id] = {"item_ids": [], "ratings": []}
        user_ratings_dict[user_id]["item_ids"].append(item_id)
        user_ratings_dict[user_id]["ratings"].append(rating)
    return user_ratings_dict


for dataset in datasets:
    print(dataset)
    data[dataset] = {}
    train_ratings_path = "./data/" + dataset + "/" + dataset + "_ratings_final_train.tsv"
    test_ratings_path = "./data/" + dataset + "/" + dataset + "_ratings_final_test.tsv"
    social_connections_path = "./data/" + dataset + "/" + dataset + "_social_connections_final.tsv"
    user_ids_train, item_ids_train, ratings_train, user_ids_test, item_ids_test, ratings_test, user_ids_all, item_ids_all, ratings_all = get_user_item_ratings(train_ratings_path, test_ratings_path)
    source_ids, target_ids, G = get_social_connections_graph(social_connections_path)
    data[dataset]["user_ids_train"] = user_ids_train
    data[dataset]["item_ids_train"] = item_ids_train
    data[dataset]["ratings_train"] = ratings_train
    data[dataset]["user_ids_test"] = user_ids_test
    data[dataset]["item_ids_test"] = item_ids_test
    data[dataset]["ratings_test"] = ratings_test
    data[dataset]["user_ids_all"] = user_ids_all
    data[dataset]["item_ids_all"] = item_ids_all
    data[dataset]["ratings_all"] = ratings_all
    data[dataset]["source_ids"] = source_ids
    data[dataset]["target_ids"] = target_ids
    data[dataset]["graph"] = G
    data[dataset]["all_user_ratings_dict"] = get_user_ratings_dict(user_ids_all, item_ids_all, ratings_all)
    data[dataset]["all_item_ratings_dict"] = get_item_ratings_dict(user_ids_all, item_ids_all, ratings_all)

    data[dataset]["users_in_train_but_not_in_test"] = get_users_in_train_but_not_in_test(list(set(user_ids_train)), list(set(user_ids_test)))
    print("users_in_train_but_not_in_test: " + str(len(data[dataset]["users_in_train_but_not_in_test"])))

    data[dataset]["users_in_test_but_not_in_train"] = get_users_in_test_but_not_in_train(list(set(user_ids_train)), list(set(user_ids_test)))
    print("users_in_test_but_not_in_train: " + str(len(data[dataset]["users_in_test_but_not_in_train"])))

    data[dataset]["items_in_train_but_not_in_test"] = get_items_in_train_but_not_in_test(list(set(item_ids_train)), list(set(item_ids_test)))
    print("items_in_train_but_not_in_test: " + str(len(data[dataset]["items_in_train_but_not_in_test"])))

    data[dataset]["items_in_test_but_not_in_train"] = get_items_in_test_but_not_in_train(list(set(item_ids_train)), list(set(item_ids_test)))
    print("items_in_test_but_not_in_train: " + str(len(data[dataset]["items_in_test_but_not_in_train"])))

    data[dataset]["users_in_ratings_but_not_in_social"] = get_users_in_ratings_but_not_in_social(list(set(user_ids_all)), G)
    print("users_in_ratings_but_not_in_social: " + str(len(data[dataset]["users_in_ratings_but_not_in_social"])))

    data[dataset]["users_in_social_but_not_in_ratings"] = get_users_in_social_but_not_in_ratings(list(set(user_ids_all)), G)
    print("users_in_social_but_not_in_ratings: " + str(len(data[dataset]["users_in_social_but_not_in_ratings"])))

    data[dataset]["component_sizes"] = get_component_sizes(G)
    print("component_sizes: " + str(data[dataset]["component_sizes"]))

    data[dataset]["communities"] = get_communities(data[dataset]["graph"])

epinions
users_in_train_but_not_in_test: 519
users_in_test_but_not_in_train: 1
items_in_train_but_not_in_test: 178612
items_in_test_but_not_in_train: 36492
users_in_ratings_but_not_in_social: 0
users_in_social_but_not_in_ratings: 1
component_sizes: [17990, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
ciao
users_in_train_but_not_in_test: 453
users_in_test_but_not_in_train: 0
items_in_train_but_not_in_test: 70874
items_in_test_but_not_in_train: 14329
users_in_ratings_but_not_in_social: 0
users_in_social_but_not_in_ratings: 0
component_sizes: [7305, 2, 2, 2, 2, 2, 2]
lastfm
users_in_train_but_not_in_test: 3
users_in_test_but_not_in_train: 0
items_in_train_but_not_in_test: 83174
items_in_test_but_not_in_train: 3338
users_in_ratings_but_not_in_social: 0
users_in_social_but_not_in_ratings: 0
component_sizes: [3302]


In [77]:
len(data["lastfm"]["communities"])

6

In [75]:
for community_id in data["epinions"]["communities"]:
    print(str(community_id) + " : " + str(len(data["epinions"]["communities"][community_id])))

0 : 1278
1 : 5253
2 : 3854
3 : 4346
4 : 2867
5 : 3
6 : 6
7 : 31
8 : 64
9 : 6
10 : 34
11 : 2
12 : 3
13 : 3
14 : 4
15 : 4
17 : 26
37 : 58
19 : 3
20 : 3
21 : 2
23 : 3
24 : 6
25 : 2
27 : 5
28 : 5
29 : 4
30 : 2
31 : 4
32 : 4
34 : 3
35 : 3
36 : 3
38 : 4
39 : 3
40 : 4
41 : 2
42 : 3
43 : 5
44 : 4
45 : 4
46 : 3
47 : 2
48 : 4
49 : 3
51 : 4
52 : 4
53 : 2
54 : 3
55 : 6
56 : 4
57 : 3
58 : 2
59 : 2
60 : 2
61 : 2
62 : 3
63 : 3
64 : 5
65 : 2
66 : 2
67 : 4
68 : 2
69 : 3
70 : 2
71 : 2
72 : 2
73 : 2
74 : 3
75 : 2
76 : 3
77 : 2
78 : 2
79 : 2
80 : 2
81 : 2
82 : 2
83 : 2
84 : 2
85 : 2
86 : 2
87 : 2
88 : 2
89 : 2
90 : 2
91 : 2
92 : 4
93 : 2
94 : 2
95 : 2
96 : 2
97 : 2
98 : 2
99 : 2
33 : 2
16 : 2
18 : 2
50 : 2
22 : 2
26 : 2


In [34]:
for community_id in data["ciao"]["communities"]:
    print(str(community_id) + " : " + str(len(data["ciao"]["communities"][community_id])))

0 : 1360
1 : 2150
2 : 1129
4 : 1650
16 : 992
11 : 8
5 : 4
6 : 3
8 : 3
9 : 3
7 : 2
10 : 3
15 : 2
3 : 2
12 : 2
13 : 2
14 : 2


In [35]:
for community_id in data["lastfm"]["communities"]:
    print(str(community_id) + " : " + str(len(data["lastfm"]["communities"][community_id])))

0 : 565
1 : 1408
2 : 621
3 : 391
4 : 138
5 : 87
6 : 92


In [44]:
len(data["epinions"]["all_user_ratings_dict"])

{7272: {'item_ids': [2454,
   124906,
   59105,
   26761,
   11653,
   18038,
   109207,
   124905,
   124908,
   6644,
   5827,
   124907,
   29693,
   24398,
   124904,
   43915,
   24276],
  'ratings': [2, 4, 3, 2, 1, 3, 1, 2, 2, 2, 3, 2, 1, 0, 3, 0, 3]},
 17815: {'item_ids': [70153,
   37505,
   7533,
   201,
   2406,
   2399,
   4393,
   1699,
   6684,
   52069,
   37405,
   130168,
   4128,
   3247,
   16156,
   85257,
   2386,
   42825,
   36478,
   42875,
   90470,
   1658,
   105,
   4605,
   854,
   1655,
   137,
   26652,
   352,
   96778,
   829,
   4394,
   52534,
   469,
   7708,
   859,
   21717,
   793,
   10325,
   17,
   16215,
   96862,
   24896],
  'ratings': [4,
   4,
   3,
   4,
   4,
   4,
   4,
   3,
   3,
   4,
   3,
   4,
   4,
   3,
   4,
   4,
   4,
   4,
   2,
   4,
   4,
   4,
   2,
   3,
   4,
   4,
   4,
   3,
   4,
   4,
   3,
   2,
   1,
   4,
   4,
   4,
   4,
   4,
   3,
   4,
   4,
   4,
   0]},
 192: {'item_ids': [12717,
   3704,
   12329,
   12233

In [47]:
print(len(data["epinions"]["all_user_ratings_dict"]))
print(len(data["ciao"]["all_user_ratings_dict"]))
print(len(data["lastfm"]["all_user_ratings_dict"]))

18068
7317
3302


In [48]:
print(len(data["epinions"]["all_item_ratings_dict"]))
print(len(data["ciao"]["all_item_ratings_dict"]))
print(len(data["lastfm"]["all_item_ratings_dict"]))

261246
104975
252009


In [92]:
def get_items_rated_by_community(data, dataset):
    items_rated_by_community = {}
    unique_items_rated_by_community_count = {}
    community_social_densities = {}
    community_rating_densities = {}
    for community_id in data[dataset]["communities"]:
        items_rated_by_community[community_id] = []
        for user_id in data[dataset]["communities"][community_id]:
            if user_id not in data[dataset]["all_user_ratings_dict"]:
                print(user_id)
                continue
            items_rated_by_community[community_id] += data[dataset]["all_user_ratings_dict"][user_id]["item_ids"]
    for community_id in data[dataset]["communities"]:
        unique_items_rated_by_community_count[community_id] = len(set(items_rated_by_community[community_id]))

    for community_id in data[dataset]["communities"]:
        G = data[dataset]["graph"].subgraph(data[dataset]["communities"][community_id])
        community_social_densities[community_id] = len(G.edges) / ((len(G.nodes) * (len(G.nodes)-1)) / 2)
    return items_rated_by_community, unique_items_rated_by_community_count, community_social_densities

In [86]:
data["epinions"]["all_user_ratings_dict"][0]

{'item_ids': [154095,
  4116,
  1878,
  89433,
  199863,
  553,
  3233,
  37449,
  4747,
  199862,
  35791,
  601,
  8707,
  595,
  17374,
  5708,
  101967,
  1611,
  5635,
  3749,
  1887,
  11011,
  128537,
  308,
  43246,
  114404,
  35840,
  48292,
  59816,
  1648,
  3188,
  82583,
  104490,
  24201,
  4632,
  29250,
  112929,
  105,
  5564,
  21612,
  16094,
  41204,
  10926],
 'ratings': [4,
  3,
  3,
  4,
  4,
  1,
  3,
  3,
  4,
  4,
  4,
  1,
  4,
  3,
  0,
  4,
  4,
  4,
  3,
  4,
  3,
  3,
  4,
  4,
  4,
  3,
  4,
  3,
  3,
  4,
  2,
  1,
  3,
  4,
  4,
  4,
  4,
  2,
  4,
  3,
  3,
  4,
  4]}

In [115]:
# 1 i 3 i 4
dataset = "epinions"
items_rated_by_community, unique_items_rated_by_community_count, community_social_densities = get_items_rated_by_community(data, dataset)

original_social_density = len(data[dataset]["graph"].edges) / ((len(data[dataset]["graph"].nodes)*(len(data[dataset]["graph"].nodes)-1))/2)
original_rating_density = len(data[dataset]["ratings_all"]) / (len(set(data[dataset]["user_ids_all"])) * len(set(data[dataset]["item_ids_all"])))
print("orig_social_density: {0}, orig_ratings_density: {1}".format(round(original_social_density, 5), round(original_rating_density, 5)))

for community_id in data[dataset]["communities"]:
    num_users = len(data[dataset]["communities"][community_id])
    rating_density = len(items_rated_by_community[community_id]) / (num_users * len(items_rated_by_community[community_id]))
    num_edges = int((community_social_densities[community_id] * (num_users*(num_users-1))) / 2)
    if num_users > 100:
        print("community_id: {0}, num_users: {1}, num_items: {2}, num_ratings: {3}, num_edges: {4} social_density: {5}, ratings_density: {6}".format(community_id, num_users, unique_items_rated_by_community_count[community_id], len(items_rated_by_community[community_id]), num_edges, round(community_social_densities[community_id], 5), round(rating_density, 5)))

1894
orig_social_density: 0.00176, orig_ratings_density: 0.00016
community_id: 0, num_users: 1278, num_items: 29831, num_ratings: 46326, num_edges: 11633 social_density: 0.01426, ratings_density: 0.00078
community_id: 1, num_users: 5253, num_items: 106570, num_ratings: 219262, num_edges: 37237 social_density: 0.0027, ratings_density: 0.00019
community_id: 2, num_users: 3854, num_items: 102674, num_ratings: 186344, num_edges: 70900 social_density: 0.00955, ratings_density: 0.00026
community_id: 3, num_users: 4346, num_items: 90817, num_ratings: 181155, num_edges: 45249 social_density: 0.00479, ratings_density: 0.00023
community_id: 4, num_users: 2867, num_items: 64849, num_ratings: 113101, num_edges: 14345 social_density: 0.00349, ratings_density: 0.00035


In [118]:
# 1 i 10
dataset = "ciao"
items_rated_by_community, unique_items_rated_by_community_count, community_social_densities = get_items_rated_by_community(data, dataset)

original_social_density = len(data[dataset]["graph"].edges) / ((len(data[dataset]["graph"].nodes)*(len(data[dataset]["graph"].nodes)-1))/2)
original_rating_density = len(data[dataset]["ratings_all"]) / (len(set(data[dataset]["user_ids_all"])) * len(set(data[dataset]["item_ids_all"])))
print("orig_social_density: {0}, orig_ratings_density: {1}".format(round(original_social_density, 5), round(original_rating_density, 5)))

for community_id in data[dataset]["communities"]:
    num_users = len(data[dataset]["communities"][community_id])
    rating_density = len(items_rated_by_community[community_id]) / (num_users * len(items_rated_by_community[community_id]))
    num_edges = int((community_social_densities[community_id] * (num_users*(num_users-1))) / 2)
    if num_users > 100:
        print("community_id: {0}, num_users: {1}, num_items: {2}, num_ratings: {3}, num_edges: {4} social_density: {5}, ratings_density: {6}".format(community_id, num_users, unique_items_rated_by_community_count[community_id], len(items_rated_by_community[community_id]), num_edges, round(community_social_densities[community_id], 5), round(rating_density, 5)))

orig_social_density: 0.00318, orig_ratings_density: 0.00037
community_id: 11, num_users: 1253, num_items: 20953, num_ratings: 41186, num_edges: 10694 social_density: 0.01363, ratings_density: 0.0008
community_id: 1, num_users: 2129, num_items: 28222, num_ratings: 85296, num_edges: 16587 social_density: 0.00732, ratings_density: 0.00047
community_id: 9, num_users: 1460, num_items: 51474, num_ratings: 83005, num_edges: 18552 social_density: 0.01742, ratings_density: 0.00068
community_id: 3, num_users: 1262, num_items: 28441, num_ratings: 41697, num_edges: 9816 social_density: 0.01234, ratings_density: 0.00079
community_id: 10, num_users: 1168, num_items: 19219, num_ratings: 31566, num_edges: 6467 social_density: 0.00949, ratings_density: 0.00086


In [119]:
# 4 i 5 se cine najzanimljiviji, mrvicu pregust social_density
dataset = "lastfm"
items_rated_by_community, unique_items_rated_by_community_count, community_social_densities = get_items_rated_by_community(data, dataset)

original_social_density = len(data[dataset]["graph"].edges) / ((len(data[dataset]["graph"].nodes)*(len(data[dataset]["graph"].nodes)-1))/2)
original_rating_density = len(data[dataset]["ratings_all"]) / (len(set(data[dataset]["user_ids_all"])) * len(set(data[dataset]["item_ids_all"])))
print("orig_social_density: {0}, orig_ratings_density: {1}".format(round(original_social_density, 5), round(original_rating_density, 5)))

for community_id in data[dataset]["communities"]:
    num_users = len(data[dataset]["communities"][community_id])
    rating_density = len(items_rated_by_community[community_id]) / (num_users * len(items_rated_by_community[community_id]))
    num_edges = int((community_social_densities[community_id] * (num_users*(num_users-1))) / 2)
    if num_users > 100:
        print("community_id: {0}, num_users: {1}, num_items: {2}, num_ratings: {3}, num_edges: {4} social_density: {5}, ratings_density: {6}".format(community_id, num_users, unique_items_rated_by_community_count[community_id], len(items_rated_by_community[community_id]), num_edges, round(community_social_densities[community_id], 5), round(rating_density, 5)))

orig_social_density: 0.02622, orig_ratings_density: 0.00362
community_id: 0, num_users: 513, num_items: 104768, num_ratings: 530019, num_edges: 30742 social_density: 0.23409, ratings_density: 0.00195
community_id: 3, num_users: 1423, num_items: 197731, num_ratings: 1155093, num_edges: 26522 social_density: 0.02621, ratings_density: 0.0007
community_id: 2, num_users: 709, num_items: 152648, num_ratings: 696394, num_edges: 17826 social_density: 0.07102, ratings_density: 0.00141
community_id: 4, num_users: 417, num_items: 89581, num_ratings: 365104, num_edges: 15935 social_density: 0.18372, ratings_density: 0.0024
community_id: 5, num_users: 153, num_items: 72135, num_ratings: 195176, num_edges: 1674 social_density: 0.14396, ratings_density: 0.00654


In [134]:
dataset_community_to_store = {"epinions":0, "ciao":11, "lastfm":5}
final_social_edges = {}
final_user_item_ratings = {}
for dataset in dataset_community_to_store:
    community_id = dataset_community_to_store[dataset]
    community_user_ids = data[dataset]["communities"][community_id]
    G = data[dataset]["graph"].subgraph(community_user_ids)
    final_social_edges[dataset] = list(G.edges)
    final_user_item_ratings[dataset] =  dict((user_id, data[dataset]["all_user_ratings_dict"][user_id]) for user_id in community_user_ids if user_id in data[dataset]["all_user_ratings_dict"])

In [212]:
remapped_final_social_edges = {"epinions":[], "ciao":[], "lastfm":[]}
remapped_final_user_item_ratings = {"epinions":{}, "ciao":{}, "lastfm":{}}

user_id_mappings = {"epinions": {}, "ciao": {}, "lastfm": {}}
item_id_mappings = {"epinions": {}, "ciao": {}, "lastfm": {}}

user_item_ratings_to_store = {"epinions": [], "ciao": [], "lastfm": []}
train_user_item_ratings_to_store = {"epinions": [], "ciao": [], "lastfm": []}
test_user_item_ratings_to_store = {"epinions": [], "ciao": [], "lastfm": []}

for dataset in datasets:
    user_mapping_counter = 0
    item_mapping_counter = 0

    social_edges = final_social_edges[dataset]
    user_item_ratings = final_user_item_ratings[dataset]

    for social_edge in social_edges:
        source_user = social_edge[0]
        target_user = social_edge[1]
        if source_user not in user_id_mappings[dataset]:
            user_id_mappings[dataset][source_user] = user_mapping_counter
            user_mapping_counter += 1
        if target_user not in user_id_mappings[dataset]:
            user_id_mappings[dataset][target_user] = user_mapping_counter
            user_mapping_counter += 1
        remapped_final_social_edges[dataset].append((user_id_mappings[dataset][source_user], user_id_mappings[dataset][target_user]))
        remapped_final_social_edges[dataset].append((user_id_mappings[dataset][target_user], user_id_mappings[dataset][source_user]))

    for user_id in final_user_item_ratings[dataset]:
        user_id_mapping = user_id_mappings[dataset][user_id]
        if user_id_mapping not in remapped_final_user_item_ratings[dataset]:
            remapped_final_user_item_ratings[dataset][user_id_mapping] = {"item_ids": [], "ratings": []}
        for (item_id, rating) in zip(data[dataset]["all_user_ratings_dict"][user_id]["item_ids"], data[dataset]["all_user_ratings_dict"][user_id]["ratings"]):
            if item_id not in item_id_mappings[dataset]:
                item_id_mappings[dataset][item_id] = item_mapping_counter
                item_mapping_counter += 1
            item_id_mapping = item_id_mappings[dataset][item_id]
            remapped_final_user_item_ratings[dataset][user_id_mapping]["item_ids"].append(item_id_mapping)
            remapped_final_user_item_ratings[dataset][user_id_mapping]["ratings"].append(rating)

            user_item_ratings_to_store[dataset].append((user_id_mapping, item_id_mapping, rating))

    random.shuffle(user_item_ratings_to_store[dataset])
    train_user_item_ratings_to_store[dataset] = user_item_ratings_to_store[dataset][:math.floor(0.8*len(user_item_ratings_to_store[dataset]))]
    test_user_item_ratings_to_store[dataset] = user_item_ratings_to_store[dataset][math.floor(0.8*len(user_item_ratings_to_store[dataset])):]

    with open("./data/{0}/ratings_train_comm.tsv".format(dataset, dataset_community_to_store[dataset]), 'w+') as f:
        for (user_id_mapping, item_id_mapping, rating) in train_user_item_ratings_to_store[dataset]:
            f.write(str(user_id_mapping) + "\t" + str(item_id_mapping) + "\t" + str(rating) + "\n")

    with open("./data/{0}/ratings_test_comm.tsv".format(dataset), 'w+') as f:
        for (user_id_mapping, item_id_mapping, rating) in test_user_item_ratings_to_store[dataset]:
            f.write(str(user_id_mapping) + "\t" + str(item_id_mapping) + "\t" + str(rating) + "\n")

    with open("./data/{0}/social_connections_comm.tsv".format(dataset), 'w+') as f:
        for (source_user_id_mapping, target_user_id_mapping) in remapped_final_social_edges[dataset]:
            f.write(str(source_user_id_mapping) + "\t" + str(target_user_id_mapping) + "\n")

In [196]:
for dataset in datasets:
    print(len(user_item_ratings_to_store[dataset]))
    print(len(train_user_item_ratings_to_store[dataset]))
    print(len(test_user_item_ratings_to_store[dataset]))

46326
37060
9266
41186
32948
8238
195176
156140
39036


In [197]:
os.getcwd()

'/home/tduricic/Development/workspace/original-repos/GraphRec-WWW19'

In [206]:
print(' '.join(str(entry) for entry in train_user_item_ratings_to_store["epinions"][0]) + "\n")

380 6921 3



In [199]:
train_user_item_ratings_to_store["epinions"][0]

(380, 6921, 3)

In [181]:
lala[:math.floor(0.8*len(lala))]

[4, 3, 1]

In [169]:
user_item_ratings_to_store["ciao"]

[(1029, 26, 5),
 (633, 39, 5),
 (49, 100, 5),
 (162, 125, 5),
 (17, 769, 4),
 (302, 800, 5),
 (377, 820, 5),
 (100, 861, 4),
 (429, 901, 5),
 (214, 1093, 5),
 (428, 1189, 5),
 (8, 99, 5),
 (83, 1249, 4),
 (84, 39, 5),
 (23, 2028, 4),
 (67, 2142, 5),
 (178, 2148, 5),
 (201, 84, 4),
 (472, 2223, 2),
 (195, 2271, 4),
 (184, 2303, 5),
 (151, 2418, 4),
 (148, 2576, 5),
 (387, 45, 5),
 (59, 2728, 5),
 (61, 2768, 1),
 (50, 1063, 4),
 (90, 3261, 5),
 (117, 222, 5),
 (111, 2144, 5),
 (54, 3461, 4),
 (196, 3528, 5),
 (116, 3759, 5),
 (748, 40, 5),
 (1026, 3782, 4),
 (70, 3803, 4),
 (3, 3822, 5),
 (619, 3828, 4),
 (745, 3847, 5),
 (304, 3908, 5),
 (33, 3919, 4),
 (860, 3953, 5),
 (172, 4102, 4),
 (91, 4132, 5),
 (127, 870, 2),
 (415, 4157, 5),
 (812, 4172, 4),
 (167, 62, 5),
 (202, 58, 3),
 (923, 4261, 5),
 (105, 1264, 2),
 (79, 4323, 5),
 (224, 3635, 5),
 (414, 4616, 1),
 (160, 4657, 4),
 (5, 4699, 5),
 (464, 4747, 3),
 (714, 4755, 5),
 (430, 4799, 5),
 (143, 3831, 5),
 (123, 4940, 5),
 (193, 50

In [155]:
user_id_mappings["ciao"][4698]

1029

In [157]:
len(remapped_final_user_item_ratings["ciao"])

1253

In [165]:
lala = list(remapped_final_user_item_ratings["lastfm"])

In [166]:
max(lala)

152