In [1]:
import networkx as nx

def get_artists(graph: nx.Graph):
    """Add one node per artist listed in ``artists.dat`` to ``graph``.

    The first line of the file is treated as a header and skipped. For every
    other line the first tab-separated field is used as the artist id; the
    node is named ``'a' + id`` and tagged with ``bipartite=0``.

    Args:
        graph: Graph that receives the artist nodes (modified in place).

    Returns:
        The same ``graph`` object, to allow call chaining.
    """
    with open('../../data/hetrec2011-lastfm-2k/artists.dat', encoding='UTF-8') as artist_file:
        rows = artist_file.readlines()

    # rows[0] is the column header, everything after it is data.
    for row in rows[1:]:
        artist_id = row.split('\t')[0]
        graph.add_node('a' + artist_id, bipartite=0)

    return graph

def get_users(graph: nx.Graph):
    """Add one node per user mentioned in ``user_friends.dat`` to ``graph``.

    The first line of the file is treated as a header and skipped. The first
    two tab-separated fields of every other line are both user ids; each
    distinct id becomes a node named ``'u' + id`` tagged with ``bipartite=1``.

    Args:
        graph: Graph that receives the user nodes (modified in place).

    Returns:
        The same ``graph`` object, to allow call chaining.
    """
    with open('../../data/hetrec2011-lastfm-2k/user_friends.dat', encoding='UTF-8') as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        seen = set()
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            words = line.split('\t')
            # Strip BEFORE the membership test: the last field on a line still
            # carries the newline, so comparing raw fields would never match
            # the same id seen in the other column.
            for user_id in (words[0].strip(), words[1].strip()):
                if user_id not in seen:
                    graph.add_node('u' + user_id, bipartite=1)
                    seen.add(user_id)
    return graph

def get_edges(graph:nx.Graph):
    """Add the weighted user-artist edges listed in ``user_artists.dat``.

    The first line of the file is treated as a header and skipped. Every
    other line contributes one edge between ``'u' + field0`` and
    ``'a' + field1`` whose ``weight`` attribute is the third field parsed as
    an integer.

    Args:
        graph: Graph that receives the edges (modified in place).

    Returns:
        The same ``graph`` object, to allow call chaining.
    """
    with open('../../data/hetrec2011-lastfm-2k/user_artists.dat', encoding='UTF-8') as edge_file:
        rows = edge_file.readlines()

    # rows[0] is the column header, everything after it is data.
    for row in rows[1:]:
        fields = row.split('\t')
        user_node = 'u' + fields[0]
        artist_node = 'a' + fields[1]
        graph.add_edge(user_node, artist_node, weight=int(fields[2].strip()))

    return graph

def get_timestamps(graph:nx.Graph):
    """Attach a ``timestamp`` attribute to edges from the tagging file.

    The first line of ``user_taggedartists-timestamps.dat`` is treated as a
    header and skipped. For every other line the fourth tab-separated field
    (kept as a stripped string, not converted to a number) is recorded for
    the pair ``('u' + field0, 'a' + field1)``. When a pair occurs on several
    lines, the value from the last such line is the one that is kept.

    Args:
        graph: Graph whose edges are annotated (modified in place).

    Returns:
        The same ``graph`` object, to allow call chaining.
    """
    with open('../../data/hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat', encoding='UTF-8') as tag_file:
        rows = [row.split('\t') for row in tag_file.readlines()]

    # Later rows overwrite earlier ones for the same (user, artist) key.
    timestamp_by_edge = {
        ('u' + fields[0], 'a' + fields[1]): fields[3].strip()
        for fields in rows[1:]
    }

    nx.set_edge_attributes(graph, timestamp_by_edge, name='timestamp')
    return graph

if __name__=='__main__':
    # Build the graph step by step: artist nodes, user nodes, weighted
    # user-artist edges, then the per-edge timestamps. Every loader returns
    # the graph it was given, so the result is simply threaded through.
    graph = nx.Graph()
    for loader in (get_artists, get_users, get_edges, get_timestamps):
        graph = loader(graph)

    # Quick size check of the assembled graph.
    print(graph.number_of_nodes())
    print(graph.number_of_edges())

19524
92834


In [2]:
# Load the pre-built graph. nx.readwrite.gpickle.read_gpickle was removed in
# NetworkX 3.0; it was a thin wrapper around pickle.load, so reading the file
# with the standard library works on both old and new NetworkX versions.
# WARNING: unpickling can execute arbitrary code - only load files you trust.
import pickle

with open("../../data/network.gpickle", "rb") as gpickle_file:
    graph_real = pickle.load(gpickle_file)

print("Number of nodes: ", graph_real.number_of_nodes())
print("Number of edges:", graph_real.number_of_edges())
print("\nNumber of connected components:", nx.number_connected_components(graph_real))

Number of nodes:  8678
Number of edges: 20665

Number of connected components: 188


In [3]:
from networkx.algorithms import bipartite
# Connected components as node sets, largest first.
Gcc = sorted(nx.connected_components(graph_real), key=len, reverse=True)

# Restrict the analysis to the largest connected component.
G0 = graph_real.subgraph(Gcc[0])
print("All nodes:", G0.number_of_nodes())

# Split the component into its two node sets using the 'bipartite' attribute.
# NOTE(review): here 0 is read as "user" and 1 as "artist", whereas
# get_artists/get_users in this notebook tag artists with 0 and users with 1.
# Presumably the pickled graph was built with the convention used here - confirm.
users_nodes = []
artists_nodes = []
for node, attrs in G0.nodes(data=True):
    side = attrs['bipartite']
    if side == 0:
        users_nodes.append(node)
    if side == 1:
        artists_nodes.append(node)

print("Artists:", len(artists_nodes), ", Users:", len(users_nodes))
print("Number of edges:", G0.number_of_edges())

All nodes: 8132
Artists: 6496 , Users: 1636
Number of edges: 20306


In [4]:
import numpy as np
# Mean degree over every node of the largest component.
avg_node_degree = np.mean([degree for _, degree in G0.degree])
print("Average node degree:  ", avg_node_degree)
# Note: the clustering coefficient of a bipartite graph is always 0, so
# nx.algorithms.cluster.average_clustering(G0) is not informative here.

# Mean degree per node set, e.g. the average number of artists per user.
# Membership is tested against sets: testing against the original lists would
# rescan the list for every node and make these two lines quadratic.
users_set = set(users_nodes)
artists_set = set(artists_nodes)
avg_user_degree   = np.mean([degree for node, degree in G0.degree if node in users_set])
avg_artist_degree = np.mean([degree for node, degree in G0.degree if node in artists_set])
print("Average user degree: ", avg_user_degree)
print("Average artist degree:", avg_artist_degree)

Average node degree:   4.994097393015249
Average user degree:  12.4119804400978
Average artist degree: 3.125923645320197


In [5]:
from pathlib import Path

import pandas as pd


class TrainTestGenerator:
    """Build year-based train/test splits from the ``hetrec2011-lastfm-2k`` files.

    The raw files are read from ``<data_dir>/hetrec2011-lastfm-2k``.
    """

    def __init__(self, data_dir: str):
        """Store the directory that contains the ``hetrec2011-lastfm-2k`` folder.

        Args:
            data_dir: Path to the data directory (string or path-like).
        """
        self.data_dir = Path(data_dir)

    def prepare_data(self):
        """Load, merge and filter the raw files.

        Returns:
            A DataFrame with one row per (userID, artistID) pair that appears
            in both ``user_artists.dat`` and
            ``user_taggedartists-timestamps.dat``. It holds the columns of the
            first file plus ``tagID`` and a datetime ``timestamp``. Rows whose
            timestamp year is 2000 or earlier are dropped and the index is
            reset.
        """
        dataset_dir = self.data_dir / "hetrec2011-lastfm-2k"

        # Read the raw data
        df_user_artists = pd.read_table(dataset_dir / "user_artists.dat")
        df_tagged = pd.read_table(dataset_dir / "user_taggedartists-timestamps.dat")

        # Collapse repeated (user, artist, tag) rows, keeping the earliest timestamp
        df_tagged = df_tagged.groupby(["userID", "artistID", "tagID"])["timestamp"].min().reset_index()

        # Merge the datasets - to have weights and timestamps - and keep a
        # single row per (user, artist) pair.
        # NOTE(review): drop_duplicates keeps the first merged row of each
        # pair, which is presumably the lowest tagID rather than the earliest
        # timestamp - confirm that this is the intended timestamp.
        df = pd.merge(
            df_user_artists,
            df_tagged,
            on=["userID", "artistID"]
        ).drop_duplicates(subset=["userID", "artistID"])

        # Parse datetime (the raw values are interpreted as milliseconds)
        df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")
        # Filter data: discard rows dated 2000 or earlier
        df = df[df["timestamp"].dt.year > 2000]
        df = df.reset_index(drop=True)

        return df

    def forward_chaining(self, first_test_year: int = 2008, last_test_year: int = 2011):
        """Yield forward-chaining splits, one per test year.

        For every year in ``first_test_year..last_test_year`` (inclusive) the
        training set holds all rows dated strictly before that year and the
        test set holds the rows dated within that year.

        Args:
            first_test_year: First year used as a test set (default 2008).
            last_test_year: Last year used as a test set (default 2011).

        Yields:
            Tuples ``(test_year, train, test)`` where ``train`` and ``test``
            are DataFrames sliced from ``prepare_data()``.
        """
        data = self.prepare_data()
        # The year of every row does not change between folds; compute it once.
        years = data["timestamp"].dt.year

        for test_year in range(first_test_year, last_test_year + 1):
            train = data[years < test_year]
            test = data[years == test_year]

            yield test_year, train, test

In [16]:
# Summary statistics of the merged data set (one row per user-artist pair).
class_example = TrainTestGenerator("../../data")
data = class_example.prepare_data()

n_rows = len(data)
n_users = len(data['userID'].unique())
n_artists = len(data['artistID'].unique())

print("Unique users:  ", n_users)
print("Unique artists:", n_artists)
print("Unique tags:  ", n_rows)
# Rows per distinct user / per distinct artist, rounded to two decimals.
print("Average user   node degree:", round(n_rows / n_users, 2))
print("Average artist node degree: ", round(n_rows / n_artists, 2))

Unique users:   1824
Unique artists: 6854
Unique tags:   20664
Average user   node degree: 11.33
Average artist node degree:  3.01


In [49]:
# Growth of the data set over time.
# `col` maps "whole days since the earliest timestamp" to the tuple
# (days, timestamp, #distinct users, #distinct artists, #rows), where the
# counts cover every row up to and including the last timestamp of that day
# bucket.
data = data.sort_values(by='timestamp')
earliest = data['timestamp'].min()

# Running totals computed in a single pass over the sorted rows. Re-filtering
# the whole frame for every row would be quadratic in the number of rows.
growth = pd.DataFrame({
    'days': (data['timestamp'] - earliest).dt.days,
    'time': data['timestamp'],
    # A row adds to the distinct count only the first time its id appears.
    'users': (~data['userID'].duplicated()).cumsum(),
    'artists': (~data['artistID'].duplicated()).cumsum(),
})
growth['rows'] = range(1, len(growth) + 1)

# Keep the final row of every day bucket: its running totals already include
# all earlier rows and every row of the bucket itself.
snapshots = growth.drop_duplicates(subset='days', keep='last')

# Convert to plain Python ints so the tuples print as bare numbers.
col = {
    int(row.days): (int(row.days), row.time, int(row.users), int(row.artists), int(row.rows))
    for row in snapshots.itertuples(index=False)
}

for key in col.keys():
    print(key, ":", col[key])

0 : (0, Timestamp('2005-07-31 22:00:00'), 11, 131, 149)
31 : (31, Timestamp('2005-08-31 22:00:00'), 19, 152, 173)
61 : (61, Timestamp('2005-09-30 22:00:00'), 22, 170, 192)
92 : (92, Timestamp('2005-10-31 23:00:00'), 24, 195, 228)
122 : (122, Timestamp('2005-11-30 23:00:00'), 33, 241, 288)
153 : (153, Timestamp('2005-12-31 23:00:00'), 35, 268, 322)
184 : (184, Timestamp('2006-01-31 23:00:00'), 41, 311, 373)
212 : (212, Timestamp('2006-02-28 23:00:00'), 47, 364, 456)
243 : (243, Timestamp('2006-03-31 22:00:00'), 56, 397, 513)
273 : (273, Timestamp('2006-04-30 22:00:00'), 63, 426, 569)
304 : (304, Timestamp('2006-05-31 22:00:00'), 70, 457, 619)
334 : (334, Timestamp('2006-06-30 22:00:00'), 84, 510, 722)
365 : (365, Timestamp('2006-07-31 22:00:00'), 101, 599, 859)
396 : (396, Timestamp('2006-08-31 22:00:00'), 116, 665, 979)
426 : (426, Timestamp('2006-09-30 22:00:00'), 134, 758, 1135)
457 : (457, Timestamp('2006-10-31 23:00:00'), 148, 839, 1314)
487 : (487, Timestamp('2006-11-30 23:00:00')

In [93]:
# print(earliest) # 2005-07-31 22:00:00
%matplotlib qt
x_days        = [col[key][ 0] for key in col.keys()]
y_connections = [col[key][-1] for key in col.keys()]
y_users       = [col[key][-3] for key in col.keys()]
y_artists     = [col[key][-2] for key in col.keys()]

years = [str(year) for year in np.arange(2006, 2012)]
year_days = [(pd.to_datetime(year+"-01-01")-earliest).days for year in years]

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6,5))
plt.plot(x_days, y_connections, linewidth=5, color="red", label="# Tags")
plt.plot(x_days, y_artists, linewidth=5, color="blue", label="# Artists")
plt.plot(x_days, y_users, linewidth=5, color="green", label="# Users")
plt.xlim([min(x_days), max(x_days)])
plt.xticks(year_days, years, fontsize=15)
plt.yticks(fontsize=15)
plt.legend(loc="upper left", fontsize=18)
plt.xlabel("Time [years]", fontsize=15)
plt.grid(True)
plt.title("The growth of the database through time", fontsize=15)
plt.tight_layout()
plt.show()