In [4]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

import itertools
from collections import OrderedDict
from tqdm import tqdm_notebook, tqdm
from scipy import stats

%matplotlib inline

# Load AllMusic Graph

In [2]:
# Read the AllMusic artist table. The file has no header row, so the column
# names are supplied explicitly.
artists = pd.read_csv(
    'data/allmusic/artists.txt',
    header=None,
    names=['name', 'url', 'active_period', 'genres', 'styles'],
)
# Read the AllMusic influence table (also headerless): each row pairs an
# influencer (name, url) with a follower (name, url).
influences = pd.read_csv(
    'data/allmusic/influences.txt',
    header=None,
    names=['influencer_name', 'influencer_url', 'follower_name', 'follower_url'],
)

In [3]:
# Build the directed influence graph: one edge per row of `influences`,
# pointing from the influencer's name to the follower's name.
allmusic = nx.DiGraph()
allmusic.add_edges_from(
    zip(influences['influencer_name'], influences['follower_name'])
)

# Load MusicBrainz Collaborators

In [5]:
# Load collaboration data from MusicBrainz.
# The GEXF file is parsed into a networkx graph; the summary printed in the
# next cell reports it as an undirected `Graph`.
mb = nx.read_gexf('data/musicbrainz/sdn-unweighted.gexf')

In [6]:
# Summary of the collaboration graph (type, node/edge counts, average degree).
# A single parenthesised argument prints identically under Python 2 and is
# also valid Python 3 syntax.
print(nx.info(mb))

Name: 
Type: Graph
Number of nodes: 271442
Number of edges: 650920
Average degree:   4.7960


In [20]:
# Number of connected components in the (undirected) collaboration graph.
# Parenthesised form prints identically under Python 2 and parses on Python 3.
print(nx.number_connected_components(mb))

26654


# Compute Edge Overlap Between AllMusic Graph and MusicBrainz Collaboration Graph

In [8]:
# Directed AllMusic edges as (influencer, follower) tuples.
allmusic_edges = set(allmusic.edges())
# The mb graph is undirected, so each collaboration is reported in only one
# orientation. Store both orientations so that a directed AllMusic edge can
# match a collaboration regardless of which way round it was reported.
# Iterating once (instead of `mb.edges() + [...]`) avoids relying on
# `edges()` returning a list, which only holds on networkx 1.x, and avoids
# building two throwaway lists.
mb_edges = set()
for node_a, node_b in mb.edges():
    mb_edges.add((node_a, node_b))
    mb_edges.add((node_b, node_a))

In [16]:
# Influence edges that also appear, in either orientation, as a collaboration.
# Computed once and reused for both the count and the ratio.
shared_edges = allmusic_edges.intersection(mb_edges)
print("Number of edges in intersection: %d" % len(shared_edges))
# NOTE: despite the label, the value printed is a fraction in [0, 1],
# not a percentage. The label is kept so the recorded output stays valid.
print("Percentage of AllMusic edges covered: %s"
      % (len(shared_edges) / float(len(allmusic_edges))))

Number of edges in intersection: 2936
Percentage of AllMusic edges covered: 0.035312654102


# Compute Node Overlap

In [17]:
# Node labels of each graph, as sets, so they can be intersected.
allmusic_nodes = set(allmusic.nodes())
mb_nodes = set(mb.nodes())

# Nodes whose label occurs in both graphs. Computed once and reused for both
# the count and the ratio.
shared_nodes = allmusic_nodes.intersection(mb_nodes)
print("Number of nodes in intersection: %d" % len(shared_nodes))
# NOTE: despite the label, the value printed is a fraction in [0, 1],
# not a percentage. The label is kept so the recorded output stays valid.
print("Percentage of AllMusic nodes covered: %s"
      % (len(shared_nodes) / float(len(allmusic_nodes))))

Number of nodes in intersection: 9585
Percentage of AllMusic nodes covered: 0.636919396638


# Compute Edge Overlap Restricted to Nodes Present in Both Graphs

In [22]:
# Node labels present in both graphs.
intersection_nodes = allmusic_nodes.intersection(mb_nodes)
# Keep only the AllMusic edges whose two endpoints both lie in the node
# intersection, i.e. edges that could possibly be matched in the mb graph.
allmusic_edges_filtered = {
    (source, target)
    for source, target in allmusic.edges()
    if source in intersection_nodes and target in intersection_nodes
}
# Intersect the *filtered* edge set so numerator and denominator refer to the
# same population. The count equals the unfiltered intersection, because every
# edge in mb_edges has both endpoints in mb and every AllMusic edge has both
# endpoints in AllMusic, but this form does not rely on that argument.
filtered_shared_edges = allmusic_edges_filtered.intersection(mb_edges)
# NOTE: despite the label, the value printed is a fraction in [0, 1].
print("Percentage of filtered AllMusic edges covered: %s"
      % (len(filtered_shared_edges) / float(len(allmusic_edges_filtered))))

 Percentage of filtered AllMusic edges covered: 0.0527175767152
