In [1]:
import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# Read the artist table from the AllMusic dump. header=None means the first
# row is treated as data, so the column names are supplied explicitly here.
artists = pd.read_csv(
    'data/allmusic/artists.txt',
    header=None,
    names=['name', 'url', 'active_period', 'genres', 'styles'],
)

In [6]:
# Derive an 'id' column from each artist URL: keep only the text that follows
# the last '-mn' marker (the whole URL is kept if the marker is absent).
artists['id'] = artists['url'].map(lambda url: url.rsplit('-mn', 1)[-1])

In [15]:
# Load the listing of artist ids that we have audio for.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left for the garbage collector.
with open('data/artists_with_audio.txt', 'r') as audio_listing:
    ids_with_audio_raw = audio_listing.readlines()

# Apply mild cleaning to isolate the id strings: keep whatever follows the last
# 'PRE ' marker on each line, then trim any '/' and newline characters from
# both ends.
# A list comprehension (rather than map) guarantees a real list, so the later
# len(ids_with_audio) calls and repeated iteration work on Python 2 and 3 alike.
ids_with_audio = [line.split('PRE ')[-1].strip('/\n') for line in ids_with_audio_raw]

In [21]:
# Compute id overlap between 2 lists
print "Number with audio:", len(ids_with_audio)
print "Number ids total:", len(np.unique(artists['id']))
print "Percentage node overlap:", len(ids_with_audio) / float(len(np.unique(artists['id'])))

Number with audio: 15460
Number ids total: 16704
Percentage node overlap: 0.925526819923


In [22]:
# Read the influence relationships. header=None means the first row is treated
# as data, so the four column names are supplied explicitly.
influences = pd.read_csv(
    'data/allmusic/influences.txt',
    header=None,
    names=['influencer_name', 'influencer_url', 'follower_name', 'follower_url'],
)

# Build the full directed graph: one edge per row, pointing from the
# influencer's id to the follower's id. Each id is the part of the URL that
# follows the last '-mn' marker.
full_influence_graph = nx.DiGraph()
full_influence_graph.add_edges_from(
    (rec.influencer_url.split('-mn')[-1], rec.follower_url.split('-mn')[-1])
    for rec in influences.itertuples()
)



In [25]:
# Restrict the influence graph to the artists we have audio for: the subgraph
# induced on the ids in ids_with_audio.
audio_subgraph = nx.subgraph(full_influence_graph, ids_with_audio)

In [26]:
print "Number of edges in subgraph:", audio_subgraph.number_of_edges()
print "Number of edges in full graph:", full_influence_graph.number_of_edges()
print "Percentage edge overlap:", audio_subgraph.number_of_edges() / float(full_influence_graph.number_of_edges())

Number of edges in subgraph: 89007
Number of edges in full graph: 93065
Percentage edge overlap: 0.956396067265
