In [2]:
import networkx as nx
import numpy as np
import pandas as pd

# Create Cleaned Version of AllMusic Data

## Influence Relationships

In [15]:
# Read the raw influence edge list. The file is parsed without a header row,
# so the four column names are supplied explicitly.
influences = pd.read_csv(
    'data/allmusic/influences.txt',
    header=None,
    names=['influencer_name', 'influencer_url', 'follower_name', 'follower_url'],
)
# Discard rows that are exact duplicates across every column.
influences = influences.drop_duplicates()

In [12]:
# Preview the first five influence rows as a quick sanity check.
influences.head(n=5)

Unnamed: 0,influencer_name,influencer_url,follower_name,follower_url
0,Brigitte Fontaine,https://www.allmusic.com/artist/brigitte-fonta...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
1,Kate Bush,https://www.allmusic.com/artist/kate-bush-mn00...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
2,Nina Hagen,https://www.allmusic.com/artist/nina-hagen-mn0...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
3,Zeena Parkins,https://www.allmusic.com/artist/zeena-parkins-...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
4,David Bowie,https://www.allmusic.com/artist/david-bowie-mn...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...


In [17]:
# Derive an id for each side of the relationship from its URL: the id is the
# text that follows the last '-mn' in the URL (a URL without '-mn' is kept
# unchanged).
# The vectorized .str accessor is used instead of apply(lambda ...) so that a
# missing URL yields a missing id rather than raising AttributeError on a float
# NaN.
influences['influencer_id'] = influences['influencer_url'].str.rsplit('-mn', n=1).str[-1]
influences['follower_id'] = influences['follower_url'].str.rsplit('-mn', n=1).str[-1]

In [18]:
# Keep a single row per (influencer_id, follower_id) pair; when a pair occurs
# more than once, the first occurrence is the one retained.
influences = influences.drop_duplicates(
    subset=['influencer_id', 'follower_id'],
    keep='first',
)

In [21]:
# Persist the cleaned influence table; index=False leaves the row index out of
# the file.
influences.to_csv(
    path_or_buf='data/allmusic/influences_cleaned.csv',
    index=False,
)

## Artist Information

In [23]:
# Read the raw artist table. The file is parsed without a header row, so the
# five column names are supplied explicitly.
artists = pd.read_csv(
    'data/allmusic/artists.txt',
    header=None,
    names=['name', 'url', 'active_period', 'genres', 'styles'],
)
# Discard rows that are exact duplicates across every column.
artists = artists.drop_duplicates()

In [24]:
# Preview the first five artist rows as a quick sanity check.
artists.head(n=5)

Unnamed: 0,name,url,active_period,genres,styles
0,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...,1970s - 2010s,Pop/Rock|Electronic,Alternative/Indie Rock|Experimental Rock|Alter...
1,Brigitte Fontaine,https://www.allmusic.com/artist/brigitte-fonta...,1970s - 2000s,Pop/Rock|International,Experimental|French Pop|French|Western Europea...
2,Kate Bush,https://www.allmusic.com/artist/kate-bush-mn00...,1970s - 2010s,Pop/Rock,Art Rock|Alternative/Indie Rock|College Rock|A...
3,Nina Hagen,https://www.allmusic.com/artist/nina-hagen-mn0...,1970s - 2010s,Pop/Rock,Alternative Pop/Rock|Alternative/Indie Rock|Ne...
4,Zeena Parkins,https://www.allmusic.com/artist/zeena-parkins-...,1980s - 2010s,Avant-Garde|Jazz,Free Improvisation|Modern Composition


In [25]:
# Derive an id for each artist from its URL: the id is the text that follows
# the last '-mn' in the URL (a URL without '-mn' is kept unchanged).
# The vectorized .str accessor is used instead of apply(lambda ...) so that a
# missing URL yields a missing id rather than raising AttributeError on a float
# NaN.
artists['id'] = artists['url'].str.rsplit('-mn', n=1).str[-1]

In [28]:
# Replace missing genre lists with the placeholder "Unknown" so that every row
# holds a string before it is split.
artists['genres'] = artists['genres'].fillna('Unknown')
# Genres are '|'-separated; the first entry is taken as the artist's main genre.
artists['main_genre'] = artists['genres'].str.split('|').str[0]

In [29]:
# Keep a single row per artist id; when an id occurs more than once, the first
# occurrence is the one retained.
artists = artists.drop_duplicates(
    subset=['id'],
    keep='first',
)

In [31]:
# Persist the cleaned artist table; index=False leaves the row index out of the
# file.
artists.to_csv(
    path_or_buf='data/allmusic/artists_cleaned.csv',
    index=False,
)