In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import os
from tqdm import tqdm_notebook

# Create Cleaned Version of AllMusic Data

## Influence Relationships

In [2]:
# Read the raw influence pairs. The file carries no header row, so the column
# names are supplied here; rows that are exact duplicates are discarded.
influence_columns = ['influencer_name', 'influencer_url', 'follower_name', 'follower_url']
influences = pd.read_csv(
    'data/allmusic/influences.txt',
    header=None,
    names=influence_columns,
)
influences = influences.drop_duplicates()

In [3]:
# Preview the first five influence rows
influences.head(n=5)

Unnamed: 0,influencer_name,influencer_url,follower_name,follower_url
0,Brigitte Fontaine,https://www.allmusic.com/artist/brigitte-fonta...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
1,Kate Bush,https://www.allmusic.com/artist/kate-bush-mn00...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
2,Nina Hagen,https://www.allmusic.com/artist/nina-hagen-mn0...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
3,Zeena Parkins,https://www.allmusic.com/artist/zeena-parkins-...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...
4,David Bowie,https://www.allmusic.com/artist/david-bowie-mn...,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...


In [4]:
# Create columns for influencer id and follower id: the id is the text after the
# last '-mn' in each URL. The vectorised .str accessor leaves a missing URL as a
# missing id instead of raising AttributeError, which a per-row x.split(...) does
# when it meets NaN.
influences['influencer_id'] = influences['influencer_url'].str.split('-mn').str[-1]
influences['follower_id'] = influences['follower_url'].str.split('-mn').str[-1]

In [5]:
# Keep only the first row seen for each (influencer_id, follower_id) pair
repeated_pair = influences.duplicated(subset=['influencer_id', 'follower_id'])
influences = influences[~repeated_pair]

In [7]:
# Persist the cleaned influence table, leaving the row index out of the file
influences.to_csv(path_or_buf='data/allmusic/influences_cleaned.csv', index=False)

## Artist Information

In [8]:
# Read the raw artist table. The file carries no header row, so the column
# names are supplied here; rows that are exact duplicates are discarded.
artist_columns = ['name', 'url', 'active_period', 'genres', 'styles']
artists = pd.read_csv(
    'data/allmusic/artists.txt',
    header=None,
    names=artist_columns,
)
artists = artists.drop_duplicates()

In [9]:
# Preview the first five artist rows
artists.head(n=5)

Unnamed: 0,name,url,active_period,genres,styles
0,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...,1970s - 2010s,Pop/Rock|Electronic,Alternative/Indie Rock|Experimental Rock|Alter...
1,Brigitte Fontaine,https://www.allmusic.com/artist/brigitte-fonta...,1970s - 2000s,Pop/Rock|International,Experimental|French Pop|French|Western Europea...
2,Kate Bush,https://www.allmusic.com/artist/kate-bush-mn00...,1970s - 2010s,Pop/Rock,Art Rock|Alternative/Indie Rock|College Rock|A...
3,Nina Hagen,https://www.allmusic.com/artist/nina-hagen-mn0...,1970s - 2010s,Pop/Rock,Alternative Pop/Rock|Alternative/Indie Rock|Ne...
4,Zeena Parkins,https://www.allmusic.com/artist/zeena-parkins-...,1980s - 2010s,Avant-Garde|Jazz,Free Improvisation|Modern Composition


In [10]:
# Create a column for the unique artist id: the text after the last '-mn' in the URL.
# The vectorised .str accessor leaves a missing URL as a missing id instead of
# raising AttributeError, which a per-row x.split(...) does when it meets NaN.
artists['id'] = artists['url'].str.split('-mn').str[-1]

In [11]:
# Artists without any genre get the placeholder "Unknown"
genres_filled = artists['genres'].fillna('Unknown')
artists['genres'] = genres_filled
# The main genre is whatever precedes the first '|' in the genres string
# (the whole string when there is no '|')
artists['main_genre'] = genres_filled.map(lambda genre_list: genre_list.partition('|')[0])

In [12]:
# Keep only the first row seen for each artist id
repeated_id = artists.duplicated(subset=['id'])
artists = artists[~repeated_id]

In [2]:
# Reload the artist table from the cleaned CSV.
# NOTE(review): the only visible write of this file happens in a later cell, so on a
# fresh top-to-bottom run this read relies on a copy left by an earlier session -- confirm.
cleaned_artists_csv = 'data/allmusic/artists_cleaned.csv'
artists = pd.read_csv(cleaned_artists_csv)

In [23]:
# Share of artists with a non-null active period, reported as a fraction (not x100).
# A single-argument print(...) call is valid on both Python 2 and Python 3, whereas
# the bare print statement is a SyntaxError on Python 3.
active_fraction = artists['active_period'].notnull().sum() / float(len(artists))
print("Percent artists with active period: {}".format(active_fraction))

Percent artists with active period: 0.910261015326


In [24]:
# Extract start of active period
def _active_start(period):
    """Return the text before the first ' - ' with 's' stripped from both ends.

    For example '1970s - 2010s' gives '1970'. Values that are not exactly of
    type str (such as NaN for a missing period) give None.
    """
    if type(period) != str:
        return None
    first_decade = period.split(' - ')[0]
    return first_decade.strip('s')

artists['active_start'] = artists['active_period'].apply(_active_start)

In [27]:
# Persist the cleaned artist table, leaving the row index out of the file
artists.to_csv(path_or_buf='data/allmusic/artists_cleaned.csv', index=False)

# Clean Song Year Data

In [4]:
# Reload the cleaned artist table, keeping only rows that have both a name and an id
artists = pd.read_csv('data/allmusic/artists_cleaned.csv').dropna(subset=['name', 'id'])
# Lookup from artist id to artist name (if an id repeats, the later row wins)
id_to_name = dict(zip(artists['id'], artists['name']))
# Read the per-song table that carries the year column
songs = pd.read_csv('data/artist_song_list_years.csv')

In [15]:
# Songs with no recorded year get 0 as a placeholder
year_filled = songs['year'].fillna(value=0)
songs['year'] = year_filled

In [38]:
AUDIO_DIR = '/Volumes/thesis/audio/'

names_no_ext = []
track_numbers = []

# Get lists of full song name with no file extension and track numbers.
# Each entry of AUDIO_DIR is read as an artist id; only entries whose id appears in
# id_to_name are listed for song files.
# NOTE(review): os.listdir returns entries in arbitrary order, and these lists are
# later assigned to `songs` by position -- confirm that the row order of `songs`
# matches the listing order.
for artist_dir in tqdm_notebook(os.listdir(AUDIO_DIR)):
    try:
        artist_id = int(artist_dir)
    except ValueError:
        # Skip stray non-numeric entries (e.g. macOS '.DS_Store') instead of crashing
        continue
    # Testing membership on the dict itself is a hash lookup; `.keys()` builds a
    # list on Python 2, which makes every check a linear scan.
    if artist_id in id_to_name:
        for song_name in os.listdir(os.path.join(AUDIO_DIR, artist_dir)):
            # Text before the first '.mp3' is the name without extension
            names_no_ext.append(song_name.split('.mp3')[0])
            # Text before the first '_' is taken as the track number (kept as a string)
            track_numbers.append(song_name.split('_')[0])

A Jupyter Widget




In [43]:
# Attach the collected file names (without extension) and track numbers to the song
# table. Values are matched to rows purely by position, so each list must have
# exactly one entry per row of `songs`.
for column, values in (('name_no_ext', names_no_ext), ('track_number', track_numbers)):
    songs[column] = values

In [46]:
# Persist the cleaned song table, leaving the row index out of the file
songs.to_csv(path_or_buf='data/artist_song_list_years_cleaned.csv', index=False)