In [1]:
import requests
import pandas as pd
import ast
from wikipedia import WikipediaPage
import bs4
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import networkx as nx
plt.style.use('ggplot') 
plt.ioff()
# %matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
def find_band_page(band):
    '''
    Use the Wikipedia API to find the proper band page. Run through variants before
    trying the raw band name in case the band is named after something more important
    that has its own page (e.g. Interpol)
    '''
    try:
        try:
            band_page = WikipediaPage(title=band+ ' (band)')
        except:
            try:
                band_page = WikipediaPage(title='The '+band+' (band)')
            except:
                try:
                    band_page = WikipediaPage(title='The '+band)
                except:
                    try:
                        band_page = WikipediaPage(title=band+ ' (musician)')
                    except:
                        band_page = WikipediaPage(title=band)
        return band_page.title.replace(' ','_')
    except:
        return 'band not found'

def find_members(band,main_members=[]):
    '''
    Scrape the wikipedia page to find the Members section (this isn't in the API).
    Include a list of the main band members so we don't just link back to main band.
    '''
    url = 'https://en.wikipedia.org/wiki/'+band
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    try:
        clean_members = []
        members_parent = soup.find('th', string='Members').parent
        members = members_parent.findAll('a')
        for member in members:
            if member.get('title') and member.get_text() not in main_members:
                clean_members.append(member)
        # also pick up any past members:
        try:
            members_parent = soup.find('th', string='Past members').parent
            members = members_parent.findAll('a')
            for member in members:
                if member.get('title') and member.get_text() not in main_members:
                    clean_members.append(member)
        except:
            pass
        return clean_members
    except:
        # if the band no longer exists, just check for a 'Past members' section:
        try:
            clean_members = []
            members_parent = soup.find('th', string='Past members').parent
            members = members_parent.findAll('a')
            for member in members:
                if member.get('title') and member.get_text() not in main_members:
                    clean_members.append(member)
            return clean_members
        except:
            # if we can't find members we might be looking at a solo artist's page,
            # in that case just return the page itself as the sole member.
            # check that there's an 'Associated acts' section first though.
            try:
                page_name = band.replace('_',' ')
                page_tag = bs4.element.Tag(name='a', attrs={'href':'/wiki/'+band,'title':page_name})
                page_tag.string = page_name
                return [page_tag]
            except:
                return ''
        
def get_members(band):
    '''
    This is for the main band, where we're not necessarily passing in the correct page name.
    '''
    band_page = find_band_page(band)
    members = find_members(band_page)
    if members:
        return members

In [3]:
def find_acts(member_link, band_page, member_list, main_members=[]):
    url = 'https://en.wikipedia.org'+member_link
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    try:
        acts_parent = soup.find('th', string='Associated acts').parent
        acts = acts_parent.findAll('a')
        clean_acts = []
        for act in acts:
            if band_page.lower() not in act.get('href').lower() and \
                act.get_text() not in member_list and \
                act.get_text() not in main_members and \
                '[' not in act.get_text():
                    clean_acts.append(act)
        return clean_acts
    except:
        return ''

In [4]:
def return_bands(band_page, main_members=[]):
    bands = {}
    members = find_members(band_page, main_members)
    if members:
        member_list = [x.get_text() for x in members]
        for member in members:
            if isinstance(member,bs4.element.Tag):
                bands[member]=find_acts(member.get('href'), band_page, member_list, main_members)
        return bands
    else:
        return ''

In [5]:
def get_all_bands(band):
    all_bands = {}
    band_page = find_band_page(band)
    band_name = band_page.replace('_',' ').replace(' (band)','').replace(' (musician)','')
    all_bands[band_name] = return_bands(band_page)
    main_members = [x.get_text() for x in get_members(band_page)]
    for sub_band in [y for x in all_bands[band_name].values() for y in x]:
        sub_band_page = sub_band.get('href').replace('/wiki/','')
        all_bands[sub_band] = return_bands(sub_band_page, main_members)
    return all_bands, band_name

### Building the graph:

Nodes are:
- the main dictionary keys (bands linked to main band members)
- values of sub dictionaries apart from main band (bands linked to members of linked bands)

Edges are:
- keys of all sub dictionaries (members of main band and members of linked bands)

Values of main band sub dictionary aren't needed as they're replicated as keys in the main dictionary.

### Additional:

I want to see if we can pull the band members from the end nodes and cross reference against the other bands to get more edges.

Need to:
- get the end bands from the values of the sub dictionaries
- get members for these bands that aren't in the next band up (i.e. the band key)
- check these members against the keys of the sub dictionaries and create edges wherever there's a match

Maybe need to rethink how the dictionary and sub dictionary keys are stored; at the moment they're just the text of the band name or member, but it might be better to preserve all of the html element info so we can be sure we're matching properly.

In [6]:
def get_nodes_edges(main_band, band_dict):
    '''
    Note - main_band needs to be the /wiki/ extension, will work that in later when I link everything.
    '''
    
    # get the main nodes:
    main_nodes = [x.get_text() if isinstance(x, bs4.element.Tag) else x for x in band_dict.keys()]
    
    # get the sub nodes:
    sub_nodes = []
    for key, value in band_dict.items():
        if isinstance(key, bs4.element.Tag):
            if isinstance(value,dict) and key.get('href').replace('/wiki/','')!=main_band:
                for sub_value in value.values():
                    for band in sub_value:
                        sub_nodes.append(band.get_text())
     
    # remove duplicates from sub_nodes
    sub_nodes = list(set(sub_nodes))

    # get the main edges:
    edges = []
    for key, value in band_dict.items():
        if isinstance(value,dict):
            for sub_key, sub_value in value.items():
                for band in sub_value:
                    if isinstance(key, bs4.element.Tag):
                        edges.append((key.get_text(), band.get_text(), sub_key.get_text()))
                    else:
                        edges.append((key, band.get_text(), sub_key.get_text()))
    
    # get the edges between end bands and other nodes:
    sub_node_pages = {}
    for key, value in band_dict.items():
        if isinstance(key, bs4.element.Tag):
            if isinstance(value,dict) and key.get('href').replace('/wiki/','')!=main_band:
                for sub_value in value.values():
                    sub_bands=[]
                    for band in sub_value:
                        sub_bands.append(band)
                    sub_node_pages[key]=sub_bands

    sub_band_members = {}
    for key, value in sub_node_pages.items():
        main_members = find_members(key.get('href').replace('/wiki',''))
        for band in value:
            sub_band_members[band]=find_members(band.get('href').replace('/wiki/',''),main_members)

    end_band_edges = []
    for key, value in sub_band_members.items():
        for member in value:
            for key1, value1 in sub_band_members.items():
                if key!=key1 and member.get('href') in [x.get('href') for x in value1]:
                    edge_var1 = (key.get_text(), key1.get_text(), member.get_text())
                    edge_var2 = (key1.get_text(), key.get_text(), member.get_text())
                    if edge_var1 not in edges and edge_var2 not in edges and edge_var1 not in end_band_edges and edge_var2 not in end_band_edges:
                        end_band_edges.append(edge_var1)
                    
    return (main_nodes, sub_nodes, edges, end_band_edges)
    
def build_graph(band_dict, main_nodes, sub_nodes, edges, end_band_edges):
    
    G = nx.Graph()
    
    G.add_nodes_from(main_nodes)
    G.add_nodes_from(sub_nodes)
    
    for band1, band2, member in edges:
        G.add_edge(band1, band2, name=member)
        
    for band1, band2, member in end_band_edges:
        G.add_edge(band1, band2, name=member)
        
    return G

In [19]:
def draw_graph(g, band_name, nodes1):
    plt.ioff()
    labels = {a:b for a,b in zip(g.nodes,[x.replace(' ','\n') for x in g.nodes])}
    color_list = pd.Series(g.nodes).map(lambda x: '#708090' if x==band_name else '#7a977a' if x in nodes1 else '#707070')
    node_sizes = pd.Series(g.nodes).map(lambda x: 4000 if x==band_name else 2000 if x in nodes1 else 700)
    fig_width = int(2.5*len(g.nodes)**0.5)
    if fig_width < 10: fig_width=10
    fig, ax = plt.subplots(figsize=(fig_width,fig_width))

    nx.draw(g,
            labels=labels,
            node_size=node_sizes,
            node_shape='o',
            node_color=color_list,
            width=1,
            font_color='white',
            font_weight='bold',
            edge_color='white',
            ax=ax)
    
    ax.set_title(band_name,color='white',fontsize=30,loc='left')

    fig.set_facecolor("#00000F")
    
    plt.savefig('graph_output/'+band_name.lower().replace(' ','-')+'.png',
                facecolor='#00000F',
                bbox_inches=None,
                pad_inches=0,)
    
def main(band):
    try:
        band_dict, band_name = get_all_bands(band)
        band_page = find_band_page(band)
        nodes1, nodes2, edges, end_band_edges = get_nodes_edges(band_page, band_dict)
        band_graph = build_graph(band_dict, nodes1, nodes2, edges, end_band_edges)
        draw_graph(band_graph, band_name, nodes1)
        betweenness = nx.betweenness_centrality(band_graph)
    #         bands = pd.DataFrame.from_dict(betweenness, orient='index').reset_index()
    #         bands.columns = ['band','betweenness']
    #         most_central_bands = list(bands.sort_values('betweenness',ascending=False)[bands.betweenness>0].iloc[:10].band)
        return (band_name,
                band_graph.number_of_nodes(),
                band_graph.number_of_edges(),
                nx.average_clustering(band_graph),
                nx.betweenness_centrality(band_graph),
                nodes1)
    except:
        return band, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'

In [21]:
for band in ['Shellac','Fugazi']:
    main(band)

### Getting a band list:

In [13]:
nme_list = '''100. Deerhunter
99. The Cure
98. Yeah Yeah Yeahs
97. Iceage
96. Country Teasers
95. Dirty Projectors
94. Richard Hawley
93. Black Lips
92. St Vincent
91. Foals
90. Flying Lotus
89. Simple Minds
88. Oneohtrix Point Never
87. Billy Bragg
86. The Triffids
85. Black Flag
84. Nine Inch Nails
83. The Jesus and Mary Chain
82. Massive Attack
81. Animal Collective
80. Dusty Springfield
79. Suicide
78. Stevie Wonder
77. Best Coast
76. Vampire Weekend
75. The Wedding Present
74. Slint
73. Wu-Tang Clan
72. Grimes
71. Rilo Kiley
70. Pharrell
69. Nick Lowe
68. The National
67. Jay Z
66. The Slits
65. Diplo
64. The Zombies
63. Talk Talk
62. The Stooges
61. DJ Shadow
60. DJ Rashad
59. Chic
58. Black Sabbath
57. James Blake
56. Happy Mondays
55. The Chills
54. Aphex Twin
53. The Fall
52. Nas
51. Television
50. Sufjan Stevens
49. Death From Above 1979
48. Bat For Lashes
47. The Cars
46. Wiley
45. T Rex
44. Bikini Kill
43. New Order
42. PJ Harvey
41. Led Zeppelin
40. Tame Impala
39. Brian Jonestown Massacre
38. Dr Dre
37. Kraftwerk
36. My Bloody Valentine
35. TV on the Radio
34. Depeche Mode
33. The Knife
32. Pavement
31. Bjork
30. Bon Iver
29. Bruce Springsteen
28. Beck
27. The Stone Roses
26. Fleetwood Mac
25. Nirvana
24. Queens of the Stone Age
23. Burial
22. Sonic Youth
21. Hole
20. Prince
19. Neutral Milk Hotel
18. Aaliyah
17. Blur
16. The Velvet Underground
15. Jonathan Richman & The Modern Lovers
14. The Clash
13. Joy Division
12. The Breeders
11. The Smiths
10. The xx
9. Nick Cave
8. Kate Bush
7. The Gun Club
6. The Flaming Lips
5. The Strokes
4. The White Stripes
3. Kanye West
2. David Bowie
1. Radiohead'''

In [14]:
band_list = [x.split('.')[1].strip() for x in nme_list.split('\n')]

Going to add some more that I want to see:

In [15]:
band_list += ['Fugazi',
              'Talking Heads',
              'Pixies',
              'Shellac',
              'Godspeed You! Black Emperor',
              'NOFX',
              'Rancid',
              'Boredoms',
              'Themselves',
              'N.W.A',
              'Elliott Smith']

In [16]:
band_list

['Deerhunter',
 'The Cure',
 'Yeah Yeah Yeahs',
 'Iceage',
 'Country Teasers',
 'Dirty Projectors',
 'Richard Hawley',
 'Black Lips',
 'St Vincent',
 'Foals',
 'Flying Lotus',
 'Simple Minds',
 'Oneohtrix Point Never',
 'Billy Bragg',
 'The Triffids',
 'Black Flag',
 'Nine Inch Nails',
 'The Jesus and Mary Chain',
 'Massive Attack',
 'Animal Collective',
 'Dusty Springfield',
 'Suicide',
 'Stevie Wonder',
 'Best Coast',
 'Vampire Weekend',
 'The Wedding Present',
 'Slint',
 'Wu-Tang Clan',
 'Grimes',
 'Rilo Kiley',
 'Pharrell',
 'Nick Lowe',
 'The National',
 'Jay Z',
 'The Slits',
 'Diplo',
 'The Zombies',
 'Talk Talk',
 'The Stooges',
 'DJ Shadow',
 'DJ Rashad',
 'Chic',
 'Black Sabbath',
 'James Blake',
 'Happy Mondays',
 'The Chills',
 'Aphex Twin',
 'The Fall',
 'Nas',
 'Television',
 'Sufjan Stevens',
 'Death From Above 1979',
 'Bat For Lashes',
 'The Cars',
 'Wiley',
 'T Rex',
 'Bikini Kill',
 'New Order',
 'PJ Harvey',
 'Led Zeppelin',
 'Tame Impala',
 'Brian Jonestown Massacre

In [17]:
band_stats = {'band_name':[],
              'node_count':[],
              'edge_count':[],
              'ave_clustering':[],
              'betweenness':[],
              'nodes1':[]}

for band in band_list:
    band_name, node_count, edge_count, ave_clustering, betweenness, nodes1 = main(band)
    band_stats['band_name'].append(band_name)
    band_stats['node_count'].append(node_count)
    band_stats['edge_count'].append(edge_count)
    band_stats['ave_clustering'].append(ave_clustering)
    band_stats['betweenness'].append(betweenness)
    band_stats['nodes1'].append(nodes1)




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [18]:
output = pd.DataFrame(band_stats)

In [19]:
output.head()

Unnamed: 0,band_name,node_count,edge_count,ave_clustering,betweenness,nodes1
0,Deerhunter,20.0,20.0,0.101786,"{'Deerhunter': 0.7777777777777777, 'Lotus Plaz...","[Deerhunter, Lotus Plaza, Black Lips, Stereola..."
1,The Cure,152.0,182.0,0.170763,"{'The Cure': 0.8324150110375275, 'Siouxsie and...","[The Cure, Siouxsie and the Banshees, the Glov..."
2,Yeah Yeah Yeahs,93.0,126.0,0.229509,"{'Yeah Yeah Yeahs': 0.6791686574295269, 'Nativ...","[Yeah Yeah Yeahs, Native Korean Rock, Swans, H..."
3,Iceage,,,,,
4,Country Teasers,,,,,


In [20]:
output[output.node_count=='N/A']

Unnamed: 0,band_name,node_count,edge_count,ave_clustering,betweenness,nodes1
3,Iceage,,,,,
4,Country Teasers,,,,,
7,Black Lips,,,,,
8,St Vincent,,,,,
23,Best Coast,,,,,
28,Grimes,,,,,
40,DJ Rashad,,,,,
41,Chic,,,,,
42,Black Sabbath,,,,,
43,James Blake,,,,,


In [24]:
get_all_bands('St. Vincent')

({'St. Vincent (musician)': {<a href="/wiki/St._Vincent_(musician)" title="St. Vincent (musician)">St. Vincent (musician)</a>: [<a href="/wiki/The_Polyphonic_Spree" title="The Polyphonic Spree">The Polyphonic Spree</a>,
    <a href="/wiki/Jack_Antonoff" title="Jack Antonoff">Jack Antonoff</a>,
    <a href="/wiki/Andrew_Bird" title="Andrew Bird">Andrew Bird</a>,
    <a href="/wiki/The_Black_Keys" title="The Black Keys">The Black Keys</a>,
    <a href="/wiki/Bon_Iver" title="Bon Iver">Bon Iver</a>,
    <a href="/wiki/David_Byrne" title="David Byrne">David Byrne</a>,
    <a href="/wiki/The_Mountain_Goats" title="The Mountain Goats">The Mountain Goats</a>,
    <a href="/wiki/Sufjan_Stevens" title="Sufjan Stevens">Sufjan Stevens</a>,
    <a href="/wiki/Swans_(band)" title="Swans (band)">Swans</a>,
    <a href="/wiki/Tuck_%26_Patti" title="Tuck &amp; Patti">Tuck &amp; Patti</a>,
    <a href="/wiki/Doveman" title="Doveman">Thomas Bartlett</a>]},
  <a href="/wiki/The_Polyphonic_Spree" title="T

In [25]:
band_stats2 = {'band_name':[],
               'node_count':[],
               'edge_count':[],
               'ave_clustering':[],
               'betweenness':[],
               'nodes1':[]}

for band in ['Chic','Beck','St. Vincent']:
    band_name, node_count, edge_count, ave_clustering, betweenness, nodes1 = main(band)
    band_stats2['band_name'].append(band_name)
    band_stats2['node_count'].append(node_count)
    band_stats2['edge_count'].append(edge_count)
    band_stats2['ave_clustering'].append(ave_clustering)
    band_stats2['betweenness'].append(betweenness)
    band_stats2['nodes1'].append(nodes1)
    

In [26]:
output2 = pd.DataFrame(band_stats2)

In [27]:
output2

Unnamed: 0,band_name,node_count,edge_count,ave_clustering,betweenness,nodes1
0,Chic,692,896,0.104079,"{'Chic': 0.777898752455725, 'Sister Sledge': 0...","[Chic, Sister Sledge, The Sugarhill Gang, Lady..."
1,Beck,62,78,0.213309,"{'Beck': 0.8530054644808743, 'Charlotte Gainsb...","[Beck, Charlotte Gainsbourg, The Flaming Lips,..."
2,St. Vincent (musician),107,123,0.115952,"{'St. Vincent (musician)': 0.653878406708595, ...","[St. Vincent (musician), The Polyphonic Spree,..."


In [29]:
final_output = pd.concat((output[output.node_count!='N/A'],output2)).reset_index(drop=True)

In [33]:
final_output = final_output[final_output.node_count>10]

In [34]:
final_output.to_csv('output.csv',index=False)

### Just noticed that where artists are listed as 'musician' in their wikipedia links this is pulling through to the band name field. Will re-run the graphs for those artists and remove it from the titles.

In [23]:
# output = pd.read_csv('output.csv', converters={4:ast.literal_eval,5:ast.literal_eval})
# don't actually want to open as literal_eval, need plain string so we can replace.
output = pd.read_csv('output.csv')

In [24]:
output[output.band_name.str.contains('\(')]

Unnamed: 0,band_name,node_count,edge_count,ave_clustering,betweenness,nodes1
37,Wiley (musician),51,111,0.510024,"{'Wiley (musician)': 0.031034661483641077, 'Ro...","['Wiley (musician)', 'Roll Deep', 'Boy Better ..."
89,St. Vincent (musician),107,123,0.115952,"{'St. Vincent (musician)': 0.653878406708595, ...","['St. Vincent (musician)', 'The Polyphonic Spr..."


In [25]:
def replace_musician(text):
    text = text.replace(' (musician)','')
    return text

In [26]:
output.band_name = output.band_name.map(lambda x: replace_musician(x))
output.betweenness = output.betweenness.map(lambda x: replace_musician(x))
output.nodes1 = output.nodes1.map(lambda x: replace_musician(x))

In [27]:
output[output.band_name.str.contains('\(')]

Unnamed: 0,band_name,node_count,edge_count,ave_clustering,betweenness,nodes1


In [32]:
output.to_csv('output.csv',index=False)