In [1]:
# Resident Advisor Extractor

This notebook aims to extract all the events listed on Resident Advisor and to build DataFrames from them, such as: Club Location, Events Per Club, Club Attendance...

## Retrieving the data

In [2]:
import numpy as np
import pandas as pd
import requests
import bs4

In [3]:
baseURL = "https://www.residentadvisor.net/"
clubListURL = "https://www.residentadvisor.net/clubs.aspx"
# A timeout keeps the cell from hanging forever if the site does not answer.
content = requests.get(clubListURL, timeout=30).text

soup = bs4.BeautifulSoup(content, "html5lib")

# Collect one record per club and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
clubRecords = []

# Club entries are <li> elements nested inside another <li>; only those that
# carry a 'class' attribute and have exactly two children are parsed.
for tdTag in soup.findAll('li'):
    for clubs in tdTag.findAll('li'):
        if 'class' in clubs.attrs and len(clubs) == 2:
            # Defaults used when one of the two children is missing a field
            clubID = "null"
            address = "null"
            name = "null"
            for line in clubs:
                if line.a is None:
                    # Child without a link: its text is the address
                    address = line.text
                else:
                    # Child with a link: its text is the club name and the
                    # club ID is the value after '=' in the link target
                    name = line.text
                    clubID = line.a['href'].split("=")[1]
            clubRecords.append({'ClubID': clubID, 'Name': name, 'Address': address})

clubDF = pd.DataFrame(clubRecords, columns=["ClubID", "Name", "Address"])
clubDF.head(220)

Unnamed: 0,ClubID,Name,Address
0,55816,2. Akt Restaurant & Bar,Selnaustrasse 2
1,84182,25 Hours Hotel Zürich,"Pfingstweidstrasse 102, 8005 Zürich"
2,24800,2B Lounge,"Nüschelerstrasse 31, 8001 Zürich"
3,85693,3monkeys,"Alexander-Schönistrasse 17, 2502, Biel/Bienne"
4,82433,4. Akt,"Heinrichstrasse 262, 8005 Zürich"
5,34422,5ème Etage,"Mühlenplatz 11, Bern"
6,31409,Abraxas,"Chemin du Stand 5; Pully, 1009"
7,39607,Acanto,"Pfingstweidstrasse 6, 8005 Zürich"
8,102059,Acapella Bar,"Place de la Gare, 1957 Ardon, Valais, Suisse"
9,18481,Acqua Lounge Basilea,"Binningerstrasse 14; 4051, Basel"


Once we have the list of clubs, we need to get all the events for each club. For this, we create a parser that takes the ClubID as an argument and retrieves the date and line-up of every event.

In [4]:
def getEventsFrom(clubIndex):
    '''Gets the dataframe of events for a club.

    Downloads the club page, collects the event IDs linked from its
    <section> elements and fetches each event with getSerieFromEvent.

    Parameters
    ----------
    clubIndex : int or str
        Club ID, appended to the club.aspx URL and converted with int()
        for the ClubID column.

    Returns
    -------
    pandas.DataFrame
        Columns ['ClubID','EventID','EventName','Date','LineUp'], one row per
        event that could be fetched. An empty frame with the same columns is
        returned when the club page cannot be downloaded.
    '''
    columns = ['ClubID','EventID','EventName','Date','LineUp']
    ClubURL = "https://www.residentadvisor.net/club.aspx?id="+str(clubIndex)+""

    try:
        # Timeout so that one unresponsive request cannot stall a long crawl
        content = requests.get(ClubURL, timeout=30).text
    except requests.RequestException:
        # Only network-level failures are swallowed; programming errors and
        # KeyboardInterrupt still propagate.
        print("Error getting : "+ClubURL)
        return pd.DataFrame(columns = columns)

    soup = bs4.BeautifulSoup(content, "html5lib")

    eventIDList = list() #List of eventsID to be analysed

    #Parsing page to get eventIDs
    for sections in soup.findAll('section'):
        for links in sections.findAll('a'):
            # Event links are the anchors with exactly two attributes, one of
            # them being 'itemprop'
            if(len(links.attrs)==2 and 'itemprop' in links.attrs):
                link = links.get('href', '')
                if '?' not in link:
                    # No query string, hence no event ID to extract
                    continue
                eventIDList.append(link.split('?')[1]) #Add eventID to list

    # Gather the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called in a loop.
    rows = []
    for eid in eventIDList:
        series = getSerieFromEvent(clubIndex,eid)
        if series is not None:  # None means the event page could not be fetched
            rows.append(series.to_dict())

    DFEvents = pd.DataFrame(rows, columns = columns)
    DFEvents.ClubID = DFEvents.ClubID.astype(int)
    return DFEvents


In [5]:
def getSerieFromEvent(clubInd,eventID):
    '''Returns a Series [ClubID,EventID,EventName,Date,LineUp] for one event.

    Parameters
    ----------
    clubInd : int or str
        Club ID the event belongs to; stored as int(clubInd).
    eventID : int or str
        Value appended after '?' in the event.aspx URL; stored unchanged.

    Returns
    -------
    pandas.Series or None
        Series indexed by ['ClubID','EventID','EventName','Date','LineUp'],
        or None when the event page cannot be downloaded. EventName comes
        from the og:title meta tag, Date from the og:description meta tag
        (empty strings when the tag is absent or has an unexpected format),
        and LineUp is a list of artist names without duplicates, in order of
        first appearance.
    '''
    EventURL = 'https://www.residentadvisor.net/event.aspx?'+str(eventID)

    try:
        # Timeout so that one unresponsive request cannot stall a long crawl
        content = requests.get(EventURL, timeout=30).text
    except requests.RequestException:
        # Only network-level failures are swallowed; other errors propagate.
        print("Error getting : "+EventURL)
        return None

    soup = bs4.BeautifulSoup(content, "html5lib")

    date = ''
    title = ''
    lineup = []
    seen = set()  # names already added, for an order-preserving de-duplication

    #Get LineUp
    for ul in soup.findAll('ul'):
        for div in ul.findAll('div'):
            if 'Line' not in div.text:
                continue
            for p in div.findAll('p'):
                if 'class' not in p.attrs or p['class'] is None:
                    continue
                # Names are separated by commas and/or line breaks
                for chunk in p.text.split(','):
                    for artist in chunk.split("\n"):
                        # Strip surrounding whitespace so that " Name" and
                        # "Name" are the same artist, and drop empty pieces
                        artist = artist.strip()
                        if artist and artist not in seen:
                            seen.add(artist)
                            lineup.append(artist)

    #Get title and Date of the event
    for meta in soup.findAll('meta'):
        prop = meta.attrs.get('property')
        if prop == 'og:title':
            title = meta.get('content', '')
        elif prop == 'og:description':
            # The date is the text between the first comma and the next '-'
            val = meta.get('content', '').split(",")
            if len(val) > 1:  # leave the date empty when there is no comma
                date = val[1].split("-")[0]

    data = [int(clubInd), eventID, title, date, lineup]
    S = pd.Series(data,['ClubID','EventID','EventName','Date','LineUp'])

    return S

In [6]:
# Try the events parser on a single club
ClubID = 55816
eventID = 292693  # not used in this cell
getEventsFrom(ClubID)

Unnamed: 0,ClubID,EventID,EventName,Date,LineUp
0,55816,707003,Deeptown Music Showcase,19 September 2015,"[Affani, Mark Faermont]"
1,55816,753528,Mucho Stylez,12 September 2015,[Mucho Stylez]
2,55816,730155,Mucho Stylez,03 July 2015,[Mucho Stylez]
3,55816,713648,Zweiter Akt,13 June 2015,[Mucho Stylez]
4,55816,696650,Deeptown Night,21 March 2015,"[Mark Faermont, Sonny Dima]"
5,55816,671332,Deeptown Night,17 January 2015,"[Carlos Russo, Mark Faermont]"
6,55816,648424,Tonka UND 3333 Tage Saxer,29 November 2014,"[ Mark Faermont, Danny Coleman, Tonka]"
7,55816,642266,Deeptown Night,18 October 2014,"[Carlos Russo, David Eye]"
8,55816,619558,Deeptown Parade,02 August 2014,"[ Niall Redmond, Danny Coleman, Mark Faermon..."
9,55816,557604,Deeptown Music Night,18 January 2014,"[Mark Faermont, DJ Le Baron]"


## Creating CSV files for all dataframes of clubs

In [7]:
# Position in clubDF to start from (lets an interrupted crawl be resumed)
start = 0
total = len(clubDF.values)-start
i = 0

for d in clubDF.values[start:]:
    # Each row of clubDF is [ClubID, Name, Address]
    #Get Event Dataframe for this index
    index = d[0]
    df = getEventsFrom(index)
    #Add Columns Name and Address
    df['ClubName'] = d[1]
    df['Address'] = d[2]
    #Reformat Dataframe
    df = df[['ClubID','ClubName','Address','EventID','EventName','Date','LineUp']]
    # Build "<ClubID>-<Name>" with '/' and '*' replaced by spaces in the name
    file_name = d[1]
    file_name = file_name.replace(u"/"," ").replace(u"*"," ")
    # str() so that the concatenation also works when the ID is not a string
    file_name = str(index)+"-"+file_name

    #Save as CSV, spreading the files over the folders ClubData0 .. ClubData3
    try:
        df.to_csv('ClubData'+str(i%4)+'/'+file_name+".csv",sep ='\t', encoding='utf-8')
    except Exception as err:
        # The handler used to reference the undefined name 'intdex', so any
        # write failure ended in a NameError; the cause is now reported too.
        print("ERROR AT INDEX"+str(i)+", index "+str(index)+", file_name = "+str(file_name)+", error = "+str(err))
    #console output
    i = i+1
    print(str(i)+"/"+str(total))

1/1314
2/1314
3/1314
4/1314
5/1314
6/1314
7/1314
8/1314
9/1314
10/1314
11/1314
12/1314
13/1314
14/1314
15/1314
16/1314
17/1314
18/1314
19/1314
20/1314
21/1314
22/1314
23/1314
24/1314
25/1314
26/1314
27/1314
28/1314
29/1314
30/1314
31/1314
32/1314
33/1314
34/1314
35/1314
36/1314
37/1314
38/1314
39/1314
40/1314
41/1314
42/1314
43/1314
44/1314
45/1314
46/1314
47/1314
48/1314
49/1314
50/1314
51/1314
52/1314
53/1314
54/1314
55/1314
56/1314
57/1314
58/1314
59/1314
60/1314
61/1314
62/1314
63/1314
64/1314
65/1314
66/1314
67/1314
68/1314
69/1314
70/1314
71/1314
72/1314
73/1314
74/1314
75/1314
76/1314
77/1314
78/1314
79/1314
80/1314
81/1314
82/1314
83/1314
84/1314
85/1314
86/1314
87/1314
88/1314
89/1314
90/1314
91/1314
92/1314
93/1314
94/1314
95/1314
96/1314
97/1314
98/1314
99/1314
100/1314
101/1314
102/1314
103/1314
104/1314
105/1314
106/1314
107/1314
108/1314
109/1314
110/1314
111/1314
112/1314
113/1314
114/1314
115/1314
116/1314
117/1314
118/1314
119/1314
120/1314
121/1314
122/1314
123/1314
1

ConnectionError: HTTPSConnectionPool(host='www.residentadvisor.net', port=443): Max retries exceeded with url: /event.aspx?286485 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x000001EA58D90DA0>: Failed to establish a new connection: [WinError 10060] Une tentative de connexion a échoué car le parti connecté n’a pas répondu convenablement au-delà d’une certaine durée ou une connexion établie a échoué car l’hôte de connexion n’a pas répondu',))

In [None]:
# Testing load: read back the CSV named after the current value of file_name
# and show its Address column.
# NOTE(review): the export loop writes into 'ClubData<i%4>/' folders whereas
# this cell reads from 'ClubData/' - confirm which location is intended.
lo = pd.read_csv("".join(['ClubData/', file_name, ".csv"]), sep='\t', index_col=0)
lo['Address']

In [None]:
# Show the values of the club stored at position 430 of clubDF
print(clubDF.iloc[430].values)