# Resident Advisor Extractor

This class aim to extract the dataframe of all events in Resident Advisor and create DataFrame such as : Club Location, Event Per Clubs, Club Attendance...

## Retrieving the data

In [2]:
import numpy as np
import pandas as pd
import requests
import bs4

In [13]:
baseURL = "https://www.residentadvisor.net/"
clubListURL = "https://www.residentadvisor.net/clubs.aspx"
content = requests.get(clubListURL).text

soup = bs4.BeautifulSoup(content, "html5lib")

clubDF = pd.DataFrame(columns = ["ClubID","Name","Address"])

for tdTag in soup.findAll('li'):
    for clubs in tdTag.findAll('li'):
        if('class' in clubs.attrs and len(clubs)==2):
            #Parsing infos
            clubID = "null"
            address = "null"
            name = "null"
            for line in clubs:
                if(line.a == None):
                    address = line.text
                else:
                    name = line.text
                    clubID = line.a['href'].split("=")[1]
            #print(str(clubID)+",,"+str(adress)+",,"+str(name))
            df = pd.Series([clubID,name,address],['ClubID','Name','Address'])
            clubDF = clubDF.append(df, ignore_index=True)
            #print(df) 
clubDF.head(250)

Unnamed: 0,ClubID,Name,Address
0,55816,2. Akt Restaurant & Bar,Selnaustrasse 2
1,84182,25 Hours Hotel Zürich,"Pfingstweidstrasse 102, 8005 Zürich"
2,24800,2B Lounge,"Nüschelerstrasse 31, 8001 Zürich"
3,85693,3monkeys,"Alexander-Schönistrasse 17, 2502, Biel/Bienne"
4,82433,4. Akt,"Heinrichstrasse 262, 8005 Zürich"
5,34422,5ème Etage,"Mühlenplatz 11, Bern"
6,31409,Abraxas,"Chemin du Stand 5; Pully, 1009"
7,39607,Acanto,"Pfingstweidstrasse 6, 8005 Zürich"
8,102059,Acapella Bar,"Place de la Gare, 1957 Ardon, Valais, Suisse"
9,18481,Acqua Lounge Basilea,"Binningerstrasse 14; 4051, Basel"


Once we have the list of clubs, we need to get all the events for each club. For this, we create a parser that take the ClubID as argument and retrieve all events dates and line-up.

In [4]:
def getEventsFrom(clubIndex):
    '''Gets dataframe of events for a club'''
    ClubURL = "https://www.residentadvisor.net/club.aspx?id="+str(clubIndex)+""
    content = ""
    DFEvents = pd.DataFrame(columns = ['ClubID','EventID','EventName','Date','LineUp'])
    
    try:
        content = requests.get(ClubURL).text
    except:
        print("Error getting : "+ClubURL)
        return DFEvents
        
    soup = bs4.BeautifulSoup(content, "html5lib")
    
    eventIDList = list() #List of eventsID to be analysed
    
    #Parsing page to get eventIDs
    for sections in soup.findAll('section'):
        for links in sections.findAll('a'):
            if(len(links.attrs)==2 and 'itemprop' in links.attrs):
                link = links['href']
                eventID = link.split('?')[1]
                eventIDList.append(eventID) #Add eventID to list
    
    
    for eid in eventIDList:
        series = getSerieFromEvent(clubIndex,eid)
        DFEvents = DFEvents.append(series,ignore_index=True)
    
    DFEvents.ClubID = DFEvents.ClubID.astype(int)
    return DFEvents


In [5]:
def getSerieFromEvent(clubInd,eventID):
    '''Returns a Serie [EventID,EventName,Date,LineUp] for the eventID passed as argument'''
    EventURL = 'https://www.residentadvisor.net/event.aspx?'+str(eventID)
    content = ""
    
    try:
        content = requests.get(EventURL).text
    except:
        print("Error getting : "+EventURL)
        return None
    
    soup = bs4.BeautifulSoup(content, "html5lib")
    
    day = ''
    date = ''
    lineup = set()
    title = ''
    
    #Get LineUp
    for ul in soup.findAll('ul'):
        for div in ul.findAll('div'):
            if('Line' in div.text):
                for p in div.findAll('p'):
                    if('class' in p.attrs and p['class']!=None):
                        artists = p.text.split(',')
                        for a in artists:
                            splitted = a.split("\n")
                            for arts in splitted:
                                lineup.add(arts)
            
            #Deprecated#
            '''#Get Event date
            if('class' in div.attrs and div['class']!= ['clearfix'] and len(div.findAll('li'))>0):
                    for line in div.findAll('li'):
                        if('Date' in line.text):
                            for a in line:
                                if('-' in a): #Get time
                                     hr = a
                                if('y' in a): #Get weekday
                                    day = a
                                if(len(a)==1 and " /" not in a.text): #Get Date
                                    date = a
            '''
    
    #Get title and Date of the event
    for meta in soup.findAll('meta'):
        if('property' in meta.attrs ):
            if(meta['property']=='og:title'):
                title = meta['content']
            if(meta['property']=='og:description'):
                datestr = meta['content']
                val = datestr.split(",")
                date = val[1].split("-")[0]
                day = val[0]
    
    lineup = list(lineup)
    data =[int(clubInd), eventID, title,date,lineup]
    S = pd.Series(data,['ClubID','EventID','EventName','Date','LineUp'])

    return S

In [14]:
ClubID = int(8907)
eventID = 292693
getEventsFrom(ClubID)

Unnamed: 0,ClubID,EventID,EventName,Date,LineUp
0,8907,876775,Carre Blanc x Adana Twins (Friso),10 September 2016,"[Deep White, Adana Twins (Friso x Exploited) ]"
1,8907,800014,Mininova with Marc Houle *Live*,27 February 2016,"[Marc Houle, M-United, Sebastian Bosco]"
2,8907,749638,Fritz Kalkbrenner,07 November 2015,[Fritz Kalkbrenner]
3,8907,754179,Mininova with Andrea Roma,26 September 2015,"[Andrea Roma , David Kiesel, Never Die ]"
4,8907,757817,Die Brücke & Friends,24 September 2015,"[, [LINE UP], JR – (Le Jour Me Nuit /CH), Dris..."
5,8907,691669,Mininova with Matador for Play with Me Tour an...,13 May 2015,"[Dj Never Die (CH), Matador (IE), Sebastian Bo..."
6,8907,698294,Mininova with Traumer,18 April 2015,"[Sebastian Bosco, Several Definitions (Knee De..."
7,8907,691664,Mininova with Avrosse B2B Louie Cut,28 March 2015,"[Dj Never Die, Avrosse B2B Louie Cut, Sebastia..."
8,8907,681167,Mininova with Patrik Soderbom,07 February 2015,"[David Kiesel, Patrik Soderbom (SWE), Sebastia..."
9,8907,647003,La Résidance with DJ W!ld: When You Feel Me Re...,18 October 2014,"[ Jokari (Kartel Klub - CH), DJ W!ld (Circoloc..."


## Creating CSV files for all dataframes of clubs

In [None]:
start = 0
total = len(clubDF.values)-start
i = 0

for d in clubDF.values[start:]:
    #Get Event Dataframe for this index
    index = d[0]
    df = getEventsFrom(index)
    #Add Columns Name and Adress
    df['ClubName'] = d[1]
    df['Address'] = d[2]
    #Reformat Dataframe
    df = df[['clubID','place','address','eventID','event','date','artists']]
    file_name = d[1]
    file_name = file_name.replace(u"/"," ").replace(u"*"," ")
    file_name = d[0]+"-"+file_name
    
    #Save as CSV
    try:
        df.to_csv('ClubData'+str(i%4)+'/'+file_name+".csv",sep ='\t', encoding='utf-8')
    except:
        print("ERROR AT INDEX"+str(i)+", index "+str(intdex)+", file_name = "+str(file_name))
    #console output
    i = i+1
    print(str(i)+"/"+str(total))

1/1314
2/1314
3/1314
4/1314
5/1314
6/1314
7/1314
8/1314
9/1314
10/1314
11/1314
12/1314
13/1314
14/1314
15/1314
16/1314


In [None]:
#Testing load
lo = pd.read_csv('ClubData/'+file_name+".csv", sep='\t', index_col=0)
lo['Address']

In [None]:
print(clubDF.values[430])