## events_ch_handler

In [1]:
import urllib.parse
import urllib.request
from datetime import datetime
from html.parser import HTMLParser

import unidecode

In [2]:
# Pieces of the events.ch search URL; they are joined with '/' further down.
base_url = "https://events.ch"    # scheme + host, no trailing slash
language = "en"                   # language segment of the path
action = "search"                 # action segment of the path
search_type = "concerts"          # kind of event searched for
date = "2015-07-23"               # searched day, formatted YYYY-MM-DD
end_search = "6/cs"               # fixed path suffix placed before the page number
page_number = 1                   # result page requested first

In [3]:
# Assemble the search URL by joining every path segment with '/'.
url_segments = [
    base_url,
    language,
    action,
    search_type,
    date,
    end_search,
    str(page_number),
]
querry_url = '/'.join(url_segments)
print(querry_url)

https://events.ch/en/search/concerts/2015-07-23/6/cs/1


In [4]:
# Download the first search-result page. The `with` block closes the HTTP
# response as soon as the body has been read instead of leaving it open.
with urllib.request.urlopen(querry_url) as response:
    page = response.read().decode('utf-8')

In [5]:
class MainHTMLParser(HTMLParser):
    """Parse a search-result page and collect the events it links to.

    Every ``<a>`` tag carrying exactly three attributes is treated as a link
    to an event page.  The linked page is downloaded, parsed with
    ``EventHTMLParser``, and the result is stored as a
    ``(location, festival, "YYYY-MM-DD", artists)`` tuple in ``self.data``.
    Location and festival are taken from the 5th and 6th '/'-separated
    segments of the link.

    Relies on the module-level ``base_url`` and on ``EventHTMLParser``.
    """

    # Layout of the event date once the trailing " (zone name)" is removed.
    # The zone name is dropped on purpose: strptime's %Z only recognises
    # UTC, GMT and the local machine's zone names, so keeping it would make
    # parsing depend on where the notebook runs.  The numeric offset (%z)
    # already carries the information.
    _DATE_FORMAT = "%a %b %d %Y %H:%M:%S GMT%z"

    def __init__(self):
        super().__init__()
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Follow an event link and record the event it points to."""
        if tag != 'a' or len(attrs) != 3:
            return
        # Look the link target up by name instead of by position so the
        # order of the attributes in the markup does not matter.
        href = dict(attrs).get('href')
        if not href:
            return
        segments = href.split('/')
        # Location and festival live in segments 4 and 5; skip shorter links
        # before spending a network request on them.
        if len(segments) < 6:
            return
        artists, date = self._fetch_event(href)
        # Nothing to record without artists; no day to record without a date.
        if not artists or not date:
            return
        currentDate = datetime.strptime(date.partition(' (')[0], self._DATE_FORMAT)
        location = segments[4]
        festival = segments[5]
        self.data.append((location, festival, currentDate.strftime("%Y-%m-%d"), artists))

    def _fetch_event(self, href):
        """Download the event page behind ``href`` and return ``(artists, date)``."""
        subquerry_url = base_url + urllib.parse.quote(href)
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(subquerry_url) as response:
            subpage = response.read().decode('utf-8')
        subparser = EventHTMLParser()
        subparser.feed(subpage)
        return subparser.getData()

    def getData(self):
        """Return the list of event tuples collected so far."""
        return self.data
    
class EventHTMLParser(HTMLParser):
    """Extract the artist list and the date from a single event page.

    The artists are read from the text of an ``<h2>`` tag that has exactly
    two attributes, the second of which has the value ``"event-subtitle"``.
    The date is the value of the first ``<time>`` tag whose first attribute
    is ``datetime``.
    """

    def __init__(self):
        super().__init__()
        self.artist_string = ""    # raw text of the subtitle heading
        self.date = ""             # raw ``datetime`` attribute value, "" until found
        self.grab_artists = False  # True while inside the subtitle heading
        self.artists = []          # filled by cleanArtists()

    def handle_starttag(self, tag, attrs):
        """Start capturing on the subtitle heading; remember the first date."""
        if tag == 'h2' and len(attrs) == 2 and attrs[1][1] == "event-subtitle":
            self.grab_artists = True
        # `attrs` is empty for a bare <time> tag, so check it before indexing.
        if tag == 'time' and attrs and attrs[0][0] == 'datetime' and self.date == "":
            self.date = attrs[0][1]

    def handle_endtag(self, tag):
        """Stop capturing text at any closing tag."""
        self.grab_artists = False

    def handle_data(self, data):
        """Keep the text seen while inside the subtitle heading."""
        if self.grab_artists:
            self.artist_string = data

    def cleanArtists(self):
        """Rebuild ``self.artists`` from the captured subtitle text.

        The text is split on commas; only entries containing '(' are kept,
        cut off just before the first '('.  Surrounding whitespace is left
        as it is.  The list is rebuilt from scratch on every call so that
        calling ``getData()`` repeatedly does not duplicate entries.
        """
        self.artists = [
            entry.split('(')[0]
            for entry in self.artist_string.split(',')
            if '(' in entry
        ]

    def getData(self):
        """Return ``(artists, date)`` for the page fed so far."""
        self.cleanArtists()
        return (self.artists, self.date)

In [6]:
# Parse the downloaded search page; every event link found on it is followed
# and turned into a (location, festival, date, artists) tuple.
parser = MainHTMLParser()
parser.feed(page)
first_page_events = parser.getData()
print(first_page_events)

[('Verbier', 'Verbier-Festival', '2015-07-23', ['Yekwon Sunwoo ']), ('Nyon', 'Paléo-Festival', '2015-07-23', ['Husbands ']), ('Basel', 'Basel-Tattoo', '2015-07-23', ['Ailsa Craig Highland Dancers ', ' Blue Devils International Corps ', ' Celtic Stars Irish Dancers ', ' Fanfara 8° Reggimento Bersaglieri ', ' Imps Motorcycle Display Team ']), ('Nyon', 'Paléo-Festival', '2015-07-23', ['Pascals ', ' The Paradise Bangkok Molam International Band ', ' A Moving Sound ']), ('Nyon', 'Paléo-Festival', '2015-07-23', ['Biga Ranx ', ' Chinese Man ', ' Anthony B ', ' Yaniss Odua ']), ('null', 'Colaris', '2015-07-23', ['Colaris '])]


In [10]:
def getEventsForDates(eventType, startDate, maxPage=5):
    """Collect the events.ch search results for one day.

    Result pages are downloaded one after another, at most ``maxPage`` of
    them, and parsed with ``MainHTMLParser``.  Collection stops at the first
    event dated after ``startDate``.

    Args:
        eventType: Search category placed in the URL, e.g. ``"concerts"``.
        startDate: Searched day, formatted ``YYYY-MM-DD``.
        maxPage: Maximum number of result pages to download.

    Returns:
        A list of ``(location, festival, date, artists)`` tuples in the order
        they were found, where ``date`` is the event's own ``YYYY-MM-DD``
        string.

    Raises:
        ValueError: If ``startDate`` is not a valid ``YYYY-MM-DD`` date.
    """
    base_url = "https://events.ch/en/search"
    end_search = "6/cs"
    day_format = "%Y-%m-%d"
    # Compare real dates: comparing year, month and day independently as
    # strings wrongly treats e.g. 2015-08-01 as not later than 2015-07-23.
    start_day = datetime.strptime(startDate, day_format).date()
    data = []
    for page_number in range(1, maxPage + 1):
        querry_url = '/'.join([base_url, eventType, startDate, end_search, str(page_number)])
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(querry_url) as response:
            page = response.read().decode('utf-8')
        parser = MainHTMLParser()
        parser.feed(page)
        for (location, festival, date_string, artists) in parser.getData():
            # presumably results are listed chronologically, so the first
            # later event ends the search - TODO confirm against the site
            if datetime.strptime(date_string, day_format).date() > start_day:
                return data
            # Store the event's own date, not a global variable named `date`.
            data.append((location, festival, date_string, artists))
        print("page_number:" + str(page_number))
    return data
    
    

In [11]:
# Collect the concerts listed for the configured day (default page limit).
data = getEventsForDates(eventType="concerts", startDate=date)

page_number:1
page_number:2
page_number:3


In [12]:
import numpy as np
import pandas as pd

In [14]:
# One row per collected event; the column order matches the tuple layout.
event_columns = ['location', 'event', 'date', 'artists']
df = pd.DataFrame(data, columns=event_columns)

In [15]:
# Display the collected events as a table (one row per event).
df

Unnamed: 0,location,event,date,artists
0,Verbier,Verbier-Festival,2015-07-23,[Yekwon Sunwoo ]
1,Nyon,Paléo-Festival,2015-07-23,[Husbands ]
2,Basel,Basel-Tattoo,2015-07-23,"[Ailsa Craig Highland Dancers , Blue Devils I..."
3,Nyon,Paléo-Festival,2015-07-23,"[Pascals , The Paradise Bangkok Molam Interna..."
4,Nyon,Paléo-Festival,2015-07-23,"[Biga Ranx , Chinese Man , Anthony B , Yani..."
5,,Colaris,2015-07-23,[Colaris ]
6,Nyon,Paléo-Festival,2015-07-23,"[Acid Arab , Faada Freddy , Mina Tindle ]"
7,Zürich,Salsamania,2015-07-23,[DJs Luis Salgado ]
8,Nyon,Paléo-Festival,2015-07-23,"[Johnny Hallyday , Ben Harper & The Innocent ..."
9,Verbier,Verbier-Festival,2015-07-23,"[ Denis Matsuev , Daniil Trifonov , conducto..."
