In [127]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import io
import re
from datetime import date, timedelta

In [136]:
def simple_get(url, timeout=10):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    If the response passes `is_good_response` (status 200 and an HTML-like
    Content-Type), return the raw response body as bytes, otherwise
    return None.

    Parameters:
        url: address to fetch.
        timeout: value forwarded to `requests.get(..., timeout=...)` so a
            server that never answers cannot block the caller forever.
            Pass None to wait indefinitely.

    Returns:
        The response body (`resp.content`, bytes) or None when the response
        is not acceptable or the request failed.

    Any `RequestException` (connection errors, timeouts, ...) is reported
    through `log_error` and turned into a None return value.
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # the body is never read because the response was rejected.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as good when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A response
    without a Content-Type header is treated as not HTML instead of
    raising KeyError.
    """
    # .get() with a default keeps a missing header from raising; the empty
    # string simply fails the 'html' check below.
    content_type = resp.headers.get('Content-Type', '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error by printing it to standard output.

    Kept as a separate function so the reporting mechanism can be
    swapped out in one place.
    """
    message = e
    print(message)

In [137]:
# Page addresses fetched further down in the notebook.
URL = 'https://zkm.de/en/exhibitions-events/current-exhibitions'
# Adjacent string literals are concatenated; the query string is split per parameter.
URL2 = (
    'https://events.microsoft.com/'
    '?timeperiod=next30Days'
    '&isSharedInLocalViewMode=false'
    '&country=Germany'
    '&city=Karlsruhe,%20Baden-W%C3%BCrttemberg,%20Germany'
)

In [170]:
# Dotted day.month.year dates such as 23.02.2019 or 1.2.19.
# The year is an optional literal "20" followed by two digits, the first being 1 or 2.
# (The earlier character classes `[20]*` and `[1,2]` also accepted arbitrary runs of
# 0/2 and a literal comma, e.g. "1.2.,9".)
re_pattern = r'[0-9]{1,2}\.[0-9]{1,2}\.(?:20)?[12][0-9]'

In [139]:
# Download and parse the first page.
# simple_get returns None on failure, in which case BeautifulSoup will raise here.
raw_html = simple_get(URL)
soup = BeautifulSoup(raw_html, 'html.parser')

# Download and parse the second page. soup2 must be built from raw_html2;
# it was previously parsed from raw_html, making it a copy of the first page.
raw_html2 = simple_get(URL2)
soup2 = BeautifulSoup(raw_html2, 'html.parser')

In [140]:
# Use one known title as a probe: find its text node in the parsed page and
# read the CSS class list of the element two levels above it. That class list
# is then used to find all other titles.
example_title = 'Writing the History of the Future'
example = soup.find(text=example_title)
example_class = example.parent.parent['class']

example_class

['teaser-grid__headline', 'bs-teaser-grid__headline']

In [156]:
# Collect every element that carries the same class list as the example headline.
matches = soup.find_all(attrs={'class': example_class})
# A title is the element's text with all newline characters removed.
titles = [headline.text.replace('\n', '') for headline in matches]

Open Codes
Negative Space
Writing the History of the Future
 Dieter Jung
zkm_gameplay. the next level
Open Codes. Living in Digital Worlds
Games and Politics
Games and Politics
Seasons of Media Arts
SCHLOSSLICHTSPIELE 2019
Edge of Now
SCHLOSSLICHTSPIELE 2019
The Whole World a Bauhaus


In [157]:
# Inspect the element three levels above the title text node.
example.parent.parent.parent

<div class="card__text">
<div class="teaser-grid__headline bs-teaser-grid__headline">
<a href="/en/exhibition/2019/02/writing-the-history-of-the-future">Writing the History of the Future</a>
</div>
<div class="teaser-grid__content editor-content bs-teaser-grid__content">
                Sat, 23.02.2019 – Sun, 28.03.2021<br/><a href="/en/location/atrium-89" title="Location: Atrium 8+9">Atrium 8+9</a><br/>Cost: Free admission              </div>
</div>

In [158]:
# Start from the element that directly contains the current `example` node.
# NOTE: this rebinds `example`, so re-running the cell starts from a different element.
example = example.parent

# Climb towards the root, at most five steps, until the element's text
# contains something matching re_pattern.
for i in range(5):
    match = re.findall(re_pattern, example.text)
    if match:
        break
    else:
        example = example.parent
# Show the loop index at which the search stopped and the element reached.
i, example

(2, <div class="card__text">
 <div class="teaser-grid__headline bs-teaser-grid__headline">
 <a href="/en/exhibition/2019/02/writing-the-history-of-the-future">Writing the History of the Future</a>
 </div>
 <div class="teaser-grid__content editor-content bs-teaser-grid__content">
                 Sat, 23.02.2019 – Sun, 28.03.2021<br/><a href="/en/location/atrium-89" title="Location: Atrium 8+9">Atrium 8+9</a><br/>Cost: Free admission              </div>
 </div>)

In [163]:
def _parse_dotted_date(text):
    """Turn a 'day.month.year' string into a `date`; raises ValueError if invalid."""
    day, month, year = (int(part) for part in text.split('.'))
    return date(year, month, day)


def find_date(title, page=None, pattern=None, max_levels=5):
    """
    Look up the date, or date range, displayed next to `title`.

    The title's text node is located in the parsed page, then its ancestors
    are climbed (at most `max_levels` steps) until one contains text matching
    the date pattern. The dates are read from the first matching text node
    inside that ancestor.

    Parameters:
        title: exact text of the title to look for.
        page: parsed document to search; defaults to the notebook-level `soup`.
        pattern: date regular expression; defaults to the notebook-level
            `re_pattern`.
        max_levels: maximum number of ancestor steps to climb.

    Returns:
        - a single date string when exactly one date is found;
        - 'start - end' when two dates are found and the second is later
          than the first;
        - the raw list of matched strings in every other case (two dates that
          are not in ascending order or cannot be parsed as calendar dates,
          or more than two dates);
        - None when the title or a date cannot be found.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # required when the caller does not supply its own page/pattern.
    page = soup if page is None else page
    pattern = re_pattern if pattern is None else pattern

    title_node = page.find(text=title)
    if title_node is None:
        return None

    # Climb until an ancestor's text contains a date, or the root is passed.
    container = title_node.parent
    for _ in range(max_levels):
        if container is None or re.findall(pattern, container.text):
            break
        container = container.parent
    if container is None:
        return None

    date_text = container.find(text=re.compile(pattern))
    if date_text is None:
        return None
    found = re.findall(pattern, date_text)

    if len(found) == 1:
        return found[0]

    if len(found) == 2:
        try:
            start, end = (_parse_dotted_date(item) for item in found)
        except ValueError:
            # Matched the pattern but is not a real calendar date; hand back
            # the raw strings rather than failing.
            return found
        if end > start:
            return ' - '.join(found)

    return found

In [172]:
# One line per title: the title, a colon, a space and a tab, then its date(s).
for title in titles:
    print(f"{title}: \t{find_date(title)}")

Open Codes: 	20.10.2017 - 02.06.2019
Negative Space: 	06.04.2019 - 11.08.2019
Writing the History of the Future: 	23.02.2019 - 28.03.2021
 Dieter Jung: 	09.02.2019 - 02.06.2019
zkm_gameplay. the next level: 	29.09.2018 - 31.12.2021
Open Codes. Living in Digital Worlds: 	13.03.2019 - 09.08.2019
Games and Politics: 	05.04.2019 - 18.05.2019
Games and Politics: 	05.04.2019 - 18.05.2019
Seasons of Media Arts: 	08.08.2019 - 15.09.2019
SCHLOSSLICHTSPIELE 2019: 	08.08.2019 - 15.09.2019
Edge of Now: 	29.06.2019 - 27.10.2019
SCHLOSSLICHTSPIELE 2019: 	08.08.2019 - 15.09.2019
The Whole World a Bauhaus: 	26.10.2019 - 16.02.2020


In [169]:
# Same listing as the previous loop: title, separator, then the date(s) found for it.
for title in titles:
    print("{name}: \t{when}".format(name=title, when=find_date(title)))

Open Codes: 	20.10.2017 - 02.06.2019
Negative Space: 	06.04.2019 - 11.08.2019
Writing the History of the Future: 	23.02.2019 - 28.03.2021
 Dieter Jung: 	09.02.2019 - 02.06.2019
zkm_gameplay. the next level: 	29.09.2018 - 31.12.2021
Open Codes. Living in Digital Worlds: 	13.03.2019 - 09.08.2019
Games and Politics: 	05.04.2019 - 18.05.2019
Games and Politics: 	05.04.2019 - 18.05.2019
Seasons of Media Arts: 	08.08.2019 - 15.09.2019
SCHLOSSLICHTSPIELE 2019: 	08.08.2019 - 15.09.2019
Edge of Now: 	29.06.2019 - 27.10.2019
SCHLOSSLICHTSPIELE 2019: 	08.08.2019 - 15.09.2019
The Whole World a Bauhaus: 	26.10.2019 - 16.02.2020


In [96]:
# Inside the element currently held in `example` (set by an earlier cell),
# take the first text node matching re_pattern, then extract every date
# substring from it. `example_date` ends up as a list of strings.
example_date = example.find(text = re.compile(re_pattern))
example_date = re.findall(re_pattern, example_date)

In [159]:
# A single match is unwrapped from the list into a plain string.
if len(example_date) == 1:
    example_date = example_date[0]

# Two matches are treated as a start/end pair: each 'day.month.year' string is
# split on '.', reversed to (year, month, day) and turned into a date.
# NOTE: this rebinds `example_date`, so the cell is not idempotent.
if len(example_date)==2:
    date_1 = date(*[int(x) for x in reversed(example_date[0].split('.'))])
    date_2 = date(*[int(x) for x in reversed(example_date[1].split('.'))]) 
    
    # Only join into 'start - end' when the second date is strictly later.
    if(date_2-date_1 > timedelta(0)):
        example_date = ' - '.join(example_date)

In [160]:
# Display the formatted result.
example_date

'23.02.2019 - 28.03.2021'