In [779]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import io
import re
from datetime import date, timedelta
from dateutil.parser import parse
import numpy as np

In [136]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [784]:
URL = 'https://zkm.de/en/exhibitions-events/current-exhibitions'
example_title = 'Writing the History of the Future'

In [766]:
URL = 'https://www.microsoft.com/de-de/techwiese/events/default.aspx?Veranstalter=Community'
example_title = 'Bau dir einen Serverless EventStore'

In [801]:
URL = 'https://mlh.io/seasons/eu-2019/events'
example_title = 'HackSheffield'

In [802]:
raw_html = simple_get(URL)
soup = BeautifulSoup(raw_html, 'html.parser')

In [803]:
for string in [example_title, example_title.upper(), example_title.lower(), 
               example_title.title(), example_title.capitalize()]:
    
    try:
        example = soup.find(text=string)
    except:
        example = None
    if example:
        break
        
if example:

    ex = example
    for i in range(5):
        try:
            example_class = ex['class']
            break
        except:
            ex = ex.parent
    title_pos = np.where(example_title == np.array(ex.text.split('\n')))[0][0]
else:
    example_class = None

In [804]:
example_class, title_pos

(['inner'], 3)

In [805]:
matches = soup.find_all(attrs={'class': example_class})
titles = []
for match in matches:
    m = match.text.split('\n')[title_pos]
    titles.append(m)

In [807]:
title = titles[0]
titles

['JUNCTIONxBudapest',
 'Hack.Moscow',
 'HackSheffield',
 'HackTheMidlands',
 'jacobsHack! 2018',
 'AstonHack',
 'HackSussex',
 'ExImpact',
 'DurHack',
 'Porticode 3.0',
 'Junction 2018',
 'HackNotts',
 'OxfordHack',
 'Local Hack Day',
 'Hack Cambridge 4D    ',
 'ManMetHacks',
 'Reboot Hack  ',
 'Royal Hackaway v2',
 'HackSurrey Mk2  ',
 'Hack The South',
 'R.U. Hacking?',
 'Hack The Burgh',
 '{FMI Codes} Code for Automation',
 'HackBordeaux',
 '//Slash Hackathon',
 'Hack Kosice',
 'HackMed 2019  ',
 'AUHack',
 'Hack Winterthur',
 'CopenHacks',
 'HackDelft',
 'UniHack',
 'DragonHack',
 'Hackaburg']

# Tests

In [808]:
def regex_date(string):
    
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    months_short = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
    months_de = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
    months_de_short = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez']
    
    months_patterns = ['[0-9]{1,2}[\.]?[\s]?' + '(' + '|'.join(months) + ')' + '.' + '(20[12][0-9])?', 
                       '[0-9]{1,2}[\.]?[\s]?' + '(' + '|'.join(months_de) + ')' + '.' + '(20[12][0-9])?',
                       '[0-9]{1,2}[\.]?[\s]?' + '(' + '|'.join(months_short) + ')' + '.' + '(20[12][0-9])?',
                       '(' + '|'.join(months) + ')' + '[\s]?[0-9]{1,2}(th|st|nd|rd)?[\s]?(20[12][0-9])?',
                       '(' + '|'.join(months_short) + ')' + '[\s]?[0-9]{1,2}(th|st|nd|rd)?[\s]?(20[12][0-9])?'
                      ]

    for pattern in months_patterns:
        if re.search(pattern, string):
            return re.search(pattern, string).group()
        
    re_patterns = ['[0-9]{1,2}\\.[0-9]{1,2}\\.20[1,2][0-9]',
                      '[0-9]{1,2}\\.[0-9]{1,2}\\.[1,2][0-9]',
                  '[0-9]{1,2}\\.[0-9]{1,2}.?']

    for pattern in re_patterns:
        if re.search(pattern, string):
            return re.search(pattern, string).group()
    return None

In [809]:
def find_date(title):
    div = soup.find(text=title)
    
    div = div.parent

    for i in range(10):
        match = regex_date(div.text)
        if match:
            break
        else:
            div = div.parent
            
    example_date = regex_date(div.text)
    if not example_date:
        example_date = None
        
    elif len(example_date) == 1:
        example_date = example_date[0]

    elif len(example_date)==2:
        date_1 = date(*[int(x) for x in reversed(example_date[0].split('.'))])
        date_2 = date(*[int(x) for x in reversed(example_date[1].split('.'))]) 

        if(date_2-date_1 > timedelta(0)):
            example_date = ' - '.join(example_date)
            
    return example_date

In [810]:
for title in titles:
    print("{}: \t{}".format(title, find_date(title)))
    print('\n')

JUNCTIONxBudapest: 	Oct 19th 


Hack.Moscow: 	Oct 26th 


HackSheffield: 	Oct 27th 


HackTheMidlands: 	Nov 3rd 


jacobsHack! 2018: 	18
Nov 


AstonHack: 	Nov 10th 


HackSussex: 	Nov 10th 


ExImpact: 	Nov 16th 


DurHack: 	Nov 17th 


Porticode 3.0: 	3.0


Junction 2018: 	18
Nov 


HackNotts: 	Nov 24th 


OxfordHack: 	Nov 24th 


Local Hack Day: 	Dec 1st



Hack Cambridge 4D    : 	Jan 19th 


ManMetHacks: 	Jan 26th 


Reboot Hack  : 	Feb 2nd 


Royal Hackaway v2: 	2
Feb 


HackSurrey Mk2  : 	Feb 9th 


Hack The South: 	Feb 9th 


R.U. Hacking?: 	Feb 16th 


Hack The Burgh: 	Mar 16th 


{FMI Codes} Code for Automation: 	Mar 22nd 


HackBordeaux: 	Mar 23rd 


//Slash Hackathon: 	Mar 29th 


Hack Kosice: 	Mar 30th 


HackMed 2019  : 	Mar 30th 


AUHack: 	Apr 5th 


Hack Winterthur: 	Apr 5th 


CopenHacks: 	Apr 6th 


HackDelft: 	May 11th 


UniHack: 	May 17th 


DragonHack: 	May 18th 


Hackaburg: 	May 24th 




In [676]:
test_cases = ['10.19.2019', '10. Mai 2019', '10. May 2019', '10. Mai, 2020', 
              '10 Jan', '10. Jan', '10. Januar', '12.10', '12.10.', 
             'May 14th', 'January 2nd 2019', 'Jan 3rd 2020']

for test_case in test_cases:
    print("{} -> {}".format(test_case, find_date(test_case)))

10.19.2019


AttributeError: 'NoneType' object has no attribute 'parent'

In [675]:
for test_case in test_cases:
    
#     for en, de in zip(months, months_de):
#         if de in test_case:
#             test_case = re.sub(de, en, test_case)
            
    
    try:
        print("{} -> {}".format(test_case, parse(test_case)))
    except:
        print("{} -> None".format(test_case))

10.19.2019 -> 2019-10-19 00:00:00
10. Mai 2019 -> None
10. May 2019 -> 2019-05-10 00:00:00
10. Mai, 2020 -> None
10 Jan -> 2019-01-10 00:00:00
10. Jan -> 2019-01-10 00:00:00
10. Januar -> None
12.10 -> 2019-05-12 00:00:00
12.10. -> None
May 14th -> 2019-05-14 00:00:00
January 2nd 2019 -> 2019-01-02 00:00:00
Jan 3rd 2020 -> 2020-01-03 00:00:00


In [605]:
div = soup.find(text=title)
div = div.parent

for i in range(5):
    match = find_date(div.text)
    if match:
        break
    else:
        div = div.parent


'26.10.2019'