In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [2]:
def simple_get(url, timeout=10):
    """
    Fetch `url` with an HTTP GET request and return the raw response body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float, tuple or None, optional
        Forwarded to `requests.get`. Without a timeout a stalled server
        would block the notebook cell indefinitely; pass ``None`` to get
        that unbounded wait back. Defaults to 10 seconds.

    Returns
    -------
    bytes or None
        `resp.content` (the undecoded body) when `is_good_response`
        accepts the response, otherwise ``None``. ``None`` is also
        returned when the request raises a `RequestException` (including
        timeouts); the error is reported through `log_error`.
    """
    try:
        # stream=True defers downloading the body until `.content` is read,
        # so a rejected response is closed without fetching its payload.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [3]:
def is_good_response(resp):
    """
    Return True if the response looks like a successful HTML page.

    A response qualifies when its status code is 200 and its
    Content-Type header contains 'html' (compared case-insensitively).

    Parameters
    ----------
    resp : object
        Response-like object exposing `status_code` and a mapping-like
        `headers` attribute.

    Returns
    -------
    bool
        False when the Content-Type header is missing, the status is not
        200, or the content type does not mention 'html'.
    """
    # Look the header up with .get(): a response without a Content-Type
    # header must be rejected, not crash the caller with a KeyError.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()

In [4]:
def log_error(e):
    """
    Report an error by printing it to standard output.

    This is the single hook through which errors are reported; swap the
    body for real logging if printing is not enough.
    """
    message = str(e)
    print(message)

In [38]:
# Site root; the relative hrefs scraped from the pages are appended to it.
base_url = 'https://www.fifa.com'
# Path, relative to base_url, of the World Cup archive index page.
archive_ext = '/fifa-tournaments/archive/worldcup/index.html'

In [39]:
# Download the archive index page. simple_get returns None when the request
# fails or the response is not HTML, so raw_html may be None here.
raw_html = simple_get(base_url + archive_ext)

In [40]:
# Parse the downloaded archive page with the standard-library HTML parser.
soup = BeautifulSoup(raw_html, features='html.parser')

In [41]:
# Collect every <li> element carrying the 'comp-item' class.
# (The original author noted select('li.comp-item') as an alternative.)
world_cups = soup.find_all('li', class_='comp-item')

In [42]:
# href of the first <a> inside the first 'comp-item' entry.
# Raises IndexError if the page yielded no entries or the entry has no link.
wc_ext = world_cups[0].select('a')[0]['href']

In [44]:
# Display the extracted relative URL as the cell output.
wc_ext

'/worldcup/archive/brazil2014/index.html'

In [45]:
# Hard-coded absolute URLs of the match-list pages for the 2010 and 2014
# tournaments (index 0: South Africa 2010, index 1: Brazil 2014).
match_urls = ['https://www.fifa.com/worldcup/archive/southafrica2010/matches/index.html',
              'https://www.fifa.com/worldcup/archive/brazil2014/matches/index.html']

In [46]:
# Download the South Africa 2010 match list; this overwrites the earlier
# raw_html. simple_get returns None on failure or for non-HTML responses.
raw_html = simple_get(match_urls[0])

In [47]:
# Parse the match-list page; this rebinds `soup` to the new document.
soup = BeautifulSoup(raw_html, features='html.parser')

In [54]:
# Collect every <div> element carrying the 'match-list-date' class.
matches = soup.find_all('div', class_='match-list-date')

In [56]:
# Print the href of the first <a> found inside each collected <div>.
# Raises IndexError on a <div> that contains no link.
for match in matches:
    print(match.select('a')[0]['href'])

/worldcup/matches/round=249722/match=300061454/index.html#nosticky
/worldcup/matches/round=249722/match=300061459/index.html#nosticky
/worldcup/matches/round=249722/match=300061465/index.html#nosticky
/worldcup/matches/round=249722/match=300061478/index.html#nosticky
/worldcup/matches/round=249722/match=300061483/index.html#nosticky
/worldcup/matches/round=249722/match=300061495/index.html#nosticky
/worldcup/matches/round=249722/match=300061458/index.html#nosticky
/worldcup/matches/round=249722/match=300061470/index.html#nosticky
/worldcup/matches/round=249722/match=300111117/index.html#nosticky
/worldcup/matches/round=249722/match=300061481/index.html#nosticky
/worldcup/matches/round=249722/match=300061487/index.html#nosticky
/worldcup/matches/round=249722/match=300061450/index.html#nosticky
/worldcup/matches/round=249722/match=300061462/index.html#nosticky
/worldcup/matches/round=249722/match=300061480/index.html#nosticky
/worldcup/matches/round=249722/match=300111111/index.html#nost

In [58]:
# Relative URL of the first link inside the first collected <div>.
match_ext = matches[0].select('a')[0]['href']

In [60]:
# Download that match's page; match_ext is relative, so prefix base_url.
# raw_html is overwritten again and may be None if the fetch failed.
raw_html = simple_get(base_url + match_ext)

In [61]:
# Parse the individual match page; this rebinds `soup` once more.
soup = BeautifulSoup(raw_html, features='html.parser')