# Parse International matches
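This notebook is based on the code provided by https://github.com/sanand0/fifadata.
It scrapes World Cup and Olympic football match results from linguasport.com and cleans them into CSV files.
This code needs to be run on Python 2.7.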

In [1]:
# import packages
import os
import re
import hashlib
import requests
from lxml.html import parse
import pandas as pd

In [2]:
# define worker functions

# Shared HTTP session and the browser-like User-Agent header sent with requests.
session = requests.Session()
ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36'

# Directory in which downloaded pages are cached; create it on first run.
if not os.path.exists('.cache'):
    os.makedirs('.cache')

def get(url):
    '''Return the parsed lxml tree for url, downloading it only once.

    The page is stored under ``.cache/<md5 of url>.html``. If that file
    already exists it is parsed directly and no request is made.

    NOTE: whatever the server returns is cached, including error pages;
    delete the cache file to force a fresh download.
    '''
    # md5 needs bytes: leave byte strings alone, encode text strings.
    key = url if isinstance(url, bytes) else url.encode('utf-8')
    path = os.path.join('.cache', hashlib.md5(key).hexdigest() + '.html')
    if not os.path.exists(path):
        response = session.get(url, headers={'User-Agent': ua})
        # The payload is already encoded, so write it in binary mode.
        with open(path, 'wb') as fd:
            fd.write(response.text.encode('utf-8'))
    # Parse inside a with-block so the file handle is always closed.
    with open(path, 'rb') as fd:
        return parse(fd)

def process(page,comp):
    '''Scrape one result page and append its matches to the global ``result`` list.

    page -- 1-based page number; pages after the first are requested with ``?pn=<page>``.
    comp -- competition key, either 'WorldCup' or 'Olympic'.

    Returns the number of matches found on the page (0 when the page has no
    match rows). Raises ValueError for an unknown competition key.
    '''
    urls = {
        'WorldCup': 'http://www.linguasport.com/futbol/internacional/mundial/seekff.asp',
        'Olympic': 'http://www.linguasport.com/futbol/internacional/olimpiadas/seekff.asp',
    }
    # Fail early with a clear message instead of hitting an unbound ``url`` below.
    if comp not in urls:
        raise ValueError('unknown competition: %r' % (comp,))
    url = urls[comp]
    if page > 1:
        url += '?pn=%d' % page

    headers = 'edition,year,venue,round,team1,team2,score'.split(',')
    tree = get(url)
    count = 0
    # The first <tr> is skipped; only rows with exactly 7 cells are matches.
    for row in tree.findall('.//tr')[1:]:
        cells = [cell.text_content().strip() for cell in row.findall('.//td')]
        if len(cells) != 7:
            continue
        match = dict(zip(headers, cells))
        link = row.find('.//a')
        # Compare against None explicitly: a row without a link must not crash.
        match['url'] = link.get('href') if link is not None else None
        result.append(match)
        count += 1
    return count

def scrape(edn):
    '''Scrape every result page of competition ``edn`` into a DataFrame.

    Pages are processed from 1 upwards until one yields no matches. The
    DataFrame is built from the global ``result`` list, which this function
    does not reset, and gets a ``competition`` column holding ``edn``.
    '''
    page = 1
    # process() reports how many matches it found; zero means there are no more pages.
    while process(page, comp=edn) != 0:
        page += 1
    data = pd.DataFrame(result)
    data['competition'] = edn
    return data

In [3]:
# scrape for WorldCup results
# process() appends to the module-level `result` list and scrape() builds its
# DataFrame from that list, so start from an empty list for each competition.
result = []
data_wc = scrape('WorldCup')
data_wc.head()  # preview the first rows

Unnamed: 0,edition,round,score,team1,team2,url,venue,year,competition
0,1930-URUGUAY,GROUP_STAGE,4-1 (3-0),France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,WorldCup
1,1930-URUGUAY,GROUP_STAGE,3-0 (2-0),USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,WorldCup
2,1930-URUGUAY,GROUP_STAGE,2-1 (2-0),Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,WorldCup
3,1930-URUGUAY,GROUP_STAGE,3-1 (1-0),Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,WorldCup
4,1930-URUGUAY,GROUP_STAGE,1-0 (0-0),Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,WorldCup


In [4]:
# scrape for Olympic results
# Reset the module-level `result` list first; otherwise the matches collected
# by an earlier scrape() call would be included in this DataFrame as well.
result = []
data_oly = scrape('Olympic')
data_oly.head()  # preview the first rows

Unnamed: 0,edition,round,score,team1,team2,url,venue,year,competition
0,1908-LONDON,FIRST,(w.o.),Netherlands (Nederland),Hungary (Magyarország),1908_LONDON.htm,London.,1908,Olympic
1,1912-STOCKHOLM,1/2_FINAL,4-0 (2-0),Great Britain,Finland (Suomi),1912_STOCKHOLM_GD.htm#14-OT-12-III,Stockholm.,1912,Olympic
2,1912-STOCKHOLM,1/2_FINAL,1-4 (0-3),Netherlands (Nederland),Denmark (Danmark),1912_STOCKHOLM_GD.htm#15-OT-12-III,Stockholm.,1912,Olympic
3,1912-STOCKHOLM,PLACES_3&4,9-0 (4-0),Netherlands (Nederland),Finland (Suomi),1912_STOCKHOLM_GD.htm#16-OT-12-IV,Stockholm.,1912,Olympic
4,1912-STOCKHOLM,FINAL,2-4 (1-4),Denmark (Danmark),Great Britain,1912_STOCKHOLM_GD.htm#17-OT-12-V,Stockholm.,1912,Olympic


In [5]:
# concatenate the data and store in csv
data = pd.concat([data_wc,data_oly],ignore_index=True)
# to_csv does not create missing directories, so make sure ./data exists first.
if not os.path.exists('./data'):
    os.makedirs('./data')
data.to_csv('./data/matches.csv', index=False, encoding='utf-8')

In [6]:
# pre process the data
# Reload the combined matches from ./data/matches.csv and preview the first rows.
df = pd.read_csv('./data/matches.csv')
df.head()

Unnamed: 0,edition,round,score,team1,team2,url,venue,year,competition
0,1930-URUGUAY,GROUP_STAGE,4-1 (3-0),France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,WorldCup
1,1930-URUGUAY,GROUP_STAGE,3-0 (2-0),USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,WorldCup
2,1930-URUGUAY,GROUP_STAGE,2-1 (2-0),Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,WorldCup
3,1930-URUGUAY,GROUP_STAGE,3-1 (1-0),Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,WorldCup
4,1930-URUGUAY,GROUP_STAGE,1-0 (0-0),Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,WorldCup


In [7]:
# check dataset
# print(...) with a single argument prints the same on Python 2.7 and also parses on Python 3.
print(set(df['competition']))
# For each competition show [name, first year, last year].
for comp_name in ['WorldCup', 'Olympic']:
    comp_years = df['year'][df['competition'] == comp_name]
    print([comp_name, min(comp_years), max(comp_years)])

set(['WorldCup', 'Olympic'])
['WorldCup', 1930, 2018]
['Olympic', 1908, 2016]


In [8]:
# editing of score
# Keep only the text before the first space, e.g. "4-1 (3-0)" -> "4-1".
df['score_only'] = [raw.partition(" ")[0] for raw in df['score']]

In [9]:
# Collect every distinct score string that cannot be parsed as "<int>-<int>".
# Both halves are checked because both are converted to int further down.
excl = set()
for s in df['score_only']:
    try:
        parts = s.split("-")
        int(parts[0])
        int(parts[1])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: no "-"; ValueError: not a number.
        excl.add(s)
excl = list(excl)

In [10]:
# The 2018 Russia rows are still empty, so drop that year.
is_2018 = df['year'] == 2018
df = df[~is_2018]

In [11]:
# remove matches with missing scores
# .copy() gives an independent frame, so the column assignments that follow
# do not act on a slice of the old frame (avoids SettingWithCopyWarning).
df = df[~df['score_only'].isin(excl)].copy()

In [12]:
# team scores and goal difference
# Split each "a-b" score once and reuse both halves.
score_parts = [s.split("-") for s in df['score_only']]
df['team1_score'] = [int(pair[0]) for pair in score_parts]
df['team2_score'] = [int(pair[1]) for pair in score_parts]
# Positive when team1 scored more goals than team2.
df['goal_diff'] = df['team1_score'] - df['team2_score']

In [13]:
# edit team names
def getTeamEdit(COL):
    '''Return a list of the values in COL with any trailing " (...)" part removed.

    Each string is cut at its first " (" (e.g. "Brazil (Brasil)" -> "Brazil").
    Values that cannot be split as strings are passed through unchanged.
    '''
    t1 = []
    for s in COL:
        try:
            name = s.split(" (")[0]
        except (AttributeError, TypeError):
            # Not a text value (e.g. a missing entry): keep it as it is.
            name = s
        t1.append(name)
    return t1
# Clean both team columns first, then store them next to the originals.
team1_names = getTeamEdit(df['team1'])
team2_names = getTeamEdit(df['team2'])
df['team1_edit'] = team1_names
df['team2_edit'] = team2_names

In [14]:
# export labels
# Columns to keep in the cleaned data set.
labels = ['competition','year','edition','round','team1_edit','team2_edit','team1_score','team2_score','score_only','goal_diff']
# Select them in the order in which they appear in df, not the order of `labels`.
keep = [col for col in df.columns if col in labels]
df1 = df[keep]

In [15]:
# save csv
# Write UTF-8 explicitly, like the matches.csv export, instead of relying on
# the pandas default encoding. The index column is still written, as before.
df1.to_csv('./data/matches_cleaned.csv', encoding='utf-8')