## Kaggle European Soccer Database Analysis
### Extract Ball Event Data from Match Table

The data can be downloaded here: [Kaggle European Soccer Database](https://www.kaggle.com/hugomathien/soccer).

This notebook extracts ball-event data from the Match table. The ball events are stored in the Match table as XML-style strings, so they have to be extracted before further data analysis.

### Table of Contents
#### 1. [Extract Goal Event](#1)
#### 2. [Extract Shoton Event](#2)
#### 3. [Extract Shotoff Event](#3)
#### 4. [Extract Foulcommit Event](#4)
#### 5. [Extract Card Event](#5)
#### 6. [Extract Cross Event](#6)
#### 7. [Extract Corner Event](#7)
#### 8. [Extract Possession Event](#8)

In [4]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sqlite3
import pickle
from tqdm import tqdm
import os
import bs4

# Apply seaborn's 'whitegrid' style and 'notebook' context
sns.set_style('whitegrid')
sns.set_context('notebook')

# Printed once every statement above (imports and style setup) has run
print('All general modules are imported.')

All general modules are imported.


In [5]:
# Establish connection to sql database

# database file name
db_filename = 'database.sqlite'

# change database file directory as you wish
db_filedir = os.path.join(os.path.pardir, os.path.pardir, os.path.pardir, 'data_source', 'kaggle', db_filename)

# sqlite3.connect() creates a new, empty database when the file does not exist,
# so check for the file first instead of silently working on an empty database.
if not os.path.isfile(db_filedir):
    print('Unable to establish the connection.')
    raise FileNotFoundError('Database file not found: {}'.format(db_filedir))

try:
    con = sqlite3.connect(db_filedir)
    print('Connection to the database is established.\n')
except sqlite3.Error:
    print('Unable to establish the connection.')
    # Stop here: the statements below need a working connection, and
    # continuing would only fail later on an undefined (or stale) `con`.
    raise

# List the tables stored in the database
cursor = con.execute("select name from sqlite_master where type='table'")
print('Following Tables are found in the database:')
for (table_name,) in cursor.fetchall():
    print('{}'.format(table_name))

Connection to the database is established.

Following Tables are found in the database:
sqlite_sequence
Player_Attributes
Player
Match
League
Country
Team
Team_Attributes


In [6]:
# read match table
try:
    match = pd.read_sql_query('select * from Match', con)
    print('Successfully load match table from the database.')
except Exception:
    print('Unable to load match table from the database.')
    # Re-raise so the notebook stops here instead of continuing with an
    # undefined (or stale) `match` table in the extraction cells below.
    raise

Successfully load match table from the database.


In [7]:
# Folder that receives the exported data files (three levels up, then
# data_source/kaggle) -- adjust the path components to store them elsewhere
_export_parts = [os.path.pardir] * 3 + ['data_source', 'kaggle']
export_folder = os.path.join(*_export_parts)

#### <a id='1'></a>1. Goal

In [4]:
# Turn this on if you want to parse goal information
parse_goal = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


def coordinate_or_nan(values, position):
    """Return the text of `values[position]`, or NaN when that entry is missing."""
    try:
        return values[position].text
    except IndexError:
        return np.nan


if parse_goal:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    goal_matches = match.loc[match['goal'].notnull(), ['match_api_id', 'goal']]

    # Tags read from each event; a missing tag is recorded as NaN
    goal_tags = ['id', 'type', 'subtype', 'goal_type', 'player1', 'player2',
                 'team', 'elapsed', 'elapsed_plus']

    # One dictionary per extracted goal event
    goal_records = []

    # Loop over all matches that have goal data
    for match_api_id, goal_xml in tqdm(zip(goal_matches['match_api_id'], goal_matches['goal']),
                                       total=goal_matches.shape[0],
                                       desc='Extracting Goal Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(goal_xml, 'lxml')

        # Loop over all events (direct <value> children of <goal>)
        for element in soup.goal.find_all('value', recursive=False):
            record = {tag: text_or_nan(element, tag) for tag in goal_tags}

            # Nested <value> tags: the first is stored as longitude,
            # the second as latitude
            nested_values = element.find_all('value')
            record['longitude'] = coordinate_or_nan(nested_values, 0)
            record['latitude'] = coordinate_or_nan(nested_values, 1)

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            goal_records.append(record)

    # Create a dataframe (explicit column order)
    goal = pd.DataFrame(goal_records,
                        columns=['id', 'type', 'subtype', 'goal_type', 'player1', 'player2',
                                 'team', 'elapsed', 'longitude', 'latitude',
                                 'match_api_id', 'elapsed_plus'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'latitude', 'longitude',
                   'match_api_id', 'player1', 'player2', 'team']:
        goal[column] = pd.to_numeric(goal[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    goal_path = os.path.join(export_folder, 'goal.data')
    if not os.path.isfile(goal_path):
        print('Saving goal dataset to file...')
        try:
            with open(goal_path, 'wb') as f:
                pickle.dump(goal, f)
            print('Goal dataset has been saved to goal.data')
        except Exception as e:
            print('Unable to save data to: goal.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('goal dataset is cached in file: goal.data.')

#### <a id='2'></a>2. Shoton

In [5]:
# Turn this on if you want to parse shoton information
parse_shoton = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


def coordinate_or_nan(values, position):
    """Return the text of `values[position]`, or NaN when that entry is missing."""
    try:
        return values[position].text
    except IndexError:
        return np.nan


if parse_shoton:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    shoton_matches = match.loc[match['shoton'].notnull(), ['match_api_id', 'shoton']]

    # Tags read from each event; a missing tag is recorded as NaN
    shoton_tags = ['id', 'type', 'subtype', 'player1', 'team', 'elapsed', 'elapsed_plus']

    # One dictionary per extracted shoton event
    shoton_records = []

    # Loop over all matches that have shoton data
    for match_api_id, shoton_xml in tqdm(zip(shoton_matches['match_api_id'], shoton_matches['shoton']),
                                         total=shoton_matches.shape[0],
                                         desc='Extracting Shoton Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(shoton_xml, 'lxml')

        # Loop over all events (direct <value> children of <shoton>)
        for element in soup.shoton.find_all('value', recursive=False):
            record = {tag: text_or_nan(element, tag) for tag in shoton_tags}

            # Nested <value> tags: the first is stored as longitude,
            # the second as latitude
            nested_values = element.find_all('value')
            record['longitude'] = coordinate_or_nan(nested_values, 0)
            record['latitude'] = coordinate_or_nan(nested_values, 1)

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            shoton_records.append(record)

    # Create a dataframe (explicit column order)
    shoton = pd.DataFrame(shoton_records,
                          columns=['id', 'type', 'subtype', 'player1', 'team', 'elapsed',
                                   'elapsed_plus', 'longitude', 'latitude', 'match_api_id'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'latitude', 'longitude', 'player1', 'team']:
        shoton[column] = pd.to_numeric(shoton[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    shoton_path = os.path.join(export_folder, 'shoton.data')
    if not os.path.isfile(shoton_path):
        print('Saving shoton dataset to file...')
        try:
            with open(shoton_path, 'wb') as f:
                pickle.dump(shoton, f)
            print('shoton dataset has been saved to shoton.data')
        except Exception as e:
            print('Unable to save data to: shoton.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('shoton dataset is cached in file: shoton.data.')

#### <a id='3'></a>3. Shotoff

In [6]:
# Turn this on if you want to parse shotoff information
parse_shotoff = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


def coordinate_or_nan(values, position):
    """Return the text of `values[position]`, or NaN when that entry is missing."""
    try:
        return values[position].text
    except IndexError:
        return np.nan


if parse_shotoff:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    shotoff_matches = match.loc[match['shotoff'].notnull(), ['match_api_id', 'shotoff']]

    # Tags read from each event; a missing tag is recorded as NaN
    shotoff_tags = ['id', 'type', 'subtype', 'player1', 'team', 'elapsed', 'elapsed_plus']

    # One dictionary per extracted shotoff event
    shotoff_records = []

    # Loop over all matches that have shotoff data
    for match_api_id, shotoff_xml in tqdm(zip(shotoff_matches['match_api_id'], shotoff_matches['shotoff']),
                                          total=shotoff_matches.shape[0],
                                          desc='Extracting Shotoff Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(shotoff_xml, 'lxml')

        # Loop over all events (direct <value> children of <shotoff>)
        for element in soup.shotoff.find_all('value', recursive=False):
            record = {tag: text_or_nan(element, tag) for tag in shotoff_tags}

            # Nested <value> tags: the first is stored as longitude,
            # the second as latitude
            nested_values = element.find_all('value')
            record['longitude'] = coordinate_or_nan(nested_values, 0)
            record['latitude'] = coordinate_or_nan(nested_values, 1)

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            shotoff_records.append(record)

    # Create a dataframe (explicit column order)
    shotoff = pd.DataFrame(shotoff_records,
                           columns=['id', 'type', 'subtype', 'player1', 'team', 'elapsed',
                                    'elapsed_plus', 'match_api_id', 'longitude', 'latitude'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'latitude', 'longitude', 'player1', 'team']:
        shotoff[column] = pd.to_numeric(shotoff[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    shotoff_path = os.path.join(export_folder, 'shotoff.data')
    if not os.path.isfile(shotoff_path):
        print('Saving shotoff dataset to file...')
        try:
            with open(shotoff_path, 'wb') as f:
                pickle.dump(shotoff, f)
            print('shotoff dataset has been saved to shotoff.data')
        except Exception as e:
            print('Unable to save data to: shotoff.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('shotoff dataset is cached in file: shotoff.data.')

#### <a id='4'></a>4. Foulcommit

In [7]:
# Turn this on if you want to parse foulcommit information
parse_foulcommit = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


if parse_foulcommit:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    foulcommit_matches = match.loc[match['foulcommit'].notnull(), ['match_api_id', 'foulcommit']]

    # Tags read from each event; a missing tag is recorded as NaN
    foulcommit_tags = ['id', 'type', 'player1', 'player2', 'team',
                       'elapsed', 'elapsed_plus', 'subtype']

    # One dictionary per extracted foulcommit event
    foulcommit_records = []

    # Loop over all matches that have foulcommit data
    for match_api_id, foulcommit_xml in tqdm(zip(foulcommit_matches['match_api_id'],
                                                 foulcommit_matches['foulcommit']),
                                             total=foulcommit_matches.shape[0],
                                             desc='Extracting Foulcommit Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(foulcommit_xml, 'lxml')

        # Loop over all events (direct <value> children of <foulcommit>)
        for element in soup.foulcommit.find_all('value', recursive=False):
            record = {tag: text_or_nan(element, tag) for tag in foulcommit_tags}

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            foulcommit_records.append(record)

    # Create a dataframe (explicit column order)
    foulcommit = pd.DataFrame(foulcommit_records,
                              columns=['id', 'type', 'player1', 'player2', 'team', 'elapsed',
                                       'elapsed_plus', 'subtype', 'match_api_id'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'player1', 'player2', 'team']:
        foulcommit[column] = pd.to_numeric(foulcommit[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    foulcommit_path = os.path.join(export_folder, 'foulcommit.data')
    if not os.path.isfile(foulcommit_path):
        print('Saving foulcommit dataset to file...')
        try:
            with open(foulcommit_path, 'wb') as f:
                pickle.dump(foulcommit, f)
            print('foulcommit dataset has been saved to foulcommit.data')
        except Exception as e:
            print('Unable to save data to: foulcommit.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('foulcommit dataset is cached in file: foulcommit.data.')

#### <a id='5'></a>5. Card

In [8]:
# Turn this on if you want to parse card information
parse_card = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


if parse_card:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    card_matches = match.loc[match['card'].notnull(), ['match_api_id', 'card']]

    # (output column, tag) pairs; note the <card_type> tag is stored in the
    # 'cardtype' column. A missing tag is recorded as NaN.
    card_fields = [('id', 'id'),
                   ('type', 'type'),
                   ('subtype', 'subtype'),
                   ('cardtype', 'card_type'),
                   ('player1', 'player1'),
                   ('team', 'team'),
                   ('elapsed', 'elapsed'),
                   ('elapsed_plus', 'elapsed_plus')]

    # One dictionary per extracted card event
    card_records = []

    # Loop over all matches that have card data
    for match_api_id, card_xml in tqdm(zip(card_matches['match_api_id'], card_matches['card']),
                                       total=card_matches.shape[0],
                                       desc='Extracting Card Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(card_xml, 'lxml')

        # Loop over all events (direct <value> children of <card>)
        for element in soup.card.find_all('value', recursive=False):
            record = {column: text_or_nan(element, tag) for column, tag in card_fields}

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            card_records.append(record)

    # Create a dataframe (explicit column order)
    card = pd.DataFrame(card_records,
                        columns=['id', 'type', 'subtype', 'cardtype', 'player1', 'team',
                                 'elapsed', 'elapsed_plus', 'match_api_id'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'player1', 'team']:
        card[column] = pd.to_numeric(card[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    card_path = os.path.join(export_folder, 'card.data')
    if not os.path.isfile(card_path):
        print('Saving card dataset to file...')
        try:
            with open(card_path, 'wb') as f:
                pickle.dump(card, f)
            print('card dataset has been saved to card.data')
        except Exception as e:
            print('Unable to save data to: card.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('card dataset is cached in file: card.data.')

#### <a id='6'></a>6. Cross

In [9]:
# Turn this on if you want to parse cross information
parse_cross = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


if parse_cross:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    cross_matches = match.loc[match['cross'].notnull(), ['match_api_id', 'cross']]

    # Tags read from each event; a missing tag is recorded as NaN.
    # 'elapsed' must be part of the dataframe: it is converted to numeric below.
    cross_tags = ['id', 'type', 'subtype', 'player1', 'team', 'elapsed', 'elapsed_plus']

    # One dictionary per extracted cross event
    cross_records = []

    # Loop over all matches that have cross data
    for match_api_id, cross_xml in tqdm(zip(cross_matches['match_api_id'], cross_matches['cross']),
                                        total=cross_matches.shape[0],
                                        desc='Extracting Cross Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(cross_xml, 'lxml')

        # Loop over all events (direct <value> children of <cross>)
        for element in soup.cross.find_all('value', recursive=False):
            record = {tag: text_or_nan(element, tag) for tag in cross_tags}

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            cross_records.append(record)

    # Create a dataframe (explicit column order)
    cross = pd.DataFrame(cross_records,
                         columns=['id', 'type', 'subtype', 'player1', 'team',
                                  'elapsed', 'elapsed_plus', 'match_api_id'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'player1', 'team']:
        cross[column] = pd.to_numeric(cross[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    cross_path = os.path.join(export_folder, 'cross.data')
    if not os.path.isfile(cross_path):
        print('Saving cross dataset to file...')
        try:
            with open(cross_path, 'wb') as f:
                pickle.dump(cross, f)
            print('cross dataset has been saved to cross.data')
        except Exception as e:
            print('Unable to save data to: cross.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('cross dataset is cached in file: cross.data.')

#### <a id='7'></a>7. Corner

In [10]:
# Turn this on if you want to parse corner information
parse_corner = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


if parse_corner:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    corner_matches = match.loc[match['corner'].notnull(), ['match_api_id', 'corner']]

    # Tags read from each event; a missing tag is recorded as NaN
    corner_tags = ['id', 'type', 'subtype', 'player1', 'team', 'elapsed', 'elapsed_plus']

    # One dictionary per extracted corner event
    corner_records = []

    # Loop over all matches that have corner data
    for match_api_id, corner_xml in tqdm(zip(corner_matches['match_api_id'], corner_matches['corner']),
                                         total=corner_matches.shape[0],
                                         desc='Extracting Corner Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(corner_xml, 'lxml')

        # Loop over all events (direct <value> children of <corner>)
        for element in soup.corner.find_all('value', recursive=False):
            record = {tag: text_or_nan(element, tag) for tag in corner_tags}

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            corner_records.append(record)

    # Create a dataframe (explicit column order)
    corner = pd.DataFrame(corner_records,
                          columns=['id', 'type', 'subtype', 'player1', 'team',
                                   'elapsed', 'elapsed_plus', 'match_api_id'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'player1', 'team']:
        corner[column] = pd.to_numeric(corner[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    corner_path = os.path.join(export_folder, 'corner.data')
    if not os.path.isfile(corner_path):
        print('Saving corner dataset to file...')
        try:
            with open(corner_path, 'wb') as f:
                pickle.dump(corner, f)
            print('corner dataset has been saved to corner.data')
        except Exception as e:
            print('Unable to save data to: corner.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('corner dataset is cached in file: corner.data.')

#### <a id='8'></a>8. Possession

In [11]:
# Turn this on if you want to parse possession information
parse_pos = False


def text_or_nan(element, tag):
    """Return the text of the first `tag` found under `element`, or NaN if absent.

    Kept local to this cell so the cell does not rely on another extraction
    cell having been run first.
    """
    try:
        return element.find(tag).text
    except AttributeError:
        # element.find() returned something without .text (tag not present)
        return np.nan


if parse_pos:
    # Filter the match table once (not on every loop iteration) and keep
    # only the two columns that are needed below.
    possession_matches = match.loc[match['possession'].notnull(), ['match_api_id', 'possession']]

    # Tags read from each event; a missing tag is recorded as NaN
    possession_tags = ['id', 'type', 'subtype', 'awaypos', 'homepos',
                       'elapsed', 'elapsed_plus']

    # One dictionary per extracted possession event
    possession_records = []

    # Loop over all matches that have possession data
    for match_api_id, possession_xml in tqdm(zip(possession_matches['match_api_id'],
                                                 possession_matches['possession']),
                                             total=possession_matches.shape[0],
                                             desc='Extracting Possession Events', unit='matches'):

        # Create a soup to parse
        soup = bs4.BeautifulSoup(possession_xml, 'lxml')

        # Loop over all events (direct <value> children of <possession>)
        for element in soup.possession.find_all('value', recursive=False):
            record = {tag: text_or_nan(element, tag) for tag in possession_tags}

            # Remember which match the event belongs to
            record['match_api_id'] = match_api_id

            possession_records.append(record)

    # Create a dataframe (explicit column order)
    possession = pd.DataFrame(possession_records,
                              columns=['id', 'type', 'subtype', 'awaypos', 'homepos',
                                       'elapsed', 'elapsed_plus', 'match_api_id'])

    # Change data types to be consistent with database
    for column in ['elapsed', 'elapsed_plus', 'id', 'awaypos', 'homepos']:
        possession[column] = pd.to_numeric(possession[column], errors='coerce')

    # Save to local disk
    # If data file has not been generated in the same folder yet
    possession_path = os.path.join(export_folder, 'possession.data')
    if not os.path.isfile(possession_path):
        print('Saving possession dataset to file...')
        try:
            with open(possession_path, 'wb') as f:
                pickle.dump(possession, f)
            print('possession dataset has been saved to possession.data')
        except Exception as e:
            print('Unable to save data to: possession.data')
            # Show why the save failed instead of discarding the error
            print('Reason: {}'.format(e))
    else:
        print('possession dataset is cached in file: possession.data.')