## Data Importation 


In [58]:
# Importing Python packages
import bisect

import numpy as np
import pandas as pd 

In [60]:
# Importing the events datasets, one JSON file per league.
# DATA_DIR is the single place to edit when the files live somewhere else.
DATA_DIR = '/Users/yungreezy'


def _load_events(country):
    """Read DATA_DIR/events_<country>.json into a DataFrame with pd.read_json."""
    return pd.read_json(DATA_DIR + '/events_' + country + '.json')


prem = _load_events('England')
ligue1 = _load_events('France')
bundesliga = _load_events('Germany')
seriea = _load_events('Italy')
# NOTE(review): 'spain' is lower-case while the other four file names are capitalised.
# This only resolves on a case-insensitive filesystem if the file on disk is
# 'events_Spain.json' - confirm the real file name.
laliga = _load_events('spain')

## Data Wrangling

In [62]:
def _with_marker_row(frame):
    """Return `frame` followed by one marker row, re-indexed from 0.

    The marker row only carries value=NaN, so every other column is NaN in it.
    Stands in for DataFrame.append({'value': np.nan}, ignore_index=True),
    which was removed in pandas 2.0.
    """
    return pd.concat([frame, pd.DataFrame([{'value': np.nan}])], ignore_index=True)


#Merging datasets from the different leagues into one events table
all_leagues = pd.concat([prem, ligue1, bundesliga, seriea, laliga])

#Sorting data chronologically within each match and period
all_leagues.sort_values(by=['matchId','matchPeriod', 'eventSec'], inplace=True) 

#creating dummy fields 
# PotentialChance: the 'Shot' flag of the NEXT row (shift(-1)), so it is 1 on the event
# right before a shot; the last row of the table becomes NaN.
# NOTE(review): the shift runs over the whole table, so the last event of a match looks
# at the first event of the following match.
all_leagues['PotentialChance'] = all_leagues['subEventName'].apply(lambda x: 1 if x == 'Shot' else 0).shift(-1) 
# Interruption_dummy: 1 when the event itself is an Interruption, Foul or Offside.
all_leagues['Interruption_dummy'] = all_leagues['eventName'].apply(lambda x: 1 if x in ['Interruption', 'Foul', 'Offside'] else 0) 

#creating nominal fields for tracking interruptions sequences and chances
all_leagues['Interruption'] = all_leagues['eventName'].apply(lambda x: 'interruption' if x in ['Interruption', 'Foul', 'Offside'] else 'seq') 
# Rows flagged as PotentialChance are relabelled, overriding 'interruption'/'seq'.
all_leagues.loc[all_leagues['PotentialChance']==1,'Interruption']='chancecreated' 

#separating data by matches so event sequences don't spill between matches in the dataset
all_leagues1 = all_leagues.groupby('matchId') 
all_leagues1 = pd.concat([_with_marker_row(i) for _, i in all_leagues1]) 
# Fill the marker rows, then shift(1): every row now carries the label of the row above
# it, the first row of each following match reads 1 / 'opening whistle', and the very
# first row of the table becomes NaN (filled further down).
all_leagues1['Interruption_dummy'] = all_leagues1['Interruption_dummy'].fillna(1).shift(1) 
all_leagues1['Interruption'] = all_leagues1['Interruption'].fillna('opening whistle').shift(1)  

# The per-match marker rows have a NaN matchId; groupby excludes NaN keys by default,
# so they are not part of any group here and new marker rows are added per period.
all_leagues1 = all_leagues1.groupby(['matchId','matchPeriod']) 
all_leagues1 = pd.concat([_with_marker_row(i) for _, i in all_leagues1]) 
# Labels the marker row at the end of every period (and, temporarily, the NaN first row).
all_leagues1['Interruption'] = all_leagues1['Interruption'].fillna('halftime whistle')

# A row that sits immediately before an 'opening whistle' row is relabelled 'final whistle'.
# NOTE(review): the last match in the table is not followed by an 'opening whistle',
# so its closing marker row keeps the 'halftime whistle' label.
all_leagues1['value'] = all_leagues1['Interruption'].apply(lambda x: 'final whistle' if x == 'opening whistle' else 'value').shift(-1) 
all_leagues1.loc[all_leagues1['value'] == 'final whistle', 'Interruption'] = 'final whistle' 
all_leagues1['Interruption_dummy'] = all_leagues1['Interruption_dummy'].fillna(1) 

# NOTE(review): index labels repeat here (every group was re-indexed from 0), so these two
# .loc[0, ...] writes hit the first row of EVERY (matchId, matchPeriod) group, not only the
# first row of the table - confirm this is intended.
all_leagues1.loc[0,'Interruption_dummy'] = 1 

all_leagues1.loc[0,'Interruption'] = 'opening whistle'
all_leagues1 = all_leagues1.drop(columns = 'value') 
# reset_index() keeps the old per-group labels as a new 'index' column and gives the
# frame a fresh 0..n index.
all_leagues1 = all_leagues1.reset_index()



In [64]:
# Collect the index labels of the rows tagged as a created chance and of the rows
# tagged as an interruption, as two plain Python lists.
interruption_labels = all_leagues1['Interruption']
chancecreated_index = interruption_labels[interruption_labels == 'chancecreated'].index.tolist()
interruptions_index = interruption_labels[interruption_labels == 'interruption'].index.tolist()

In [66]:
#creating a list of interruptions that immediately precede a chance created.
# For each chance, binary-search the sorted interruption indexes for the closest one
# strictly below it, instead of rescanning the whole interruption list per chance.
sorted_interruptions = sorted(interruptions_index)
lazt_interruptions = []
for c in chancecreated_index:
    # bisect_left returns how many interruption indexes are strictly smaller than c.
    preceding = bisect.bisect_left(sorted_interruptions, c)
    if preceding:
        lazt_interruptions.append(sorted_interruptions[preceding - 1])
    else:
        # No interruption before this chance: start the sequence at row 0 so the list
        # stays aligned one-to-one with chancecreated_index (max() on an empty list
        # would raise ValueError here).
        lazt_interruptions.append(0)

In [68]:
# Pair every chance with the interruption found for it: (interruption, chance created).
# The pairs are used as slice bounds when building the per-sequence frames.
sequences = [(start, stop) for start, stop in zip(lazt_interruptions, chancecreated_index)]

In [70]:
#generating a new event sequence dataframe: one row per sequence start (interruption
#index), one column per subEventName, values = how often that sub-event occurs between
#the interruption (inclusive) and the chance (exclusive).
# The slices are taken from all_leagues1, the frame the sequence bounds were computed on.
counts_by_start = {}
for int_idx, cha_idx in sequences:
    event_count = all_leagues1.iloc[int_idx:cha_idx, :].subEventName.value_counts()
    # Several chances can share one interruption; the later pair replaces the earlier one.
    counts_by_start[int_idx] = event_count.to_dict()

# Build the table once from the collected rows; sub-events absent from a sequence count as 0.
final_df = pd.DataFrame(
    list(counts_by_start.values()),
    index=list(counts_by_start.keys()),
).fillna(0).astype(int)

In [72]:
# One independent copy of the event rows per (interruption, chance created) pair, from the
# interruption row up to, but not including, the chance row.
# Sliced from all_leagues1 because that is the frame the pairs in `sequences` index into.
interchance_list = [all_leagues1.iloc[start:stop, :].copy() for start, stop in sequences]

In [73]:
# Frequency of each subEventName inside every sequence, one Series per sequence.
counts = [segment['subEventName'].value_counts() for segment in interchance_list]

In [74]:
# Same counts with the sub-event names moved out of the index into a regular column.
counts2 = [event_counts.reset_index() for event_counts in counts]

In [75]:
# Each count Series as a one-column DataFrame indexed by sub-event name.
# The column is named explicitly: to_frame() would otherwise inherit the Series name,
# which is 'count' on pandas >= 2.0, while the lookups below expect 'subEventName'.
counts4 = [event_counts.to_frame(name='subEventName') for event_counts in counts]

In [78]:
event_list = ['Air duel','Ground attacking duel','Ground defending duel','Ground loose ball duel','Foul','Hand foul','Late card foul','Out of game foul','Protest','Simulation','Time lost foul','Violent Foul','Corner','Free Kick','Free kick cross','Free kick shot','Goal kick','Penalty','Throw in','Goalkeeper leaving line','Ball out of the field','Whistle','Offside','Acceleration','Clearance','Touch','Cross','Hand pass','Head pass','High pass','Launch','Simple pass','Smart pass','Reflexes','Save attempt','Shot']


def _subevent_counts(count_frames, sub_event):
    """Return, for every frame in `count_frames`, the count stored for `sub_event`.

    Each frame is expected to be a one-column count table indexed by sub-event name
    (the frames in counts4). The count is read from the first column by position, so
    the lookup does not depend on what that column is called. A frame that has no row
    for `sub_event` contributes 0.
    """
    totals = []
    for frame in count_frames:
        if sub_event in frame.index:
            totals.append(frame.loc[sub_event].iloc[0])
        else:
            totals.append(0)
    return totals


# One list per sub-event, each aligned position-by-position with counts4.
simplepass_count = _subevent_counts(counts4, 'Simple pass')
airduel_count = _subevent_counts(counts4, 'Air duel')
groundattack_count = _subevent_counts(counts4, 'Ground attacking duel')
grounddefend_count = _subevent_counts(counts4, 'Ground defending duel')
groundloose_count = _subevent_counts(counts4, 'Ground loose ball duel')
foul_count = _subevent_counts(counts4, 'Foul')
outofgamefoul_count = _subevent_counts(counts4, 'Out of game foul')
handfoul_count = _subevent_counts(counts4, 'Hand foul')
latecardfoul_count = _subevent_counts(counts4, 'Late card foul')
protest_count = _subevent_counts(counts4, 'Protest')
simulation_count = _subevent_counts(counts4, 'Simulation')
timelostfoul_count = _subevent_counts(counts4, 'Time lost foul')
# Violent fouls get their own list; they must not be appended to timelostfoul_count,
# otherwise that list ends up twice as long as the others and this one stays empty.
violentfoul_count = _subevent_counts(counts4, 'Violent Foul')
corner_count = _subevent_counts(counts4, 'Corner')
freekick_count = _subevent_counts(counts4, 'Free Kick')
freekickcross_count = _subevent_counts(counts4, 'Free kick cross')
freekickshot_count = _subevent_counts(counts4, 'Free kick shot')
goalkick_count = _subevent_counts(counts4, 'Goal kick')
penalty_count = _subevent_counts(counts4, 'Penalty')
throwin_count = _subevent_counts(counts4, 'Throw in')
keeperline_count = _subevent_counts(counts4, 'Goalkeeper leaving line')
outoffield_count = _subevent_counts(counts4, 'Ball out of the field')
whistle_count = _subevent_counts(counts4, 'Whistle')
offside_count = _subevent_counts(counts4, 'Offside')
acceleration_count = _subevent_counts(counts4, 'Acceleration')
clearance_count = _subevent_counts(counts4, 'Clearance')
touch_count = _subevent_counts(counts4, 'Touch')
cross_count = _subevent_counts(counts4, 'Cross')
handpass_count = _subevent_counts(counts4, 'Hand pass')
headpass_count = _subevent_counts(counts4, 'Head pass')
highpass_count = _subevent_counts(counts4, 'High pass')
launch_count = _subevent_counts(counts4, 'Launch')
smartpass_count = _subevent_counts(counts4, 'Smart pass')
reflexes_count = _subevent_counts(counts4, 'Reflexes')
saveattempt_count = _subevent_counts(counts4, 'Save attempt')
shot_count = _subevent_counts(counts4, 'Shot')

In [136]:
# Column label -> per-sequence count list, in the order the columns should appear.
# timelostfoul_count and violentfoul_count are not part of this table.
column_sources = [
    ('Simple Pass', simplepass_count),
    ('Air Duel', airduel_count),
    ('G A Duel', groundattack_count),
    ('G D Duel', grounddefend_count),
    ('G L Duel', groundloose_count),
    ('Foul', foul_count),
    ('Out Foul', outofgamefoul_count),
    ('Hand Foul', handfoul_count),
    ('Late Foul', latecardfoul_count),
    ('Protest', protest_count),
    ('Simulation', simulation_count),
    ('Corner Count', corner_count),
    ('Free Kick', freekick_count),
    ('Free Kick Cross', freekickcross_count),
    ('Free Kick Shot', freekickshot_count),
    ('Goal Kick', goalkick_count),
    ('Penalty', penalty_count),
    ('Throw In', throwin_count),
    ('Keeper Off Line', keeperline_count),
    ('Out Of Field', outoffield_count),
    ('Whistle', whistle_count),
    ('Offside', offside_count),
    ('Acceleration', acceleration_count),
    ('Clearance', clearance_count),
    ('Touch', touch_count),
    ('Cross', cross_count),
    ('Hand Pass', handpass_count),
    ('Head Pass', headpass_count),
    ('High Pass', highpass_count),
    ('Launch', launch_count),
    ('Smart Pass', smartpass_count),
    ('Reflexes', reflexes_count),
    ('Save Attempt', saveattempt_count),
    ('Shot', shot_count),
]
d = dict(column_sources)

# One row per sequence, one column per entry of d.
sequences_df = pd.DataFrame(d)

In [143]:
# Row count of sequences_df, i.e. how many sequences were assembled.
sequences_df.shape[0]

40458

In [145]:
# Preview the first five rows of the merged, sorted events table with its derived columns.
all_leagues.head(5)

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,PotentialChance,Interruption_dummy,Interruption
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171,0.0,0,seq
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.94685,83,177959172,0.0,0,seq
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173,0.0,0,seq
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174,0.0,0,seq
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175,0.0,0,seq
