In [1]:
import pandas as pd
import numpy as np

In [2]:
# games_data stores game information
# 10112 rows × 18 columns
# Columns: 'id_odsp', 'link_odsp', 'adv_stats', 'date', 'league', 'season', 'country', 'ht', 'at', 'fthg', 'ftag', 'odd_h', 'odd_d', 'odd_a', 'odd_over', 'odd_under', 'odd_bts', 'odd_bts_n'
games_data = pd.read_csv("ginf.csv")

### Games Data
1. id_odsp: Primary key
2. adv_stats: boolean if the game has detailed event data
3. fthg: full time home goals
4. ftag: full time away goals
5. odd_h, odd_d, odd_a: highest odds for home win, draw, away win

90% missing for columns: odd_over, odd_under, odd_bts, odd_bts_n

In [3]:
# game_events stores events data
# 941009 rows × 22 columns
game_events = pd.read_csv("events.csv")

### Events Data
1. id_odsp: 
2. id_event: primary key
3. sort_order: chronological sequence of events in a game
4. time: minute of the game
5. text: text commentary
6. event_type: primary event. 11 unique events (1-Attempt(shot), 2-Corner, 3-Foul, 4-Yellow Card, 5-Second yellow card, 6-(Straight) red card, 7-Substitution, 8-Free kick won, 9-Offside, 10-Hand Ball, 11-Penalty conceded)
7. event_type2: secondary event. 4 unique events (12 - Key Pass, 13 - Failed through ball, 14-Sending off, 15-Own goal)
8. side: 1-Home, 2-Away
9. event_team: team that produced the event. In case of Own goals, event team is the team that beneficiated from the own goal
10. opponent: team that the event happened against
11. player: name of the player involved in main event (converted to lowercase and special chars were removed)
12. player2: name of player involved in secondary event
13. player_in: player that came in (only applies to substitutions)
14. player_out: player substituted (only applies to substitutions)
15. shot_place: placement of the shot (13 possible placement locations, available in the dictionary, only applies to shots)
16. shot_outcome: 4 possible outcomes (1-On target, 2-Off target, 3-Blocked, 4-Hit the post)
17. is_goal: binary variable if the shot resulted in a goal (own goals included)
18. location: location on the pitch where the event happened (19 possible locations, available in the dictionary)
19. bodypart: (1- right foot, 2-left foot, 3-head)
20. assist_method: in case of an assisted shot, 5 possible assist methods (details in the dictionary)
21. situation: 4 types: 1-Open Play, 2-Set piece (excluding Direct Free kicks), 3-Corner, 4-Free kick
22. fast_break: binary

In [None]:
a = game_events.groupby(['id_odsp'])['sort_order'].max().reset_index()
a['sort_order'].describe()
#decided to use 100 events for this project since average event count as 104 and 75% of the games have less than 115 events 

In [4]:
games_data = games_data[games_data['adv_stats'] == True]
games_data_sf = games_data.sample(frac=1)
games_data_sf.drop(games_data_sf.columns[-4:], axis=1, inplace=True)
def f(x):
    if x['fthg'] > x['ftag']: return 1
    elif x['fthg'] < x['ftag']: return -1
    else: return 0
games_data_sf['class'] = games_data_sf.apply(f, axis=1)

#9182 rows left after basic filtering
# 1836 for dev, 6427 for cross validation, 919 for final test

In [5]:
dev_data = games_data_sf[:1836].copy()
dev_data.set_index('id_odsp',inplace = True)

In [7]:
#keep certain columns in event
event_columns = game_events.columns.tolist()
event_columns.remove('event_team')
event_columns.remove('opponent')
event_columns.remove('is_goal')
event_columns.remove('player')
event_columns.remove('player2')
event_columns.remove('player_in')
event_columns.remove('player_out')
event_columns.remove('text')

In [10]:
dev_games = dev_data.index.tolist()

In [12]:
dev_event_dt = game_events[game_events['sort_order'] < 101]
dev_event_dt = dev_event_dt[dev_event_dt.columns.intersection(event_columns)]
dev_event_dt=dev_event_dt[dev_event_dt['id_odsp'].isin(dev_games)]

In [13]:
for i in range(1,101):
    for column in event_columns:
        dev_data[str(i)+"-"+column] = ""
for index, row in dev_event_dt.iterrows(): 
    order = str(row['sort_order'])
    event_id = row['id_odsp']
    for column1 in event_columns:
        column_name = order+"-"+column1
        if str(row[column1]) != "":
            dev_data.at[event_id, column_name] = row[column1]

  dev_data[str(i)+"-"+column] = ""


In [17]:
dev_data = dev_data[dev_data.columns.drop(list(dev_data.filter(regex='id')))]

In [19]:
dev_data.to_csv("dev_data.csv", encoding = 'utf-8')

In [18]:
dev_data.columns.tolist()

['link_odsp',
 'adv_stats',
 'date',
 'league',
 'season',
 'country',
 'ht',
 'at',
 'fthg',
 'ftag',
 'odd_h',
 'odd_d',
 'odd_a',
 'class',
 '1-sort_order',
 '1-time',
 '1-event_type',
 '1-event_type2',
 '1-shot_place',
 '1-shot_outcome',
 '1-location',
 '1-bodypart',
 '1-assist_method',
 '1-situation',
 '1-fast_break',
 '2-sort_order',
 '2-time',
 '2-event_type',
 '2-event_type2',
 '2-shot_place',
 '2-shot_outcome',
 '2-location',
 '2-bodypart',
 '2-assist_method',
 '2-situation',
 '2-fast_break',
 '3-sort_order',
 '3-time',
 '3-event_type',
 '3-event_type2',
 '3-shot_place',
 '3-shot_outcome',
 '3-location',
 '3-bodypart',
 '3-assist_method',
 '3-situation',
 '3-fast_break',
 '4-sort_order',
 '4-time',
 '4-event_type',
 '4-event_type2',
 '4-shot_place',
 '4-shot_outcome',
 '4-location',
 '4-bodypart',
 '4-assist_method',
 '4-situation',
 '4-fast_break',
 '5-sort_order',
 '5-time',
 '5-event_type',
 '5-event_type2',
 '5-shot_place',
 '5-shot_outcome',
 '5-location',
 '5-bodypart',


In [None]:
game_events