In [1]:
import pandas as pd
import numpy as np

In [2]:
# games_data stores game information
# 10112 rows × 18 columns
# Columns: 'id_odsp', 'link_odsp', 'adv_stats', 'date', 'league', 'season', 'country', 'ht', 'at', 'fthg', 'ftag', 'odd_h', 'odd_d', 'odd_a', 'odd_over', 'odd_under', 'odd_bts', 'odd_bts_n'
games_data = pd.read_csv("ginf.csv")

### Games Data
1. id_odsp: Primary key
2. adv_stats: boolean if the game has detailed event data
3. fthg: full time home goals
4. ftag: full time away goals
5. odd_h, odd_d, odd_a: highest odds for home win, draw, away win

90% missing for columns: odd_over, odd_under, odd_bts, odd_bts_n

In [3]:
# game_events stores events data
# 941009 rows × 22 columns
game_events = pd.read_csv("events.csv")

### Events Data
1. id_odsp: 
2. id_event: primary key
3. sort_order: chronological sequence of events in a game
4. time: minute of the game
5. text: text commentary
6. event_type: primary event. 11 unique events (1-Attempt(shot), 2-Corner, 3-Foul, 4-Yellow Card, 5-Second yellow card, 6-(Straight) red card, 7-Substitution, 8-Free kick won, 9-Offside, 10-Hand Ball, 11-Penalty conceded)
7. event_type2: secondary event. 4 unique events (12 - Key Pass, 13 - Failed through ball, 14-Sending off, 15-Own goal)
8. side: 1-Home, 2-Away
9. event_team: team that produced the event. In case of Own goals, event team is the team that beneficiated from the own goal
10. opponent: team that the event happened against
11. player: name of the player involved in main event (converted to lowercase and special chars were removed)
12. player2: name of player involved in secondary event
13. player_in: player that came in (only applies to substitutions)
14. player_out: player substituted (only applies to substitutions)
15. shot_place: placement of the shot (13 possible placement locations, available in the dictionary, only applies to shots)
16. shot_outcome: 4 possible outcomes (1-On target, 2-Off target, 3-Blocked, 4-Hit the post)
17. is_goal: binary variable if the shot resulted in a goal (own goals included)
18. location: location on the pitch where the event happened (19 possible locations, available in the dictionary)
19. bodypart: (1- right foot, 2-left foot, 3-head)
20. assist_method: in case of an assisted shot, 5 possible assist methods (details in the dictionary)
21. situation: 4 types: 1-Open Play, 2-Set piece (excluding Direct Free kicks), 3-Corner, 4-Free kick
22. fast_break: binary

In [None]:
a = game_events.groupby(['id_odsp'])['sort_order'].max().reset_index()
a['sort_order'].describe()
#decided to use 100 events for this project since average event count as 104 and 75% of the games have less than 115 events 

In [4]:
#add class column to games_data based on scores
#shuffle the data before spliting into different sets
games_data = games_data[games_data['adv_stats'] == True]
games_data_sf = games_data.sample(frac=1)
games_data_sf.drop(games_data_sf.columns[-4:], axis=1, inplace=True)
def f(x):
    if x['fthg'] > x['ftag']: return 1
    elif x['fthg'] < x['ftag']: return -1
    else: return 0
games_data_sf['class'] = games_data_sf.apply(f, axis=1)

# 9182 rows left after basic filtering
# 1836 for dev, 6427 for cross validation, 919 for final test

In [5]:
# Target: Game data - games_data_sf
#Convert nominal column into columns of binary variables
#Columns: league, country
#Referenced from https://stackoverflow.com/questions/42151359/pandas-convert-text-to-binary-columns
games_data_sf = pd.concat([games_data_sf, pd.get_dummies(games_data_sf.league).rename(columns = "{}_league_binary".format)], axis = 1)
games_data_sf = pd.concat([games_data_sf, pd.get_dummies(games_data_sf.country).rename(columns = "{}_country_binary".format)], axis = 1)

In [6]:
# Target: Game data - games_data_sf
# Drop link_odsp, date, adv_stats, league, ht, at, country columns
games_data_sf.drop(["link_odsp","date","adv_stats","league","ht","at","country"], axis=1, inplace=True)

In [7]:
# Target: Game data - dev_data
# getting dev set data 
dev_data = games_data_sf[:1836].copy()
dev_data.set_index('id_odsp',inplace = True)

In [8]:
# Target: Event data - game_events_m
# Replace column value with dictionary, reference: https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict-preserve-nans 
# Convert event_type, event_type2, side, 
game_events_m = game_events.copy()
event_type1 = {1:"Attempt(shot)", 2:"Corner", 3:"Foul", 4:"YellowCard", 5:"SecondYellowCard", 6:"StraightRedCard", 7:"Substitution", 8:"FreeKickWon", 9:"Offside", 10:"HandBall", 11:"PenaltyConceded"}
event_type2 = {12:"KeyPass", 13:"FailedThroughBall", 14:"SendingOff", 15:"OwnGoal"}
shot_outcome = {1:"OnTarget", 2:"OffTarget", 3:"Blocked", 4:"HitThePost"}
assist_method = {0:"None",1:"Pass",2:"Cross", 3: "HeadedPass", 4: "ThroughBall"}
situation = {1:"OpenPlay",2:"SetPiece",3:"Corner",4:"FreeKick"}
game_events_m.replace({"event_type": event_type1,"event_type2":event_type2,"side":{2:0,1:1}, "shot_outcome": shot_outcome,"assist_method":assist_method, "situation":situation}, inplace = True)
game_events_m.rename(columns={"side": "home_event"}, inplace = True)

In [9]:
# Target: Event data - game_events_m
#Convert nominal column into columns of binary variables
#Columns: league, country
game_events_m = pd.concat([game_events_m, pd.get_dummies(game_events_m.event_type).rename(columns = "{}_EventType_binary".format)], axis = 1)
game_events_m = pd.concat([game_events_m, pd.get_dummies(game_events_m.event_type2).rename(columns = "{}_EventType2_binary".format)], axis = 1)
game_events_m = pd.concat([game_events_m, pd.get_dummies(game_events_m.shot_outcome).rename(columns = "{}_ShotOutcome_binary".format)], axis = 1)
game_events_m = pd.concat([game_events_m, pd.get_dummies(game_events_m.assist_method).rename(columns = "{}_AssistMethod_binary".format)], axis = 1)
game_events_m = pd.concat([game_events_m, pd.get_dummies(game_events_m.situation).rename(columns = "{}_Situation_binary".format)], axis = 1)

In [10]:
# Drop columns: "text","event_team","opponent","is_goal","player_in","player_out", "shot_place","location","bodypart", "player","player2"
game_events_m.drop(["text","event_team","opponent","is_goal","player_in","player_out", "shot_place","location","bodypart", "player","player2","event_type","event_type2","shot_outcome","assist_method","situation"], axis=1, inplace=True)

In [22]:
# Target: Event data - game_events_m
#Convert nominal column into columns of binary variables
#Columns: league, country
event_columns = game_events_m.columns.tolist()
event_columns.remove('id_odsp')
event_columns.remove('id_event')
event_columns.remove('sort_order')

In [19]:
# Target: Dev Event data - dev_event_dt
# Get game ids from dev_data, keep only first 100 events in dev event data, keep only dev games ID in dev_event_dt
dev_games = dev_data.index.tolist()
dev_event_dt = game_events_m[game_events_m['sort_order'] < 101]
# dev_event_dt = dev_event_dt[dev_event_dt.columns.intersection(event_columns)]
dev_event_dt=dev_event_dt[dev_event_dt['id_odsp'].isin(dev_games)]

In [23]:
# Target: Dev Event data - dev_event_dt
# Get game ids from dev_data, keep only first 100 events in dev event data, keep only dev games ID in dev_event_dt
for i in range(1,101):
    for column in event_columns:
        dev_data[str(i)+"-"+column] = ""
for index, row in dev_event_dt.iterrows(): 
    order = str(row['sort_order'])
    event_id = row['id_odsp']
    for column1 in event_columns:
        column_name = order+"-"+column1
        if str(row[column1]) != "":
            dev_data.at[event_id, column_name] = row[column1]

  dev_data[str(i)+"-"+column] = ""


In [24]:
dev_data.columns

Index(['season', 'fthg', 'ftag', 'odd_h', 'odd_d', 'odd_a', 'class',
       'D1_league_binary', 'E0_league_binary', 'F1_league_binary',
       ...
       '100-OnTarget_ShotOutcome_binary', '100-Cross_AssistMethod_binary',
       '100-HeadedPass_AssistMethod_binary', '100-None_AssistMethod_binary',
       '100-Pass_AssistMethod_binary', '100-ThroughBall_AssistMethod_binary',
       '100-Corner_Situation_binary', '100-FreeKick_Situation_binary',
       '100-OpenPlay_Situation_binary', '100-SetPiece_Situation_binary'],
      dtype='object', length=3117)

In [None]:
# dev_data = dev_data[dev_data.columns.drop(list(dev_data.filter(regex='id')))]

In [25]:
dev_data.to_csv("dev_data_binarylist.csv", encoding = 'utf-8')

In [None]:
len(dev_data.columns.tolist())