In [3]:
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

import numpy as np
import pandas as pd
import sqlite3 as sq
import time
import math
import re
from sklearn import preprocessing

In [4]:
#just reading data
con = sq.connect("database.sqlite")
team_atts = pd.read_sql_query("SELECT * from Team_Attributes", con)
teams = pd.read_sql_query("SELECT * from Team", con)
matches = pd.read_sql_query("SELECT * from Match", con)
matches = matches[['date', 'home_team_goal', 'away_team_goal', 'home_team_api_id', 'away_team_api_id', 
                  'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession',
                  'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD',
                  'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD',
                  'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA']]

In [5]:
# print(team_atts.columns.values)

### Tasks TODO: 
### Drop columns that are redundant
### Drop rows that are missing too many values
### Replace text data such as "slow, medium, fast" with number mappings that make sense (e.g. 0, 1, 2) 
### Normalize all features to be in the range (0, 1): subtract the low value and divide by (high - low)

buildUpPlaySpeed = {'Slow': 0, 'Balanced': 1, 'Fast': 2}

st = time.time()

denom = len(matches.columns.values)


### Add blank columns for team attributes to be filled in for each match
for column in list(team_atts.columns.values):
    matches['__home_' + column] = np.nan
    
for column in list(team_atts.columns.values):
    matches['__away_' + column] = np.nan

### To assist in filling values later (note the underscores leading __underscoes added above & used here 
### so we don't collide with existing column names)
home_column_indexes = [matches.columns.get_loc('__home_' + col_name) for col_name in team_atts.columns.values]
away_column_indexes = [matches.columns.get_loc('__away_' + col_name) for col_name in team_atts.columns.values]
indexes_to_drop = []

## Part of experiments described below
n_15_none_match = 0
n_15_none_team_att = 0

for index, match in matches.iterrows():
    ### For each match, we find the home and away team for the correct year, and add their data to the 
    ### dataframe
    year = match['date'][:4]
    home_team_id = match['home_team_api_id']
    away_team_id = match['away_team_api_id']
    home_team_atts = team_atts.loc[team_atts['team_api_id'] == home_team_id]
    away_team_atts = team_atts.loc[team_atts['team_api_id'] == away_team_id]
    home_team_att = home_team_atts.loc[team_atts['date'].str.contains(year)]
    away_team_att = away_team_atts.loc[team_atts['date'].str.contains(year)]
    
    
    ### This is just an experiment to determine a threshold for how many values should be 'None'
    ### in match data in order for us to drop a row. To drop a row, add its index to "indexes_to_drop"
    ### if too many values are 'None'
    pct_match_none = sum(1 for val in match.values if val is None) / denom
    if pct_match_none > 0.15:
        n_15_none_match += 1

    if not home_team_att.empty and not away_team_att.empty:
        matches.iloc[index, home_column_indexes] = home_team_att.values[0]
        matches.iloc[index, away_column_indexes] = away_team_att.values[0]
        
        
        ### This is just an experiment to determine a threshold for how many values should be 'None'
        ### in team attribute data in order for us to drop a row. To drop a row, add its index to "indexes_to_drop"
        ### if too many values are 'None'
        pct_home_none = sum(1 for val in home_team_att.values[0] if val is None) / len(home_team_att.values)
        pct_away_none = sum(1 for val in away_team_att.values[0] if val is None) / len(away_team_att.values)
        if pct_home_none > 0.15 or pct_away_none > 0.3:
            n_15_none_team_att += 1
        
    else:
        indexes_to_drop.append(index)

### Part of our experiments
n_rows = index
print('total input rows:', n_rows)
print('num lacking any team attribute data:', len(indexes_to_drop))
print('num where >15% of team data is None:', n_15_none_match)
print('num where >15% of team attribute data is None:', n_15_none_team_att)

matches = matches.drop(indexes_to_drop, axis=0) ### Drops rows that lack too much data

print('Took {0:.2f} seconds.'.format(time.time() - st))
matches

total input rows: 25978
num lacking any team attribute data: 7736
num where >15% of team data is None: 11762
num where >15% of team attribute data is None: 0
Took 433.88 seconds.


Unnamed: 0,date,home_team_goal,away_team_goal,home_team_api_id,away_team_api_id,goal,shoton,shotoff,foulcommit,card,...,__away_chanceCreationShooting,__away_chanceCreationShootingClass,__away_chanceCreationPositioningClass,__away_defencePressure,__away_defencePressureClass,__away_defenceAggression,__away_defenceAggressionClass,__away_defenceTeamWidth,__away_defenceTeamWidthClass,__away_defenceDefenderLineClass
376,2010-02-03 00:00:00,3,2,8635,8342,,,,,,...,45.0,Normal,Organised,60.0,Medium,70.0,Double,70.0,Wide,Cover
378,2010-02-04 00:00:00,2,3,9986,9985,,,,,,...,65.0,Normal,Organised,70.0,High,70.0,Double,65.0,Normal,Cover
381,2010-02-02 00:00:00,1,0,8203,9993,,,,,,...,50.0,Normal,Organised,70.0,High,70.0,Double,70.0,Wide,Cover
404,2010-01-30 00:00:00,2,1,8342,10001,,,,,,...,55.0,Normal,Organised,70.0,High,70.0,Double,70.0,Wide,Cover
405,2010-01-17 00:00:00,0,4,9985,8635,,,,,,...,60.0,Normal,Organised,70.0,High,50.0,Press,70.0,Wide,Cover
406,2010-01-16 00:00:00,2,1,9997,9994,,,,,,...,50.0,Normal,Organised,65.0,Medium,65.0,Press,70.0,Wide,Cover
407,2010-01-16 00:00:00,1,3,9986,9987,,,,,,...,60.0,Normal,Organised,70.0,High,65.0,Press,70.0,Wide,Cover
408,2010-01-15 00:00:00,2,2,9991,8571,,,,,,...,50.0,Normal,Organised,60.0,Medium,60.0,Press,65.0,Normal,Cover
409,2010-01-16 00:00:00,1,2,9999,8203,,,,,,...,50.0,Normal,Organised,60.0,Medium,70.0,Double,60.0,Normal,Cover
410,2010-01-17 00:00:00,1,4,9993,9984,,,,,,...,55.0,Normal,Organised,65.0,Medium,60.0,Press,70.0,Wide,Cover


In [6]:
matches.to_csv('data_step_1.csv')

In [7]:
matches

Unnamed: 0,date,home_team_goal,away_team_goal,home_team_api_id,away_team_api_id,goal,shoton,shotoff,foulcommit,card,...,__away_chanceCreationShooting,__away_chanceCreationShootingClass,__away_chanceCreationPositioningClass,__away_defencePressure,__away_defencePressureClass,__away_defenceAggression,__away_defenceAggressionClass,__away_defenceTeamWidth,__away_defenceTeamWidthClass,__away_defenceDefenderLineClass
376,2010-02-03 00:00:00,3,2,8635,8342,,,,,,...,45.0,Normal,Organised,60.0,Medium,70.0,Double,70.0,Wide,Cover
378,2010-02-04 00:00:00,2,3,9986,9985,,,,,,...,65.0,Normal,Organised,70.0,High,70.0,Double,65.0,Normal,Cover
381,2010-02-02 00:00:00,1,0,8203,9993,,,,,,...,50.0,Normal,Organised,70.0,High,70.0,Double,70.0,Wide,Cover
404,2010-01-30 00:00:00,2,1,8342,10001,,,,,,...,55.0,Normal,Organised,70.0,High,70.0,Double,70.0,Wide,Cover
405,2010-01-17 00:00:00,0,4,9985,8635,,,,,,...,60.0,Normal,Organised,70.0,High,50.0,Press,70.0,Wide,Cover
406,2010-01-16 00:00:00,2,1,9997,9994,,,,,,...,50.0,Normal,Organised,65.0,Medium,65.0,Press,70.0,Wide,Cover
407,2010-01-16 00:00:00,1,3,9986,9987,,,,,,...,60.0,Normal,Organised,70.0,High,65.0,Press,70.0,Wide,Cover
408,2010-01-15 00:00:00,2,2,9991,8571,,,,,,...,50.0,Normal,Organised,60.0,Medium,60.0,Press,65.0,Normal,Cover
409,2010-01-16 00:00:00,1,2,9999,8203,,,,,,...,50.0,Normal,Organised,60.0,Medium,70.0,Double,60.0,Normal,Cover
410,2010-01-17 00:00:00,1,4,9993,9984,,,,,,...,55.0,Normal,Organised,65.0,Medium,60.0,Press,70.0,Wide,Cover


In [8]:
print(matches.columns.values, len(matches.columns.values))

#drop first 13 of matches:
matches = matches.drop(['date', 'home_team_goal', 'away_team_goal' ,'home_team_api_id',
 'away_team_api_id', 'goal', 'shoton' ,'shotoff', 'foulcommit', 'card', 'cross',
 'corner', 'possession'], axis=1)


['date' 'home_team_goal' 'away_team_goal' 'home_team_api_id'
 'away_team_api_id' 'goal' 'shoton' 'shotoff' 'foulcommit' 'card' 'cross'
 'corner' 'possession' 'B365H' 'B365D' 'B365A' 'BWH' 'BWD' 'BWA' 'IWH'
 'IWD' 'IWA' 'LBH' 'LBD' 'LBA' 'PSH' 'PSD' 'PSA' 'WHH' 'WHD' 'WHA' 'SJH'
 'SJD' 'SJA' 'VCH' 'VCD' 'VCA' 'GBH' 'GBD' 'GBA' 'BSH' 'BSD' 'BSA'
 '__home_id' '__home_team_fifa_api_id' '__home_team_api_id' '__home_date'
 '__home_buildUpPlaySpeed' '__home_buildUpPlaySpeedClass'
 '__home_buildUpPlayDribbling' '__home_buildUpPlayDribblingClass'
 '__home_buildUpPlayPassing' '__home_buildUpPlayPassingClass'
 '__home_buildUpPlayPositioningClass' '__home_chanceCreationPassing'
 '__home_chanceCreationPassingClass' '__home_chanceCreationCrossing'
 '__home_chanceCreationCrossingClass' '__home_chanceCreationShooting'
 '__home_chanceCreationShootingClass'
 '__home_chanceCreationPositioningClass' '__home_defencePressure'
 '__home_defencePressureClass' '__home_defenceAggression'
 '__home_defenceAggressi

In [9]:
print(matches.columns.values, len(matches.columns.values))
print(matches)

            

['B365H' 'B365D' 'B365A' 'BWH' 'BWD' 'BWA' 'IWH' 'IWD' 'IWA' 'LBH' 'LBD'
 'LBA' 'PSH' 'PSD' 'PSA' 'WHH' 'WHD' 'WHA' 'SJH' 'SJD' 'SJA' 'VCH' 'VCD'
 'VCA' 'GBH' 'GBD' 'GBA' 'BSH' 'BSD' 'BSA' '__home_id'
 '__home_team_fifa_api_id' '__home_team_api_id' '__home_date'
 '__home_buildUpPlaySpeed' '__home_buildUpPlaySpeedClass'
 '__home_buildUpPlayDribbling' '__home_buildUpPlayDribblingClass'
 '__home_buildUpPlayPassing' '__home_buildUpPlayPassingClass'
 '__home_buildUpPlayPositioningClass' '__home_chanceCreationPassing'
 '__home_chanceCreationPassingClass' '__home_chanceCreationCrossing'
 '__home_chanceCreationCrossingClass' '__home_chanceCreationShooting'
 '__home_chanceCreationShootingClass'
 '__home_chanceCreationPositioningClass' '__home_defencePressure'
 '__home_defencePressureClass' '__home_defenceAggression'
 '__home_defenceAggressionClass' '__home_defenceTeamWidth'
 '__home_defenceTeamWidthClass' '__home_defenceDefenderLineClass'
 '__away_id' '__away_team_fifa_api_id' '__away_team_api_

In [10]:
matches

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,...,__away_chanceCreationShooting,__away_chanceCreationShootingClass,__away_chanceCreationPositioningClass,__away_defencePressure,__away_defencePressureClass,__away_defenceAggression,__away_defenceAggressionClass,__away_defenceTeamWidth,__away_defenceTeamWidthClass,__away_defenceDefenderLineClass
376,1.73,3.60,4.75,1.75,3.40,4.25,1.75,3.3,3.80,1.66,...,45.0,Normal,Organised,60.0,Medium,70.0,Double,70.0,Wide,Cover
378,3.50,3.30,2.10,3.45,3.20,2.00,3.40,3.2,1.90,3.20,...,65.0,Normal,Organised,70.0,High,70.0,Double,65.0,Normal,Cover
381,2.40,3.25,2.90,2.35,3.15,2.80,2.20,3.1,2.80,2.30,...,50.0,Normal,Organised,70.0,High,70.0,Double,70.0,Wide,Cover
404,1.44,4.50,7.00,1.35,4.40,7.50,1.40,3.9,6.00,1.33,...,55.0,Normal,Organised,70.0,High,70.0,Double,70.0,Wide,Cover
405,3.00,3.25,2.38,2.75,3.10,2.40,2.45,3.1,2.45,2.60,...,60.0,Normal,Organised,70.0,High,50.0,Press,70.0,Wide,Cover
406,2.15,3.25,3.40,1.95,3.30,3.30,2.10,3.2,2.90,2.00,...,50.0,Normal,Organised,65.0,Medium,65.0,Press,70.0,Wide,Cover
407,2.80,3.25,2.50,2.40,3.30,2.65,2.40,3.2,2.40,2.50,...,60.0,Normal,Organised,70.0,High,65.0,Press,70.0,Wide,Cover
408,1.50,4.00,7.00,1.40,3.90,7.50,1.50,3.7,5.00,1.40,...,50.0,Normal,Organised,60.0,Medium,60.0,Press,65.0,Normal,Cover
409,2.80,3.25,2.50,2.80,3.15,2.35,2.40,3.2,2.40,2.50,...,50.0,Normal,Organised,60.0,Medium,70.0,Double,60.0,Normal,Cover
410,1.70,3.60,5.00,1.65,3.40,4.90,1.70,3.4,3.80,1.66,...,55.0,Normal,Organised,65.0,Medium,60.0,Press,70.0,Wide,Cover


In [11]:
#Enumerate the columns if they have string values
newCol = {}
for col in matches.columns.values:
    if re.search('Class', col):
            #print(col, matches[col])
            #enum_dict = dict(enumerate(list(set(matches[col]))))
            enum_dict = { k: v for v, k in dict(enumerate(list(set(matches[col])))).items()}
            #print(col, enum_dict)
            #print(matches[col])
            newCol[col] = matches[col].map(enum_dict)
#print(newCol['__home_buildUpPlaySpeedClass'])
for colName in newCol.keys():
    matches[colName] = newCol[colName]
matches.to_csv('data_enumerated.csv')
matches


Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,...,__away_chanceCreationShooting,__away_chanceCreationShootingClass,__away_chanceCreationPositioningClass,__away_defencePressure,__away_defencePressureClass,__away_defenceAggression,__away_defenceAggressionClass,__away_defenceTeamWidth,__away_defenceTeamWidthClass,__away_defenceDefenderLineClass
376,1.73,3.60,4.75,1.75,3.40,4.25,1.75,3.3,3.80,1.66,...,45.0,1,0,60.0,0,70.0,1,70.0,2,1
378,3.50,3.30,2.10,3.45,3.20,2.00,3.40,3.2,1.90,3.20,...,65.0,1,0,70.0,1,70.0,1,65.0,1,1
381,2.40,3.25,2.90,2.35,3.15,2.80,2.20,3.1,2.80,2.30,...,50.0,1,0,70.0,1,70.0,1,70.0,2,1
404,1.44,4.50,7.00,1.35,4.40,7.50,1.40,3.9,6.00,1.33,...,55.0,1,0,70.0,1,70.0,1,70.0,2,1
405,3.00,3.25,2.38,2.75,3.10,2.40,2.45,3.1,2.45,2.60,...,60.0,1,0,70.0,1,50.0,2,70.0,2,1
406,2.15,3.25,3.40,1.95,3.30,3.30,2.10,3.2,2.90,2.00,...,50.0,1,0,65.0,0,65.0,2,70.0,2,1
407,2.80,3.25,2.50,2.40,3.30,2.65,2.40,3.2,2.40,2.50,...,60.0,1,0,70.0,1,65.0,2,70.0,2,1
408,1.50,4.00,7.00,1.40,3.90,7.50,1.50,3.7,5.00,1.40,...,50.0,1,0,60.0,0,60.0,2,65.0,1,1
409,2.80,3.25,2.50,2.80,3.15,2.35,2.40,3.2,2.40,2.50,...,50.0,1,0,60.0,0,70.0,1,60.0,1,1
410,1.70,3.60,5.00,1.65,3.40,4.90,1.70,3.4,3.80,1.66,...,55.0,1,0,65.0,0,60.0,2,70.0,2,1


In [12]:
# __home_buildUpPlaySpeedClass {'Balanced': 0, 'Slow': 1, 'Fast': 2}
# __home_buildUpPlayDribblingClass {'Normal': 0, 'Little': 1, 'Lots': 2}
# __home_buildUpPlayPassingClass {'Mixed': 0, 'Long': 1, 'Short': 2}
# __home_buildUpPlayPositioningClass {'Organised': 0, 'Free Form': 1}
# __home_chanceCreationPassingClass {'Risky': 0, 'Normal': 1, 'Safe': 2}
# __home_chanceCreationCrossingClass {'Normal': 0, 'Little': 1, 'Lots': 2}
# __home_chanceCreationShootingClass {'Normal': 0, 'Little': 1, 'Lots': 2}
# __home_chanceCreationPositioningClass {'Organised': 0, 'Free Form': 1}
# __home_defencePressureClass {'Deep': 0, 'Medium': 1, 'High': 2}
# __home_defenceAggressionClass {'Double': 0, 'Contain': 1, 'Press': 2}
# __home_defenceTeamWidthClass {'Normal': 0, 'Wide': 1, 'Narrow': 2}
# __home_defenceDefenderLineClass {'Offside Trap': 0, 'Cover': 1}
# __away_buildUpPlaySpeedClass {'Balanced': 0, 'Slow': 1, 'Fast': 2}
# __away_buildUpPlayDribblingClass {'Normal': 0, 'Little': 1, 'Lots': 2}
# __away_buildUpPlayPassingClass {'Mixed': 0, 'Long': 1, 'Short': 2}
# __away_buildUpPlayPositioningClass {'Organised': 0, 'Free Form': 1}
# __away_chanceCreationPassingClass {'Normal': 0, 'Risky': 1, 'Safe': 2}
# __away_chanceCreationCrossingClass {'Normal': 0, 'Little': 1, 'Lots': 2}
# __away_chanceCreationShootingClass {'Normal': 0, 'Little': 1, 'Lots': 2}
# __away_chanceCreationPositioningClass {'Organised': 0, 'Free Form': 1}
# __away_defencePressureClass {'Deep': 0, 'Medium': 1, 'High': 2}
# __away_defenceAggressionClass {'Double': 0, 'Contain': 1, 'Press': 2}
# __away_defenceTeamWidthClass {'Normal': 0, 'Wide': 1, 'Narrow': 2}
# __away_defenceDefenderLineClass {'Offside Trap': 0, 'Cover': 1}

In [13]:
# Get rid of cols missing betting odds
to_remove = []
no_missing = matches.columns.values[:30]
for index, match in matches.iterrows():
    for col in no_missing: 
        if pd.isnull(match[col]):
            to_remove.append(index)
matches = matches.drop(to_remove, axis=0)
    

In [14]:
new_to_remove = []
for col in matches.columns.values:
    if re.search('date', col):
        new_to_remove.append(col)
matches = matches.drop(new_to_remove, axis=1)

In [15]:
# #TODO: Normalize columns
# ###################
# for 
#     x = df[['score']].values.astype(float)

#     # Create a minimum and maximum processor object
#     min_max_scaler = preprocessing.MinMaxScaler()

#     # Create an object to transform the data to fit minmax processor
#     x_scaled = min_max_scaler.fit_transform(x)

#     # Run the normalizer on the dataframe
#     df_normalized = pd.DataFrame(x_scaled)

In [16]:
matches.to_csv("full_betting_odds.csv")

In [17]:
#fill in missing data with na with -1
#CHANGE LATER TO BE MORE ROBUST
matches = matches.fillna(-1)
matches.to_csv('negative_one_fill.csv')
matches

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,...,__away_chanceCreationShooting,__away_chanceCreationShootingClass,__away_chanceCreationPositioningClass,__away_defencePressure,__away_defencePressureClass,__away_defenceAggression,__away_defenceAggressionClass,__away_defenceTeamWidth,__away_defenceTeamWidthClass,__away_defenceDefenderLineClass
997,2.38,3.25,3.00,2.45,3.30,2.70,2.30,3.10,2.60,2.38,...,50.0,1,0,45.0,0,52.0,2,52.0,1,1
998,1.80,3.60,4.33,1.75,3.70,4.20,1.80,3.20,3.70,1.73,...,54.0,1,0,43.0,0,44.0,2,50.0,1,1
999,1.65,3.80,5.00,1.67,3.60,5.00,1.65,3.30,4.40,1.67,...,50.0,1,0,44.0,0,55.0,2,53.0,1,1
1003,1.67,3.75,5.00,1.65,3.75,4.75,1.70,3.30,4.00,1.67,...,42.0,1,0,59.0,0,47.0,2,62.0,1,1
1004,1.73,3.60,4.75,1.75,3.60,4.00,1.70,3.40,4.20,1.73,...,56.0,1,0,47.0,0,45.0,2,55.0,1,1
1005,3.10,3.40,2.25,3.00,3.50,2.40,2.70,3.20,2.30,2.88,...,47.0,1,0,45.0,0,43.0,2,52.0,1,1
1006,3.20,3.40,2.20,2.95,3.50,2.10,3.10,3.20,2.10,2.88,...,47.0,1,0,48.0,0,45.0,2,52.0,1,1
1008,1.75,3.60,4.60,1.75,3.40,4.20,1.80,3.30,3.80,1.80,...,42.0,1,0,59.0,0,47.0,2,62.0,1,1
1009,1.67,3.60,5.50,1.70,3.50,4.40,1.70,3.40,4.20,1.67,...,61.0,1,0,45.0,0,49.0,2,65.0,1,1
1010,2.10,3.40,3.40,2.05,3.20,3.30,2.00,3.20,3.30,2.00,...,50.0,1,0,46.0,0,45.0,2,52.0,1,1


In [18]:
#get labels
new_to_remove = []
og = pd.read_sql_query("SELECT * from Match", con)
index = range(0,og.shape[0]) # number rows
columns = ['Home', 'Draw', 'Away']
labels =  pd.DataFrame(index=index, columns=columns)
print(index, matches.index.values)
for index, match in og.iterrows():
    if index in matches.index.values:
        if match['home_team_goal'] > match['away_team_goal']:
            labels.at[index, 'Home'] = 1
        elif match['home_team_goal'] == match['away_team_goal']:
            labels.at[index, 'Draw'] = 1
        else:
            labels.at[index, 'Away'] = 1
    else:
        new_to_remove.append(index)
labels = labels.drop(new_to_remove, axis=0)
labels = labels.fillna(0)
print(labels, labels.shape[0] == matches.shape[0])
labels.to_csv('labels.csv')

range(0, 25979) [  997   998   999 ... 23414 23415 23416]
       Home  Draw  Away
997       0     0     1
998       1     0     0
999       0     0     1
1003      0     1     0
1004      0     1     0
1005      1     0     0
1006      0     1     0
1008      1     0     0
1009      0     1     0
1010      0     0     1
1013      1     0     0
1014      0     0     1
1015      0     0     1
1016      1     0     0
1018      1     0     0
1019      0     0     1
1020      0     0     1
1021      1     0     0
1022      1     0     0
1023      0     0     1
1025      0     1     0
1026      1     0     0
1028      1     0     0
1029      0     0     1
1031      1     0     0
1033      1     0     0
1034      1     0     0
1035      1     0     0
1036      0     0     1
1037      1     0     0
...     ...   ...   ...
23387     0     1     0
23388     1     0     0
23389     1     0     0
23390     1     0     0
23391     1     0     0
23392     0     0     1
23393     0     0     1
23394 

In [19]:
#shuffle match rows so split tables are randomized
# matches = matches.reindex(np.random.permutation(matches.index))

matches.to_csv('cleaned_data.csv')
#split match data into training, validation, and test sets
# m_train = matches.iloc[:17861]
# m_valid = matches.iloc[17861:21108]
# m_test = matches.iloc[21108:]