In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Read the two raw inputs used throughout the notebook: the per-game table
# (lineups, teams, results) and the per-player attribute table.
games, player_attributes = map(
    pd.read_csv, ('game_lineup.csv', 'euro_player_test.csv'))

In [None]:
# Show how many games carry each distinct 'game_result' value.
games.loc[:, 'game_result'].value_counts()

In [None]:
# Remove four columns that are not used as features. Later cells address the
# remaining columns by position, so the set of columns kept here matters.
player_attributes = player_attributes.drop(
    ['player_fifa_api_id', 'birthday', 'height', 'weight'],
    axis='columns',
)

In [None]:
# Discard rows whose 'date' field holds one of two known non-date values;
# the season parsing further down splits 'date' on '/' and would choke on them.
player_attributes = player_attributes[
    ~player_attributes['date'].isin(['188152', '198717'])
]

In [None]:
# Build one season label ('YYYY/YYYY') per row of player_attributes, in row
# order. The date is split on '/', with field 0 compared against 6 and field 2
# supplying the year, so dates are presumably 'M/D/YYYY ...' -- TODO confirm.
# NOTE(review): months 1-6 map to the season *starting* in that year and
# months 7-12 to the season *ending* in it; confirm this is the intended
# convention.
season = []
for date in player_attributes['date']:
    d = date.split('/')
    # Parse the year for every row. Previously it was only parsed in the
    # month <= 6 branch, so the other branch silently reused the year of an
    # earlier row (or raised NameError when it came first).
    year = d[2].split(' ')[0]
    if int(d[0]) <= 6:
        season.append(year + '/' + str(int(year) + 1))
    else:
        season.append(str(int(year) - 1) + '/' + year)

In [None]:
# Overwrite the 'date' column with the season labels computed above.
# `season` is ordered by row position, while the frame's index still has gaps
# from the earlier row filtering, so the assignment must be positional: a
# label-based `.at[i, 'date']` would write to the wrong rows and append new
# rows for labels that no longer exist.
if len(season) != len(player_attributes):
    raise ValueError(
        'season has %d entries but player_attributes has %d rows; '
        're-run the season cell' % (len(season), len(player_attributes)))
player_attributes = player_attributes.assign(date=season)

In [None]:
# Keep only the first 180352 rows (positional slice).
# NOTE(review): the constant is not explained anywhere in this notebook --
# presumably it trims unwanted trailing rows; confirm against the CSV.
player_attributes = player_attributes.iloc[:180352]

In [None]:
# For every (season, player_name) pair keep the single row with the highest
# 'overall_rating', then order the result by player name.
#
# The index is reset on the frame itself first so that the labels returned by
# idxmax() refer to the same rows when passed to .loc. Previously idxmax() was
# computed on a reset copy (positions) but looked up by label in the frame
# that still had gaps in its index, selecting the wrong rows.
player_attributes = player_attributes.reset_index(drop=True)
best_rows = player_attributes.groupby(
    ['date', 'player_name'])['overall_rating'].idxmax()
player_attributes = player_attributes.loc[best_rows]
player_attributes = player_attributes.sort_values(by=['player_name'])

In [None]:
# Filled in by the next cell: player name -> {season -> feature list}.
player_to_attributes = {}

# The 22 lineup columns, away slots 1-11 followed by home slots 1-11.
lineup_name_columns = (
    ['away_player_%d_name' % slot for slot in range(1, 12)] +
    ['home_player_%d_name' % slot for slot in range(1, 12)]
)

# Every distinct player name that appears in any lineup, ordered by number of
# appearances (value_counts sorts descending). The columns are selected
# directly; dropping the other columns first was redundant.
participating_players = (
    games[lineup_name_columns].stack().value_counts().index.tolist()
)

In [None]:
# Lookup table that converts the raw strings found in the two work-rate
# columns into numeric scores. Besides 'high'/'medium'/'low' it lists every
# other raw value that needs a score (digits and word fragments such as
# 'ormal' or 'stoc' -- presumably dirty data in the CSV; TODO confirm).
work_rate = {'high': 90.0, 'medium': 75.0, 'low':60.0, 'None': 0.0, 'o':0.0, 'norm':75.0, 'ormal':75.0, '2':2.0,'1':10.0,
             'le':50.0, 'stoc':80.0, 'y':50.0, '0':0.0, '3':30.0, '4':40.0, '5':50.0, '6':60.0, '7':70.0, '8':80.0, '9':90.0,
             'ean':75.0, 'es':0.0,'tocky':80.0}

for player in participating_players:
    player_to_attributes[player] = {}
    # Filter the frame once per player; the rows were previously re-selected
    # from the full frame for every season of every player.
    all_seasons_attributes = player_attributes[player_attributes['player_name'] == player].values
    for row in all_seasons_attributes:
        # Column 0 is used as the season key. A distinct name is used so the
        # `season` list built in an earlier cell is not overwritten.
        season_key = row[0]
        # Drop column 0 (the season key), column 3 and the last column; what
        # remains is the feature vector. np.delete returns a new array, so
        # the source row is left untouched.
        attributes = np.delete(row, [0, 3, row.shape[0] - 1])
        # After the deletion, positions 2 and 3 hold the two work-rate strings.
        attributes[2] = work_rate[attributes[2]]
        attributes[3] = work_rate[attributes[3]]
        player_to_attributes[player][season_key] = list(attributes)

# Persist the mapping so later cells can reload it without recomputing.
with open('player_to_attributes.json', 'w') as f:
    json.dump(player_to_attributes, f)

In [None]:
# Reload the player -> season -> features mapping written by the previous
# cell, replacing the in-memory dict with its JSON round-tripped form.
with open('player_to_attributes.json', 'r') as f:
    player_to_attributes = json.loads(f.read())

In [None]:
# Per-game columns pulled out as plain arrays, all aligned by game row.
labels = games['game_result'].values
seasons = games['season'].values
home_team = games['home_team_name'].values
away_team = games['away_team_name'].values

# Every team that appears as home or away side. Sorted so that the integer id
# given to each team is the same on every run; iterating a bare set yields an
# arbitrary order, which made team_ids.json irreproducible.
teams = sorted(set(home_team) | set(away_team))

# team name -> integer id (its position in the sorted list), saved next to
# the dataset so consumers can decode the ids.
team_mapping = {team: team_id for team_id, team in enumerate(teams)}
with open('team_ids.json', 'w') as f:
    json.dump(team_mapping, f)

# What is left after dropping the non-player columns is the lineup matrix:
# one row per game, one player name per remaining column (in CSV order).
lineups = games.drop(['season', 'game_result', 'away_team_goal', 'country',
            'home_team_goal', 'away_team_name', 'home_team_name'], axis=1).values

In [None]:
# Display every game's result label as a plain list.
list(games.loc[:, 'game_result'].values)

In [None]:
import h5py

# One entry per game in every dataset below.
total_games = len(games['game_result'].values)

# Output file; opened in write mode, so any existing file is replaced.
h5file = h5py.File('dataset.hdf5', "w")

# Per game: a (22, 37) integer block, filled row by row from the lineup.
d_games = h5file.create_dataset("games", shape=(total_games, 22, 37), dtype='i')
# Per game: integer ids of the home and away team, and the result label.
d_home = h5file.create_dataset("home", shape=(total_games,), dtype='i')
d_away = h5file.create_dataset("away", shape=(total_games,), dtype='i')
d_labels = h5file.create_dataset("results", shape=(total_games,), dtype='i')

In [None]:
# Element-wise mean feature vector over every (lineup slot, known season)
# pair, i.e. players are weighted by how often they appear in lineups. It is
# the fallback for players that have no attribute data at all.
avg_player = np.zeros(37)
count = 0
for lineup in lineups[:total_games]:
    for player in lineup:
        for season_attributes in player_to_attributes[player].values():
            avg_player += season_attributes
            count += 1

# Rounded to whole numbers.
avg_player = np.around(avg_player / count)
avg_player

In [None]:
# Fill the HDF5 datasets game by game. Each game becomes a (22, 37) block,
# one feature row per lineup slot, chosen as follows:
#   1. the player's attributes for the game's own season, if available;
#   2. otherwise the mean of the player's attributes over all known seasons;
#   3. otherwise (no data for the player at all) the global avg_player vector.
for i in range(total_games):
    game = np.zeros((22, 37))
    for (j, player) in enumerate(lineups[i]):
        by_season = player_to_attributes[player]
        if seasons[i] in by_season:
            game[j] = by_season[seasons[i]]
        elif len(by_season) > 0:
            # Round the mean like avg_player is rounded: the target dataset
            # is integer-typed, so an unrounded mean would be truncated.
            game[j] = np.around(np.mean(list(by_season.values()), axis=0))
        else:
            game[j] = avg_player
    d_games[i, :, :] = game
    d_labels[i] = labels[i]
    d_home[i] = team_mapping[home_team[i]]
    d_away[i] = team_mapping[away_team[i]]
    # Lightweight progress report; the call form works on Python 2 and 3.
    if i % 1000 == 0:
        print(i)
h5file.close()

In [None]:
# Close the HDF5 file handle. The dataset-filling cell already calls close()
# after its loop; presumably this separate cell is kept so the file can be
# closed by hand if that loop stops before reaching its own close().
h5file.close()