# Load Data

In [1]:
import pandas as pd
import numpy as np

# Direct-download URLs of the two source CSV files.
# NOTE(review): the `dl=1` query flag presumably asks Dropbox for the raw file rather than a preview page — confirm the links are still live.
gps_link = 'https://www.dropbox.com/s/n7pvlxy60qwyy91/gps.csv?dl=1'
wellness_link = 'https://www.dropbox.com/s/170bc3dimgn8ru8/wellness.csv?dl=1'

# read_csv is given the URLs directly, so this cell needs network access on every run.
gps_data = pd.read_csv(gps_link)
wellness_data = pd.read_csv(wellness_link)

# Clean Wellness Data

In [2]:
# Preview the first rows of the wellness table as loaded, before any cleaning below.
wellness_data.head()

Unnamed: 0,Date,PlayerID,Fatigue,Soreness,Desire,Irritability,BedTime,WakeTime,SleepHours,SleepQuality,MonitoringScore,Pain,Illness,Menstruation,Nutrition,NutritionAdjustment,USGMeasurement,USG,TrainingReadiness
0,2018-07-21,1,3,3,2,3,23:00:00,07:00:00,8.0,2,13,No,No,Yes,Excellent,Yes,No,,0%
1,2018-07-21,2,4,3,4,4,23:00:00,07:00:00,8.0,4,19,Yes,No,Yes,,,Yes,1.01,0%
2,2018-07-21,3,3,3,5,4,22:30:00,06:30:00,8.0,4,19,No,No,No,,,Yes,1.016,100%
3,2018-07-21,4,2,3,5,4,00:30:00,07:00:00,6.5,1,15,No,No,Yes,Excellent,Yes,Yes,1.025,95%
4,2018-07-21,5,5,3,4,4,23:45:00,07:00:00,7.25,4,20,No,No,No,Okay,Yes,Yes,1.022,100%


In [3]:
# converting bed time to a metric (the earlier the better)
def bedTimeConverter(cur):
    """Turn an 'HH:MM:SS' bedtime string into a signed quarter-hour score.

    The score counts 15-minute units relative to midnight:

    * hours 13-23 give a positive value (quarter-hours *before* midnight),
      e.g. '23:00:00' -> 4;
    * hours 0-12 give a zero or negative value (quarter-hours *after*
      midnight), e.g. '00:30:00' -> -2. Note that hour 12 falls into this
      branch, so '12:00:00' -> -48.

    Only characters 0-1 (hour) and 3-4 (minute) of the string are read; the
    seconds are ignored. The fractional result is truncated toward zero by
    int(), so minutes that are not a multiple of 15 lose their remainder.

    Parameters
    ----------
    cur : str
        Bedtime formatted as 'HH:MM:SS'.

    Returns
    -------
    int
        Quarter-hours before (+) or after (-) midnight.
    """
    hh = int(cur[:2])
    quarter_hours_past = int(cur[3:5]) / 15
    if hh > 12:
        # evening bedtime: whole quarter-hours left until midnight
        score = 4 * (24 - hh) - quarter_hours_past
    elif hh >= 0:
        # bedtime at or after midnight: penalised with a negative score
        score = -4 * hh - quarter_hours_past
    return int(score)

# convert bed time
# Replaces the 'HH:MM:SS' strings in BedTime with the integer score from bedTimeConverter.
# NOTE: not idempotent — bedTimeConverter slices its argument, so re-running this cell on the
# already-converted integer column raises a TypeError; reload wellness_data first.
wellness_data['BedTime'] = wellness_data['BedTime'].apply(bedTimeConverter)

In [15]:
# Spot check: 23:00 is one hour, i.e. four quarter-hours, before midnight.
bedTimeConverter('23:00:00')

4

In [16]:
# Spot check: 20:00 is four hours, i.e. sixteen quarter-hours, before midnight.
bedTimeConverter('20:00:00')

16

In [4]:
# number of missing (NaN) entries in each column, before imputation
wellness_data.isna().sum()

Date                      0
PlayerID                  0
Fatigue                   0
Soreness                  0
Desire                    0
Irritability              0
BedTime                   0
WakeTime                  0
SleepHours                0
SleepQuality              0
MonitoringScore           0
Pain                      0
Illness                   0
Menstruation             16
Nutrition               837
NutritionAdjustment     745
USGMeasurement          168
USG                    4382
TrainingReadiness         0
dtype: int64

In [5]:
# Encode the categorical wellness answers as numbers and impute missing values.
# NOTE: run this cell once per fresh load of wellness_data. Re-running it maps the
# already-encoded values to NaN and the final drop raises a KeyError.

# Shared encoding for plain Yes/No answers
YES_NO = {'Yes': 1, 'No': 0}

# Pain: Yes/No -> 1/0
wellness_data['Pain'] = wellness_data['Pain'].map(YES_NO)
# Illness: ordinal scale, No (0) < Slightly Off (1) < Yes (2)
wellness_data['Illness'] = wellness_data['Illness'].map({'Yes': 2, 'Slightly Off': 1, 'No': 0})

# NutritionAdjustment: Yes/No -> 1/0
# if missing -> 'No' # 745 missing
wellness_data['NutritionAdjustment'] = wellness_data['NutritionAdjustment'].map(YES_NO).fillna(0)

# Menstruation: Yes/No -> 1/0
# if missing -> 'No' # 16 missing
wellness_data['Menstruation'] = wellness_data['Menstruation'].map(YES_NO).fillna(0)

# Nutrition: ordinal scale, Poor (0) < Okay (1) < Excellent (2)
# if missing -> 'Okay' # 837 missing
wellness_data['Nutrition'] = (
    wellness_data['Nutrition']
    .map({'Excellent': 2, 'Okay': 1, 'Poor': 0})
    .fillna(1)
)

# USG - above 1.025 indicates mild dehydration
# if missing -> not dehydrated # 4382 missing
# A NaN compares as False against the threshold, so unmeasured rows become 0
# directly and no separate fillna is needed.
wellness_data['USG'] = (wellness_data['USG'] > 1.025).astype(int)

# TrainingReadiness - turns percentage string ('95%') into a decimal (0.95)
# rstrip('%') only removes a trailing percent sign, so a value without one keeps all its digits.
wellness_data['TrainingReadiness'] = (
    wellness_data['TrainingReadiness'].str.rstrip('%').astype(int) / 100
)

# drop columns that are irrelevant
# WakeTime is captured in the BedTime and SleepHours
# MonitoringScore is the sum of 5 other scores
# USGMeasurement indicates if USG is measured
wellness_data = wellness_data.drop(columns=['WakeTime', 'MonitoringScore', 'USGMeasurement'])





In [6]:
# number of missing (NaN) entries in each column, after encoding and imputation
wellness_data.isna().sum()

Date                   0
PlayerID               0
Fatigue                0
Soreness               0
Desire                 0
Irritability           0
BedTime                0
SleepHours             0
SleepQuality           0
Pain                   0
Illness                0
Menstruation           0
Nutrition              0
NutritionAdjustment    0
USG                    0
TrainingReadiness      0
dtype: int64

In [7]:
# Preview the first rows after encoding, imputation and column drops.
wellness_data.head()

Unnamed: 0,Date,PlayerID,Fatigue,Soreness,Desire,Irritability,BedTime,SleepHours,SleepQuality,Pain,Illness,Menstruation,Nutrition,NutritionAdjustment,USG,TrainingReadiness
0,2018-07-21,1,3,3,2,3,4,8.0,2,0,0,1.0,2.0,1.0,0,0.0
1,2018-07-21,2,4,3,4,4,4,8.0,4,1,0,1.0,1.0,0.0,0,0.0
2,2018-07-21,3,3,3,5,4,6,8.0,4,0,0,0.0,1.0,0.0,0,1.0
3,2018-07-21,4,2,3,5,4,-2,6.5,1,0,0,1.0,2.0,1.0,0,0.95
4,2018-07-21,5,5,3,4,4,1,7.25,4,0,0,0.0,1.0,1.0,0,1.0


In [21]:
# save adjusted wellness data
# NOTE(review): `dataPath` is not defined in any cell shown here; it must be set before this runs
# and must end with a path separator, because the file name is appended by plain string concatenation.
# to_csv writes the row index as an extra unnamed first column by default.
wellness_data.to_csv(dataPath + 'wellness_adj.csv')

# Extract performance metrics from GPS data

In [55]:
# Preview the first rows of the raw GPS table (one row per FrameID per player).
gps_data.head()

Unnamed: 0,GameID,Half,PlayerID,FrameID,Time,GameClock,Speed,AccelImpulse,AccelLoad,AccelX,AccelY,AccelZ,Longitude,Latitude
0,1,1,2,1,00:22:01,00:00:00,0.658334,0.611112,0.00533,0.1325,0.69875,0.565,55.466666,24.994873
1,1,1,2,2,00:22:01,00:00:00,0.594445,0.638889,0.006568,0.11125,0.92,0.70625,55.466666,24.994874
2,1,1,2,3,00:22:01,00:00:00,0.363889,2.305557,0.003114,0.01375,0.77,0.6775,55.466667,24.994874
3,1,1,2,4,00:22:01,00:00:00,0.444445,0.805556,0.002602,0.00625,0.88625,0.595,55.466667,24.994874
4,1,1,2,5,00:22:01,00:00:00,0.4,0.444445,0.003813,-0.0175,0.8575,0.57375,55.466668,24.994874


In [56]:
# Peak acceleration impulse and peak acceleration load for every (game, player) pair.
# The grouping is built once and reused; sort=False keeps groups in order of first appearance.
per_game_player = gps_data.groupby(['GameID', 'PlayerID'], sort=False)
max_impulse = per_game_player['AccelImpulse'].max()
max_load = per_game_player['AccelLoad'].max()

In [57]:
# Max of 5-second window (50 frames) moving average of speed
# NOTE(review): 50 frames == 5 seconds assumes 10 Hz GPS sampling — confirm against the data source.
# The rolling mean is computed within each half separately, so a window never averages
# the last frames of one half together with the first frames of the next.
# Rows are assumed to already be in frame order within each group — TODO confirm.
ma_series = (
    gps_data
    .groupby(['GameID', 'Half', 'PlayerID'])['Speed']
    .rolling(window=50)
    .mean()
)
# Collapse to one value per player per game: the highest moving average seen in either half.
# The first 49 frames of each group are NaN (incomplete window) and are skipped by max().
max_ma = ma_series.groupby(['GameID', 'PlayerID']).max()

In [79]:
# Put the three per-(GameID, PlayerID) series side by side. concat(axis=1) aligns them on their
# shared index, and reset_index turns GameID / PlayerID back into ordinary columns.
# Each column keeps its source series' name, so 'Speed' here holds max_ma
# (the maximum moving-average speed), not a raw speed reading.
performance_data = pd.concat([max_impulse, max_load, max_ma], axis=1).reset_index()
performance_data.head()

Unnamed: 0,GameID,PlayerID,AccelImpulse,AccelLoad,Speed
0,1,2,5.972227,0.668839,7.128783
1,1,3,5.972227,1.072972,5.721449
2,1,4,5.944449,0.488534,6.553116
3,1,6,5.972227,0.426342,5.953394
4,1,7,5.972227,0.70717,6.664561


In [80]:
# Load the per-game metadata table; only GameID and Date are used further down.
# NOTE(review): `dataPath` is not defined in any cell shown here; it must be set before this runs
# and must end with a path separator, because the file name is appended by plain string concatenation.
games_data = pd.read_csv(dataPath + 'games.csv')
games_data.head()

Unnamed: 0,GameID,Date,Tournament,TournamentGame,Team,Opponent,Outcome,TeamPoints,TeamPointsAllowed
0,1,2017-11-30,Dubai,1,Canada,Spain,W,19,0
1,2,2017-11-30,Dubai,2,Canada,Ireland,W,31,0
2,3,2017-11-30,Dubai,3,Canada,Fiji,W,31,14
3,4,2017-12-01,Dubai,4,Canada,France,W,24,19
4,5,2017-12-01,Dubai,5,Canada,Australia,L,7,25


In [81]:
# Attach each game's date to the performance rows, matching on GameID.
# Inner join: a performance row whose GameID is absent from games_data is dropped.
# NOTE: run once — re-running merges Date a second time and yields Date_x / Date_y columns.
game_dates = games_data[['GameID', 'Date']]
performance_data = pd.merge(performance_data, game_dates, on='GameID', how='inner')
performance_data.head()

Unnamed: 0,GameID,PlayerID,AccelImpulse,AccelLoad,Speed,Date
0,1,2,5.972227,0.668839,7.128783,2017-11-30
1,1,3,5.972227,1.072972,5.721449,2017-11-30
2,1,4,5.944449,0.488534,6.553116,2017-11-30
3,1,6,5.972227,0.426342,5.953394,2017-11-30
4,1,7,5.972227,0.70717,6.664561,2017-11-30


In [82]:
# save adjusted gps data
# NOTE(review): `dataPath` is not defined in any cell shown here; it must be set before this runs
# and must end with a path separator, because the file name is appended by plain string concatenation.
# to_csv writes the row index as an extra unnamed first column by default.
performance_data.to_csv(dataPath + 'performance.csv')