In [1]:
import pandas as pd
import json
from datetime import datetime, timedelta
import warnings
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)

# Read the "acwr-data"

In [2]:
def load_and_feature_engineering_acwr(file_path):
    # read the data 
    acwr = pd.read_csv(file_path)
    # transform the date format
    for i, val in enumerate(acwr['date']):
        acwr['date'][i] = datetime.utcfromtimestamp(val)
    
    # drop the wellness data
    try:
        acwr = acwr.drop(columns=
                 ["fatigue","hydration","mood","nutrition","readiness",
                  "sleepduration","sleepquality","soreness","stress","total"], axis=1)
    except:
        acwr = acwr
    
    # transform the date type
    acwr['formatteddate'] = pd.to_datetime(acwr['date'])
    acwr["formatteddate_dt"] = pd.to_datetime(acwr["formatteddate"],format="%d/%m/%Y")
    acwr['formatteddate'] = acwr["formatteddate_dt"].astype(str)
    
    return acwr
    

In [3]:
file_path = "acwr_updated.csv"
acwr = load_and_feature_engineering_acwr(file_path)
acwr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5292 entries, 0 to 5291
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   acute             5292 non-null   float64       
 1   acwr              5292 non-null   float64       
 2   chronic           5292 non-null   float64       
 3   condition         5205 non-null   object        
 4   date              5292 non-null   object        
 5   duration          5292 non-null   int64         
 6   load              5292 non-null   int64         
 7   rpe               5292 non-null   int64         
 8   type              5205 non-null   object        
 9   player_id         5292 non-null   object        
 10  name              5292 non-null   object        
 11  formatteddate     5292 non-null   object        
 12  formatteddate_dt  5292 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory usage: 537.6+ KB


In [4]:
numerical = ['acute', 'acwr', 'chronic', 'duration', 'load','rpe']
categorical = ['condition', 'type'] 
na_cols = ["type","condition"]

In [5]:
acwr[acwr.isna().any(axis=1)]
#len(acwr[acwr.isna().any(axis=1)])
#35

Unnamed: 0,acute,acwr,chronic,condition,date,duration,load,rpe,type,player_id,name,formatteddate,formatteddate_dt
3147,100.028229,1.943563,51.466417,,2021-08-31 02:10:29,60,300,5,,5f0592e2239906121f930929,Kota Kawase,2021-08-31 02:10:29,2021-08-31 02:10:29
3148,364.859214,1.171410,311.470227,,2021-08-31 02:10:29,90,540,6,,5f059326c37cc79a7d6a6ca9,james Ha,2021-08-31 02:10:29,2021-08-31 02:10:29
3149,275.873053,1.687937,163.437967,,2021-08-31 02:10:29,90,630,7,,5f059355c37cc79a7d6a6caa,Jack Sealy,2021-08-31 02:10:29,2021-08-31 02:10:29
3150,427.874266,1.318486,324.519490,,2021-08-31 02:10:29,90,810,9,,5f059378239906121f93092e,Xavi Cheng,2021-08-31 02:10:29,2021-08-31 02:10:29
3151,412.975247,1.299609,317.768832,,2021-08-31 02:10:29,90,810,9,,5f059388239906121f93092f,Paco Chan,2021-08-31 02:10:29,2021-08-31 02:10:29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4590,311.801311,1.032951,301.854833,,2021-12-09 04:46:12,60,360,6,,60f254fb854df5b9da1bac56,LucianoDa Silva,2021-12-09 04:46:12,2021-12-09 04:46:12
4591,340.585829,1.230432,276.801786,,2021-12-09 04:46:12,60,420,7,,6107d0a36f7b01ca9c0f6571,ChanKa Po,2021-12-09 04:46:12,2021-12-09 04:46:12
4592,319.912931,1.027532,311.341001,,2021-12-09 04:46:12,60,360,6,,61139a2f859c5efc737749ba,Ju Yingzhi,2021-12-09 04:46:12,2021-12-09 04:46:12
4593,294.441589,1.041749,282.641598,,2021-12-09 04:46:12,60,360,6,,611a41e057d5db652811c0f3,ChenChin Lung,2021-12-09 04:46:12,2021-12-09 04:46:12


In [6]:
#impute the missing 'condition' and 'type' with the most frequent values
acwr1 = acwr.fillna(acwr.mode().iloc[0])
acwr1

Unnamed: 0,acute,acwr,chronic,condition,date,duration,load,rpe,type,player_id,name,formatteddate,formatteddate_dt
0,180.000000,1.000000,180.000000,Artificial (Dry),2020-06-29 00:00:00,45,180,4,Practice (Field),5f014aa4b8bec1cf08f7ca0f,Zesh Rehman,2020-06-29 00:00:00,2020-06-29 00:00:00
1,325.000000,1.000000,325.000000,Artificial (Dry),2020-06-29 00:00:00,65,325,5,Practice (Field),5f0592cc239906121f930928,TszTo Choy,2020-06-29 00:00:00,2020-06-29 00:00:00
2,390.000000,1.000000,390.000000,Artificial (Dry),2020-06-29 00:00:00,65,390,6,Practice (Field),5f0592e2239906121f930929,Kota Kawase,2020-06-29 00:00:00,2020-06-29 00:00:00
3,325.000000,1.000000,325.000000,Artificial (Dry),2020-06-29 00:00:00,65,325,5,Practice (Field),5f059326c37cc79a7d6a6ca9,james Ha,2020-06-29 00:00:00,2020-06-29 00:00:00
4,195.000000,1.000000,195.000000,Artificial (Dry),2020-06-29 00:00:00,65,195,3,Practice (Field),5f059355c37cc79a7d6a6caa,Jack Sealy,2020-06-29 00:00:00,2020-06-29 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5287,87.924115,0.514831,170.782587,Mixed,2022-02-04 05:59:59,40,120,3,Quarantine - Run,60f254d8a1b3ec106cc401da,Khan Jahangir,2022-02-04 05:59:59,2022-02-04 05:59:59
5288,145.724642,0.652914,223.191242,Mixed,2022-02-04 05:59:59,40,240,6,Quarantine - Run,60f254fb854df5b9da1bac56,LucianoDa Silva,2022-02-04 05:59:59,2022-02-04 05:59:59
5289,134.414729,0.724831,185.442837,Mixed,2022-02-04 05:59:59,40,240,6,Quarantine - Run,6107d0a36f7b01ca9c0f6571,ChanKa Po,2022-02-04 05:59:59,2022-02-04 05:59:59
5290,91.835734,0.577736,158.958055,Mixed,2022-02-04 05:59:59,40,200,5,Quarantine - Run,61139a2f859c5efc737749ba,Ju Yingzhi,2022-02-04 05:59:59,2022-02-04 05:59:59


In [12]:
set(acwr[acwr["type"]=="Game"]["condition"])

{'Artificial (Dry)', 'Artificial (Wet)', 'Grass (Dry)', 'Grass (Wet)'}

In [13]:
set(acwr[acwr["type"]!="Game"]["condition"])

{'Artificial (Dry)',
 'Artificial (Wet)',
 'Grass (Dry)',
 'Grass (Wet)',
 'Gym/Indoors',
 'Hike',
 'Mixed',
 'Track',
 nan}

In [232]:
players = set(acwr["player_id"])

In [233]:
len(players)

21

# Read the "injurcsv"

In [7]:
def load_and_feature_engineering_injury(file_path):
    injury = pd.read_csv(file_path)
    for i, val in enumerate(injury['Injury Date']):
        injury['Injury Date'][i] = datetime.utcfromtimestamp(val)
    for i, val in enumerate(injury['Return Date']):
        injury['Return Date'][i] = datetime.utcfromtimestamp(val)
    
    injury['Injury Date'] = pd.to_datetime(injury['Injury Date'])#.dt.date()
    injury['Return Date'] = pd.to_datetime(injury['Return Date'])

    injury['FormattedInjuryDate'] = injury['Injury Date'].dt.date.astype(str)
    injury['FormattedReturnDate'] = injury['Return Date'].dt.date.astype(str)

    injury['FormattedInjuryDate_dt'] = pd.to_datetime(injury['FormattedInjuryDate'])
    injury['FormattedReturnDate_dt'] = pd.to_datetime(injury['FormattedReturnDate']) 
                                       
                                       
    injury = injury.drop(['Diagnosis','Notes'], axis=1)
    chronic_injury = injury[injury['Onset'] != 'Acute']
    return injury, chronic_injury

In [8]:
file_path = "injury_updated.csv"
injury, chronic_injury = load_and_feature_engineering_injury(file_path)

In [9]:
print(injury.info())
print(chronic_injury.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Activity                40 non-null     object        
 1   Classification          41 non-null     object        
 2   Injury Date             41 non-null     datetime64[ns]
 3   Location                39 non-null     object        
 4   OSIICs                  16 non-null     object        
 5   Onset                   41 non-null     object        
 6   Past Injury             41 non-null     object        
 7   Reoccurance             34 non-null     object        
 8   Return Date             41 non-null     datetime64[ns]
 9   Side                    38 non-null     object        
 10  Surgery                 41 non-null     object        
 11  u_id                    41 non-null     object        
 12  FormattedInjuryDate     41 non-null     object      

# Merge the acwr and injury data

In [14]:
outer_df = pd.merge(acwr1, injury,  
                        how='outer', left_on=['player_id','formatteddate'], right_on = ['u_id','FormattedInjuryDate'])

In [15]:
outer_df.shape

(5333, 29)

In [16]:
outer_df

Unnamed: 0,acute,acwr,chronic,condition,date,duration,load,rpe,type,player_id,name,formatteddate,formatteddate_dt,Activity,Classification,Injury Date,Location,OSIICs,Onset,Past Injury,Reoccurance,Return Date,Side,Surgery,u_id,FormattedInjuryDate,FormattedReturnDate,FormattedInjuryDate_dt,FormattedReturnDate_dt
0,180.0,1.0,180.0,Artificial (Dry),2020-06-29,45.0,180.0,4.0,Practice (Field),5f014aa4b8bec1cf08f7ca0f,Zesh Rehman,2020-06-29 00:00:00,2020-06-29,,,NaT,,,,,,NaT,,,,,,NaT,NaT
1,325.0,1.0,325.0,Artificial (Dry),2020-06-29,65.0,325.0,5.0,Practice (Field),5f0592cc239906121f930928,TszTo Choy,2020-06-29 00:00:00,2020-06-29,,,NaT,,,,,,NaT,,,,,,NaT,NaT
2,390.0,1.0,390.0,Artificial (Dry),2020-06-29,65.0,390.0,6.0,Practice (Field),5f0592e2239906121f930929,Kota Kawase,2020-06-29 00:00:00,2020-06-29,,,NaT,,,,,,NaT,,,,,,NaT,NaT
3,325.0,1.0,325.0,Artificial (Dry),2020-06-29,65.0,325.0,5.0,Practice (Field),5f059326c37cc79a7d6a6ca9,james Ha,2020-06-29 00:00:00,2020-06-29,,,NaT,,,,,,NaT,,,,,,NaT,NaT
4,195.0,1.0,195.0,Artificial (Dry),2020-06-29,65.0,195.0,3.0,Practice (Field),5f059355c37cc79a7d6a6caa,Jack Sealy,2020-06-29 00:00:00,2020-06-29,,,NaT,,,,,,NaT,,,,,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5328,,,,,NaT,,,,,,,,NaT,Competition,New Injury,2021-11-20 10:12:50,Competition - Domestic,,Acute,no,Primary Injury,2021-11-27 10:12:50,Left,no,60f25487854df5b9da1bac55,2021-11-20,2021-11-27,2021-11-20,2021-11-27
5329,,,,,NaT,,,,,,,,NaT,Competition,Recurrent Injury,2021-11-20 10:15:35,Competition - Domestic,,Acute,no,Primary Injury,2021-11-27 10:15:35,Left,no,60f253c0854df5b9da1bac54,2021-11-20,2021-11-27,2021-11-20,2021-11-27
5330,,,,,NaT,,,,,,,,NaT,General Training,Recurrent Injury,2021-11-29 06:29:03,DTE - Domestic,,Acute,no,Primary Injury,2021-12-01 06:29:03,Right,no,5f059378239906121f93092e,2021-11-29,2021-12-01,2021-11-29,2021-12-01
5331,,,,,NaT,,,,,,,,NaT,Competition,New Injury,2022-01-02 05:45:26,Competition - Domestic,,Acute,no,Primary Injury,2022-01-17 05:45:26,Left,no,5f059326c37cc79a7d6a6ca9,2022-01-02,2022-01-17,2022-01-02,2022-01-17


In [63]:
outer_df[outer_df['u_id']=='5f0592e2239906121f930929']

Unnamed: 0,acute,acwr,chronic,condition,date,duration,load,rpe,type,player_id,name,formatteddate,formatteddate_dt,Activity,Classification,Injury Date,Location,OSIICs,Onset,Past Injury,Reoccurance,Return Date,Side,Surgery,u_id,FormattedInjuryDate,FormattedReturnDate,FormattedInjuryDate_dt,FormattedReturnDate_dt
2176,468.0,1.27,368.0,Grass (Wet),2021-02-27 15:34:55,95.0,950.0,10.0,Game,5f0592e2239906121f930929,Kota Kawase,2021-02-27,2021-02-27,Competition,Recurrent Injury,2021-02-27 03:35:51,Competition - Domestic,AA1,Repetitive - Sudden Onset,yes,Primary Injury,2021-03-12 03:35:51,Right,no,5f0592e2239906121f930929,2021-02-27,2021-03-12,2021-02-27,2021-03-12
2727,276.0,0.85,326.0,Grass (Wet),2021-04-27 06:07:50,63.0,378.0,6.0,Practice (Field),5f0592e2239906121f930929,Kota Kawase,2021-04-27,2021-04-27,Competition,New Injury,2021-04-27 09:37:43,Competition - Domestic,TZ1,Acute,no,Primary Injury,2021-04-28 09:37:43,Left,no,5f0592e2239906121f930929,2021-04-27,2021-04-28,2021-04-27,2021-04-28
3466,,,,,NaT,,,,,,,,NaT,Competition,New Injury,2021-03-27 04:11:17,Competition - Domestic,KZZ,Acute,no,Secondary Injury,2021-04-01 04:11:17,Right,no,5f0592e2239906121f930929,2021-03-27,2021-04-01,2021-03-27,2021-04-01
3467,,,,,NaT,,,,,,,,NaT,Other,Recurrent Injury,2021-06-03 00:45:03,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-09-15 00:45:03,Right,yes,5f0592e2239906121f930929,2021-06-03,2021-09-15,2021-06-03,2021-09-15
3468,,,,,NaT,,,,,,,,NaT,Other,Recurrent Injury,2021-08-04 00:52:10,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:10,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04
3469,,,,,NaT,,,,,,,,NaT,Other,Recurrent Injury,2021-08-04 00:52:14,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:14,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04
3470,,,,,NaT,,,,,,,,NaT,Other,Recurrent Injury,2021-08-04 00:52:31,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:31,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04
3471,,,,,NaT,,,,,,,,NaT,Other,Recurrent Injury,2021-08-04 00:52:45,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:45,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04


In [17]:
outer_df['player_id'].fillna(outer_df['u_id'], inplace=True)
outer_df['formatteddate'].fillna(outer_df['FormattedInjuryDate'], inplace=True)
outer_df['formatteddate_dt'] = pd.to_datetime(outer_df['formatteddate'])

fill_zero_cols = ["acute","acwr","chronic","duration","load","rpe"]
fill_none_cols = ['condition','type']

for col in outer_df.columns:
    if col in fill_zero_cols:
        outer_df[col] = outer_df[col].fillna(0)
    elif col in fill_none_cols:
        outer_df[col] = outer_df[col].fillna('None')
    else:
        outer_df[col] = outer_df[col]

In [18]:
outer_df[outer_df['u_id']=='5f0592e2239906121f930929']

Unnamed: 0,acute,acwr,chronic,condition,date,duration,load,rpe,type,player_id,name,formatteddate,formatteddate_dt,Activity,Classification,Injury Date,Location,OSIICs,Onset,Past Injury,Reoccurance,Return Date,Side,Surgery,u_id,FormattedInjuryDate,FormattedReturnDate,FormattedInjuryDate_dt,FormattedReturnDate_dt
5297,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-02-27,2021-02-27,Competition,Recurrent Injury,2021-02-27 03:35:51,Competition - Domestic,AA1,Repetitive - Sudden Onset,yes,Primary Injury,2021-03-12 03:35:51,Right,no,5f0592e2239906121f930929,2021-02-27,2021-03-12,2021-02-27,2021-03-12
5301,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-03-27,2021-03-27,Competition,New Injury,2021-03-27 04:11:17,Competition - Domestic,KZZ,Acute,no,Secondary Injury,2021-04-01 04:11:17,Right,no,5f0592e2239906121f930929,2021-03-27,2021-04-01,2021-03-27,2021-04-01
5305,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-04-27,2021-04-27,Competition,New Injury,2021-04-27 09:37:43,Competition - Domestic,TZ1,Acute,no,Primary Injury,2021-04-28 09:37:43,Left,no,5f0592e2239906121f930929,2021-04-27,2021-04-28,2021-04-27,2021-04-28
5308,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-06-03,2021-06-03,Other,Recurrent Injury,2021-06-03 00:45:03,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-09-15 00:45:03,Right,yes,5f0592e2239906121f930929,2021-06-03,2021-09-15,2021-06-03,2021-09-15
5311,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-08-04,2021-08-04,Other,Recurrent Injury,2021-08-04 00:52:10,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:10,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04
5312,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-08-04,2021-08-04,Other,Recurrent Injury,2021-08-04 00:52:14,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:14,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04
5313,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-08-04,2021-08-04,Other,Recurrent Injury,2021-08-04 00:52:31,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:31,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04
5314,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-08-04,2021-08-04,Other,Recurrent Injury,2021-08-04 00:52:45,Other,,Repetitive - Gradual Onset,yes,Primary Injury,2021-08-04 00:52:45,Right,yes,5f0592e2239906121f930929,2021-08-04,2021-08-04,2021-08-04,2021-08-04


In [19]:
outer_df["injury?"] = outer_df.apply(lambda row: 1 
                                 if ((row['player_id'] == row['u_id']) and (row['formatteddate'] == row['FormattedInjuryDate']))
                                 else 0,
                                axis = 1)

In [20]:
outer_df.shape

(5333, 30)

In [21]:
outer_df[outer_df["injury?"]==1].shape

(41, 30)

In [22]:
injured_exposure = outer_df[outer_df["injury?"]==1].sort_values(by=['formatteddate'])
injured_exposure

Unnamed: 0,acute,acwr,chronic,condition,date,duration,load,rpe,type,player_id,name,formatteddate,formatteddate_dt,Activity,Classification,Injury Date,Location,OSIICs,Onset,Past Injury,Reoccurance,Return Date,Side,Surgery,u_id,FormattedInjuryDate,FormattedReturnDate,FormattedInjuryDate_dt,FormattedReturnDate_dt,injury?
5292,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2020-11-02,2020-11-02,General Training,Recurrent Injury,2020-11-02 11:15:18,DTE - Domestic,GTX,Repetitive - Gradual Onset,yes,,2021-02-16 11:15:18,Right,no,5f059355c37cc79a7d6a6caa,2020-11-02,2021-02-16,2020-11-02,2021-02-16,1
5293,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f014aa4b8bec1cf08f7ca0f,,2020-11-22,2020-11-22,General Training,New Injury,2020-11-22 21:58:31,DTE - Domestic,QMT,Repetitive - Gradual Onset,no,,2020-12-22 21:58:31,Left,no,5f014aa4b8bec1cf08f7ca0f,2020-11-22,2020-12-22,2020-11-22,2020-12-22,1
5294,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0593d9c37cc79a7d6a6cae,,2020-12-21,2020-12-21,General Training,New Injury,2020-12-21 22:01:30,DTE - Domestic,QM1,Repetitive - Sudden Onset,no,,2021-01-25 22:01:30,Right,no,5f0593d9c37cc79a7d6a6cae,2020-12-21,2021-01-25,2020-12-21,2021-01-25,1
5295,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0593d9c37cc79a7d6a6cae,,2021-01-25,2021-01-25,General Training,Recurrent Injury,2021-01-25 22:06:39,DTE - Domestic,QM1,Repetitive - Sudden Onset,no,,2021-02-07 22:06:39,Right,no,5f0593d9c37cc79a7d6a6cae,2021-01-25,2021-02-07,2021-01-25,2021-02-07,1
5296,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f059326c37cc79a7d6a6ca9,,2021-02-01,2021-02-01,General Training,New Injury,2021-02-01 04:38:23,DTE - Domestic,KT9,Acute,no,Primary Injury,2021-03-05 04:38:23,Right,no,5f059326c37cc79a7d6a6ca9,2021-02-01,2021-03-05,2021-02-01,2021-03-05,1
5297,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-02-27,2021-02-27,Competition,Recurrent Injury,2021-02-27 03:35:51,Competition - Domestic,AA1,Repetitive - Sudden Onset,yes,Primary Injury,2021-03-12 03:35:51,Right,no,5f0592e2239906121f930929,2021-02-27,2021-03-12,2021-02-27,2021-03-12,1
5298,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0593f1c37cc79a7d6a6caf,,2021-03-10,2021-03-10,Competition,New Injury,2021-03-10 08:48:53,Competition - Domestic,TM1,Acute,no,,2021-03-10 08:48:53,Right,no,5f0593f1c37cc79a7d6a6caf,2021-03-10,2021-03-10,2021-03-10,2021-03-10,1
5299,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f05939a239906121f930930,,2021-03-17,2021-03-17,Competition,New Injury,2021-03-17 06:02:26,Competition - Domestic,ST2,Repetitive - Gradual Onset,no,Primary Injury,2021-03-27 06:02:26,Left,no,5f05939a239906121f930930,2021-03-17,2021-03-27,2021-03-17,2021-03-27,1
5300,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f05939a239906121f930930,,2021-03-19,2021-03-19,Competition,New Injury,2021-03-19 10:16:46,Competition - Domestic,,Acute,no,Primary Injury,2021-07-30 10:16:46,Left,no,5f05939a239906121f930930,2021-03-19,2021-07-30,2021-03-19,2021-07-30,1
5301,0.0,0.0,0.0,,NaT,0.0,0.0,0.0,,5f0592e2239906121f930929,,2021-03-27,2021-03-27,Competition,New Injury,2021-03-27 04:11:17,Competition - Domestic,KZZ,Acute,no,Secondary Injury,2021-04-01 04:11:17,Right,no,5f0592e2239906121f930929,2021-03-27,2021-04-01,2021-03-27,2021-04-01,1


In [23]:
outer_df.sort_values(by=['formatteddate']).to_csv('exposure_injury_all_updated.csv', header=True, index=False, encoding = 'utf-8')

# Some players' exposure "type" is different from injury "activity" :

# Read the combined dataset and deal with anomalies

In [24]:
injury_exposure = pd.read_csv('exposure_injury_all_updated.csv')

In [25]:
injury_exposure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5333 entries, 0 to 5332
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   acute                   5333 non-null   float64
 1   acwr                    5333 non-null   float64
 2   chronic                 5333 non-null   float64
 3   condition               5333 non-null   object 
 4   date                    5292 non-null   object 
 5   duration                5333 non-null   float64
 6   load                    5333 non-null   float64
 7   rpe                     5333 non-null   float64
 8   type                    5333 non-null   object 
 9   player_id               5333 non-null   object 
 10  name                    5292 non-null   object 
 11  formatteddate           5333 non-null   object 
 12  formatteddate_dt        5333 non-null   object 
 13  Activity                40 non-null     object 
 14  Classification          41 non-null     

In [26]:
injury_exposure = injury_exposure.drop(columns = ["date", "formatteddate_dt","Injury Date", "Return Date",
                                                  "u_id", "FormattedInjuryDate_dt","FormattedReturnDate_dt"])
injury_exposure.rename(columns = {"Reoccurance":"Reoccurence"})

cols = ["formatteddate","FormattedInjuryDate","FormattedReturnDate"]
for col in cols:
    injury_exposure[col] = pd.to_datetime(injury_exposure[col])

In [27]:
injury_exposure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5333 entries, 0 to 5332
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   acute                5333 non-null   float64       
 1   acwr                 5333 non-null   float64       
 2   chronic              5333 non-null   float64       
 3   condition            5333 non-null   object        
 4   duration             5333 non-null   float64       
 5   load                 5333 non-null   float64       
 6   rpe                  5333 non-null   float64       
 7   type                 5333 non-null   object        
 8   player_id            5333 non-null   object        
 9   name                 5292 non-null   object        
 10  formatteddate        5333 non-null   datetime64[ns]
 11  Activity             40 non-null     object        
 12  Classification       41 non-null     object        
 13  Location             39 non-null 

In [28]:
injury1_exposure = injury_exposure[injury_exposure["injury?"] == 1]

In [29]:
injury1_exposure.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41 entries, 1033 to 5185
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   acute                41 non-null     float64       
 1   acwr                 41 non-null     float64       
 2   chronic              41 non-null     float64       
 3   condition            41 non-null     object        
 4   duration             41 non-null     float64       
 5   load                 41 non-null     float64       
 6   rpe                  41 non-null     float64       
 7   type                 41 non-null     object        
 8   player_id            41 non-null     object        
 9   name                 0 non-null      object        
 10  formatteddate        41 non-null     datetime64[ns]
 11  Activity             40 non-null     object        
 12  Classification       41 non-null     object        
 13  Location             39 non-null

In [30]:
players_inj_lst = []
inj_players = set(injury1_exposure["player_id"])
for p in inj_players:
    df = injury_exposure[injury_exposure["player_id"] == p].reset_index()
    players_inj_lst.append(df)
   

In [31]:
inj_players

{'5f014aa4b8bec1cf08f7ca0f',
 '5f0592cc239906121f930928',
 '5f0592e2239906121f930929',
 '5f059326c37cc79a7d6a6ca9',
 '5f059355c37cc79a7d6a6caa',
 '5f059378239906121f93092e',
 '5f059388239906121f93092f',
 '5f05939a239906121f930930',
 '5f0593d9c37cc79a7d6a6cae',
 '5f0593f1c37cc79a7d6a6caf',
 '5f86fa4805bc5268fb4a00a5',
 '60f253c0854df5b9da1bac54',
 '60f25487854df5b9da1bac55',
 '6107d0a36f7b01ca9c0f6571',
 '61139a2f859c5efc737749ba',
 '61162b72840a022569af263d',
 '611a41e057d5db652811c0f3'}

In [32]:
players_inj_lst[3][players_inj_lst[3]["injury?"]==1]

Unnamed: 0,index,acute,acwr,chronic,condition,duration,load,rpe,type,player_id,name,formatteddate,Activity,Classification,Location,OSIICs,Onset,Past Injury,Reoccurance,Side,Surgery,FormattedInjuryDate,FormattedReturnDate,injury?
89,4319,0.0,0.0,0.0,,0.0,0.0,0.0,,60f253c0854df5b9da1bac54,,2021-11-20,Competition,Recurrent Injury,Competition - Domestic,,Acute,no,Primary Injury,Left,no,2021-11-20,2021-11-27,1


In [33]:
pd.set_option('display.max_rows', None)
kota = players_inj_lst[3]
kota

Unnamed: 0,index,acute,acwr,chronic,condition,duration,load,rpe,type,player_id,name,formatteddate,Activity,Classification,Location,OSIICs,Onset,Past Injury,Reoccurance,Side,Surgery,FormattedInjuryDate,FormattedReturnDate,injury?
0,2855,450.0,1.0,450.0,Artificial (Dry),75.0,450.0,6.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-02 02:15:50,,,,,,,,,,NaT,NaT,0
1,2876,540.0,1.137255,474.827586,Grass (Wet),90.0,810.0,9.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-03 05:12:42,,,,,,,,,,NaT,NaT,0
2,2889,502.5,1.07148,468.977408,Artificial (Dry),65.0,390.0,6.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-04 05:04:46,,,,,,,,,,NaT,NaT,0
3,2903,511.875,1.080189,473.875518,Artificial (Dry),60.0,540.0,9.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-05 05:04:06,,,,,,,,,,NaT,NaT,0
4,2914,471.40625,1.013053,465.332379,Artificial (Dry),70.0,350.0,5.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-06 05:43:28,,,,,,,,,,NaT,NaT,0
5,2929,432.304688,0.950194,454.964628,Artificial (Dry),35.0,315.0,9.0,Game,60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-07 05:23:47,,,,,,,,,,NaT,NaT,0
6,2950,373.62854,0.889729,419.935167,Grass (Dry),85.0,765.0,9.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-10 02:33:22,,,,,,,,,,NaT,NaT,0
7,2958,460.221405,1.044464,440.629293,Artificial (Dry),80.0,720.0,9.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-11 02:33:22,,,,,,,,,,NaT,NaT,0
8,2986,427.666054,0.987682,432.999687,Artificial (Dry),55.0,330.0,6.0,Practice (Field),60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-12 04:48:21,,,,,,,,,,NaT,NaT,0
9,2994,470.74954,1.059014,444.51695,Artificial (Wet),60.0,600.0,10.0,Game,60f253c0854df5b9da1bac54,Dudu Nunes,2021-08-13 09:32:09,,,,,,,,,,NaT,NaT,0


In [34]:
def label_injury(row):
    if row["injury?"]==1:
        if (row["type"].startswith("Practice")) and (row["Activity"]=="General Training"):
            return False
        elif (row["type"]=="Game") and (row["Activity"]=="Competition"):
            return False
        else:
            return True
    else:
        return False
    
    
def posix_time(dt):
    return ((dt - datetime(1970, 1, 1)) / timedelta(seconds=1))/(24*60*60)

In [35]:
def player_handler(kota):
    kota['NeedActualInjuryDate'] = kota.apply(lambda row: label_injury(row), axis=1)
    kota['daydiff'] = kota['formatteddate'].apply(posix_time)

    kota_need = kota[kota["NeedActualInjuryDate"]==True]


    #activity_lst = ['Competition', 'General Training']
    #type_lst = ['Game', 'Practice']


    kota_activity_game_idx = kota_need.index[kota_need['Activity']=='Competition'].tolist()
    kota_activity_train_idx = kota_need.index[kota_need['Activity']=='General Training'].tolist()
    kota_activity_gt_idx = kota_activity_game_idx + kota_activity_train_idx
    kota_activity_other_idx = kota_need.index[(kota_need['Activity'] != 'Competition') & 
                                          (kota_need['Activity'] != 'General Training')].tolist()

    kota_type_game_idx = kota.index[kota['type']=='Game'].tolist()
    kota_type_train_idx = kota.index[kota['type'].apply(lambda row: row.startswith('Practice'))].tolist()

    res_lst = []
    for idx in kota_activity_game_idx:
        for idx2 in kota_type_game_idx:
            if idx2 <= idx:
                res = idx2
        res_lst.append(res)
    #print(res_lst)
    for idx in kota_activity_train_idx:
        for idx2 in kota_type_train_idx:
            if idx2 <= idx:
                res = idx2
        res_lst.append(res)
    #print(res_lst)

    
    
    actual_injury_dt = kota['FormattedInjuryDate'].tolist()

    for i in range(len(kota_activity_gt_idx)):
        diff = kota.loc[kota_activity_gt_idx[i]]['daydiff'] - kota.loc[res_lst[i]]['daydiff']
        if diff <= 7:
            actual_injury_dt[kota_activity_gt_idx[i]] = kota.loc[res_lst[i]]['formatteddate']
        else:
            j = kota_activity_gt_idx[i]-1
            #print(j)
            while (kota.loc[j]['type'] == 'Rehab') or (kota.loc[j]['type'] == 'Modified') or (kota.loc[j]['formatteddate'] == kota[kota['type']=='Rehab']['formatteddate'].tolist()[0]):
                j = j - 1 
                #print(j)
            actual_injury_dt[kota_activity_gt_idx[i]] = kota.loc[j]['formatteddate']
        
    for i in range(len(kota_activity_other_idx)):
        j = kota_activity_other_idx[i]-1
        while (kota.loc[j]['type'] == 'Rehab') or (kota.loc[j]['type'] == 'Modified') or (kota.loc[j]['formatteddate'] == kota[kota['type']=='Rehab']['formatteddate'].tolist()[0]):
            j = j - 1
        actual_injury_dt[kota_activity_other_idx[i]] = kota.loc[j]['formatteddate']
               
        
    kota['actualInjuryDate'] = actual_injury_dt
    return kota

In [36]:
players_inj_lst = []
inj_players = set(injury_exposure["player_id"])
for p in inj_players:
    df = injury_exposure[injury_exposure["player_id"] == p].reset_index()
    df_new = player_handler(df)
    players_inj_lst.append(df_new)

new_players_inj = pd.concat(players_inj_lst)

In [37]:
new_players_inj

Unnamed: 0,index,acute,acwr,chronic,condition,duration,load,rpe,type,player_id,name,formatteddate,Activity,Classification,Location,OSIICs,Onset,Past Injury,Reoccurance,Side,Surgery,FormattedInjuryDate,FormattedReturnDate,injury?,NeedActualInjuryDate,daydiff,actualInjuryDate
0,857,120.0,1.0,120.0,Artificial (Dry),30.0,120.0,4.0,Practice (Gym),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-15 07:15:47,,,,,,,,,,NaT,NaT,0,False,18550.302627,NaT
1,866,225.0,1.510417,148.965517,Artificial (Dry),60.0,420.0,7.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-15 08:17:54,,,,,,,,,,NaT,NaT,0,False,18550.345764,NaT
2,871,360.0,1.88038,191.450654,Artificial (Dry),85.0,765.0,9.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-16 07:27:52,,,,,,,,,,NaT,NaT,0,False,18551.311019,NaT
3,882,450.0,1.97453,227.902333,Artificial (Dry),80.0,720.0,9.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-17 08:40:40,,,,,,,,,,NaT,NaT,0,False,18552.361574,NaT
4,888,366.875,1.602559,228.930798,Artificial (Dry),65.0,455.0,7.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-19 10:34:28,,,,,,,,,,NaT,NaT,0,False,18554.440602,NaT
5,898,335.15625,1.459141,229.694191,Artificial (Dry),30.0,240.0,8.0,Practice (Gym),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-20 06:53:11,,,,,,,,,,NaT,NaT,0,False,18555.286933,NaT
6,908,459.15625,1.73988,263.901087,Artificial (Dry),62.0,736.0,8.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-20 06:58:27,,,,,,,,,,NaT,NaT,0,False,18555.29059,NaT
7,918,434.367188,1.605624,270.528599,Artificial (Dry),60.0,360.0,6.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-21 10:06:52,,,,,,,,,,NaT,NaT,0,False,18556.421435,NaT
8,930,476.525391,1.62383,293.457661,Artificial (Dry),67.0,603.0,9.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-22 07:41:05,,,,,,,,,,NaT,NaT,0,False,18557.320197,NaT
9,941,419.894043,1.445615,290.460581,Artificial (Dry),50.0,250.0,5.0,Practice (Field),5f86fa4805bc5268fb4a00a5,Toby Down,2020-10-23 06:53:32,,,,,,,,,,NaT,NaT,0,False,18558.287176,NaT


In [38]:
new_players_inj[new_players_inj['NeedActualInjuryDate']==1]

Unnamed: 0,index,acute,acwr,chronic,condition,duration,load,rpe,type,player_id,name,formatteddate,Activity,Classification,Location,OSIICs,Onset,Past Injury,Reoccurance,Side,Surgery,FormattedInjuryDate,FormattedReturnDate,injury?,NeedActualInjuryDate,daydiff,actualInjuryDate
142,2367,0.0,0.0,0.0,,0.0,0.0,0.0,,5f86fa4805bc5268fb4a00a5,,2021-04-05,Competition,New Injury,Competition - Domestic,AV1,Acute,no,Primary Injury,Right,no,2021-04-05,2021-04-10,1,True,18722.0,2021-04-04 04:38:07
159,2539,0.0,0.0,0.0,,0.0,0.0,0.0,,5f86fa4805bc5268fb4a00a5,,2021-04-23,Competition,New Injury,Competition - Domestic,NPM,Acute,no,Primary Injury,Left,no,2021-04-23,2021-04-28,1,True,18740.0,2021-04-18 10:12:37
21,3315,0.0,0.0,0.0,,0.0,0.0,0.0,,61139a2f859c5efc737749ba,,2021-09-10,,New Injury,,,Acute,no,,,no,2021-09-10,2021-09-10,1,True,18880.0,2021-09-09 05:09:54
52,3876,0.0,0.0,0.0,,0.0,0.0,0.0,,61139a2f859c5efc737749ba,,2021-10-24,Competition,New Injury,Competition - Domestic,,Acute,no,Primary Injury,Right,no,2021-10-24,2021-10-27,1,True,18924.0,2021-10-17 06:58:07
32,3547,0.0,0.0,0.0,,0.0,0.0,0.0,,611a41e057d5db652811c0f3,,2021-09-30,General Training,Recurrent Injury,DTE - Domestic,,Repetitive - Gradual Onset,no,Primary Injury,,no,2021-09-30,2021-10-12,1,True,18900.0,2021-09-27 04:49:57
124,1033,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2020-11-02,General Training,Recurrent Injury,DTE - Domestic,GTX,Repetitive - Gradual Onset,yes,,Right,no,2020-11-02,2021-02-16,1,True,18568.0,2020-10-31 09:01:22
285,2723,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-05-16,Competition,New Injury,Competition - Domestic,TM1,Acute,no,Primary Injury,Left,no,2021-05-16,2021-05-20,1,True,18763.0,2021-05-10 02:11:33
299,2859,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-08-03,General Training,New Injury,,HV1,Acute,no,,Right,no,2021-08-03,2021-08-03,1,True,18842.0,2021-08-02 02:15:50
300,2860,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-08-03,General Training,New Injury,DTE - Domestic,,Acute,no,Primary Injury,Right,no,2021-08-03,2021-08-25,1,True,18842.0,2021-08-02 02:15:50
302,3431,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059388239906121f93092f,,2021-09-21,General Training,New Injury,DTE - Domestic,,Repetitive - Sudden Onset,no,Primary Injury,Left,no,2021-09-21,2021-10-12,1,True,18891.0,2021-09-18 04:38:21


In [39]:
# deal with the special cases
new_players_inj.drop([277,278,279],inplace=True)
new_players_inj.at[276, 'actualInjuryDate'] = '2021-05-30'
new_players_inj[new_players_inj['NeedActualInjuryDate']==1]

Unnamed: 0,index,acute,acwr,chronic,condition,duration,load,rpe,type,player_id,name,formatteddate,Activity,Classification,Location,OSIICs,Onset,Past Injury,Reoccurance,Side,Surgery,FormattedInjuryDate,FormattedReturnDate,injury?,NeedActualInjuryDate,daydiff,actualInjuryDate
142,2367,0.0,0.0,0.0,,0.0,0.0,0.0,,5f86fa4805bc5268fb4a00a5,,2021-04-05,Competition,New Injury,Competition - Domestic,AV1,Acute,no,Primary Injury,Right,no,2021-04-05,2021-04-10,1,True,18722.0,2021-04-04 04:38:07
159,2539,0.0,0.0,0.0,,0.0,0.0,0.0,,5f86fa4805bc5268fb4a00a5,,2021-04-23,Competition,New Injury,Competition - Domestic,NPM,Acute,no,Primary Injury,Left,no,2021-04-23,2021-04-28,1,True,18740.0,2021-04-18 10:12:37
21,3315,0.0,0.0,0.0,,0.0,0.0,0.0,,61139a2f859c5efc737749ba,,2021-09-10,,New Injury,,,Acute,no,,,no,2021-09-10,2021-09-10,1,True,18880.0,2021-09-09 05:09:54
52,3876,0.0,0.0,0.0,,0.0,0.0,0.0,,61139a2f859c5efc737749ba,,2021-10-24,Competition,New Injury,Competition - Domestic,,Acute,no,Primary Injury,Right,no,2021-10-24,2021-10-27,1,True,18924.0,2021-10-17 06:58:07
32,3547,0.0,0.0,0.0,,0.0,0.0,0.0,,611a41e057d5db652811c0f3,,2021-09-30,General Training,Recurrent Injury,DTE - Domestic,,Repetitive - Gradual Onset,no,Primary Injury,,no,2021-09-30,2021-10-12,1,True,18900.0,2021-09-27 04:49:57
124,1033,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2020-11-02,General Training,Recurrent Injury,DTE - Domestic,GTX,Repetitive - Gradual Onset,yes,,Right,no,2020-11-02,2021-02-16,1,True,18568.0,2020-10-31 09:01:22
285,2723,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-05-16,Competition,New Injury,Competition - Domestic,TM1,Acute,no,Primary Injury,Left,no,2021-05-16,2021-05-20,1,True,18763.0,2021-05-10 02:11:33
299,2859,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-08-03,General Training,New Injury,,HV1,Acute,no,,Right,no,2021-08-03,2021-08-03,1,True,18842.0,2021-08-02 02:15:50
300,2860,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-08-03,General Training,New Injury,DTE - Domestic,,Acute,no,Primary Injury,Right,no,2021-08-03,2021-08-25,1,True,18842.0,2021-08-02 02:15:50
302,3431,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059388239906121f93092f,,2021-09-21,General Training,New Injury,DTE - Domestic,,Repetitive - Sudden Onset,no,Primary Injury,Left,no,2021-09-21,2021-10-12,1,True,18891.0,2021-09-18 04:38:21


In [40]:
new_players_inj[new_players_inj['injury?']==1]
#length = 25

Unnamed: 0,index,acute,acwr,chronic,condition,duration,load,rpe,type,player_id,name,formatteddate,Activity,Classification,Location,OSIICs,Onset,Past Injury,Reoccurance,Side,Surgery,FormattedInjuryDate,FormattedReturnDate,injury?,NeedActualInjuryDate,daydiff,actualInjuryDate
142,2367,0.0,0.0,0.0,,0.0,0.0,0.0,,5f86fa4805bc5268fb4a00a5,,2021-04-05,Competition,New Injury,Competition - Domestic,AV1,Acute,no,Primary Injury,Right,no,2021-04-05,2021-04-10,1,True,18722.0,2021-04-04 04:38:07
159,2539,0.0,0.0,0.0,,0.0,0.0,0.0,,5f86fa4805bc5268fb4a00a5,,2021-04-23,Competition,New Injury,Competition - Domestic,NPM,Acute,no,Primary Injury,Left,no,2021-04-23,2021-04-28,1,True,18740.0,2021-04-18 10:12:37
21,3315,0.0,0.0,0.0,,0.0,0.0,0.0,,61139a2f859c5efc737749ba,,2021-09-10,,New Injury,,,Acute,no,,,no,2021-09-10,2021-09-10,1,True,18880.0,2021-09-09 05:09:54
52,3876,0.0,0.0,0.0,,0.0,0.0,0.0,,61139a2f859c5efc737749ba,,2021-10-24,Competition,New Injury,Competition - Domestic,,Acute,no,Primary Injury,Right,no,2021-10-24,2021-10-27,1,True,18924.0,2021-10-17 06:58:07
32,3547,0.0,0.0,0.0,,0.0,0.0,0.0,,611a41e057d5db652811c0f3,,2021-09-30,General Training,Recurrent Injury,DTE - Domestic,,Repetitive - Gradual Onset,no,Primary Injury,,no,2021-09-30,2021-10-12,1,True,18900.0,2021-09-27 04:49:57
124,1033,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2020-11-02,General Training,Recurrent Injury,DTE - Domestic,GTX,Repetitive - Gradual Onset,yes,,Right,no,2020-11-02,2021-02-16,1,True,18568.0,2020-10-31 09:01:22
285,2723,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-05-16,Competition,New Injury,Competition - Domestic,TM1,Acute,no,Primary Injury,Left,no,2021-05-16,2021-05-20,1,True,18763.0,2021-05-10 02:11:33
299,2859,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-08-03,General Training,New Injury,,HV1,Acute,no,,Right,no,2021-08-03,2021-08-03,1,True,18842.0,2021-08-02 02:15:50
300,2860,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059355c37cc79a7d6a6caa,,2021-08-03,General Training,New Injury,DTE - Domestic,,Acute,no,Primary Injury,Right,no,2021-08-03,2021-08-25,1,True,18842.0,2021-08-02 02:15:50
302,3431,0.0,0.0,0.0,,0.0,0.0,0.0,,5f059388239906121f93092f,,2021-09-21,General Training,New Injury,DTE - Domestic,,Repetitive - Sudden Onset,no,Primary Injury,Left,no,2021-09-21,2021-10-12,1,True,18891.0,2021-09-18 04:38:21


In [41]:
player = set(new_players_inj['player_id'])
print(player)
print(len(player))
injured_player = set(new_players_inj[new_players_inj['injury?']==1]['player_id'])
print(injured_player)

{'5f86fa4805bc5268fb4a00a5', '60f254ada1b3ec106cc401d9', '61139a2f859c5efc737749ba', '60f254d8a1b3ec106cc401da', '611a41e057d5db652811c0f3', '5f059355c37cc79a7d6a6caa', '5f059388239906121f93092f', '61162b72840a022569af263d', '60f254fb854df5b9da1bac56', '5f0593d9c37cc79a7d6a6cae', '5f059378239906121f93092e', '60f25487854df5b9da1bac55', '5f0593f1c37cc79a7d6a6caf', '5f05939a239906121f930930', '5f059326c37cc79a7d6a6ca9', '5f014aa4b8bec1cf08f7ca0f', '6153f4f14983cc4b427f2af2', '60f253c0854df5b9da1bac54', '5f0592e2239906121f930929', '5f0592cc239906121f930928', '6107d0a36f7b01ca9c0f6571'}
21
{'5f86fa4805bc5268fb4a00a5', '5f014aa4b8bec1cf08f7ca0f', '5f0593d9c37cc79a7d6a6cae', '60f253c0854df5b9da1bac54', '5f0592e2239906121f930929', '611a41e057d5db652811c0f3', '5f059378239906121f93092e', '5f059355c37cc79a7d6a6caa', '6107d0a36f7b01ca9c0f6571', '5f0592cc239906121f930928', '5f059388239906121f93092f', '60f25487854df5b9da1bac55', '5f0593f1c37cc79a7d6a6caf', '61139a2f859c5efc737749ba', '5f05939a239906

In [20]:
new_players_inj.to_csv('cleaned_player_inj.csv', header=True, index=False, encoding = 'utf-8')