In [1]:
# basic imports
import pandas as pd
import tqdm


In [2]:
# open and preprocess data
# forward slashes resolve on Windows as well as POSIX systems, unlike the '..\\' form
data = (
    pd.read_csv('../final_dataset.csv')
    .dropna()                      # keep only rows without missing values
    .drop(columns=['Unnamed: 0'])  # presumably a saved index column -- not a feature
)
print(data.columns)

Index(['Age', 'Height', 'Weight', 'Centre-Back', 'Right Winger', 'Left Winger',
       'Goalkeeper', 'Central Midfield', 'Right-Back', 'Left-Back',
       ...
       'isMedium.14', 'isLong.14', 'foot', 'lower_leg', 'upper_leg',
       'upper_body', 'arms', 'head', 'minor', 'injury'],
      dtype='object', length=189)


In [3]:
# split the columns: the trailing 8 are the labels (y), everything before them is input (x)
y_columns = data.columns[-8:].tolist()
x_columns = data.columns[:-8].tolist()

# the first 16 inputs are the physical attributes and position flags
phys_data = x_columns[:16]

# NOTE(review): x_columns already excludes the 8 label columns, so the extra -8 below also
# leaves the last 8 history columns out of inj_hist_data. Confirm whether that is intended
# before changing it, since later cells consume this list as-is.
inj_hist_data = x_columns[16:-8]

inj_length = 7  # not referenced by any of the visible cells

In [4]:
# get column bases for each injury history entry: the part of every history column
# name before the first '.', de-duplicated and sorted to standardize the order
injury_fields = sorted({column.split('.')[0] for column in inj_hist_data})
print(injury_fields)

['isArms', 'isFootAnkle', 'isHead', 'isLong', 'isLowerLeg', 'isMedium', 'isMinor', 'isShort', 'isUpperBody', 'isUpperLeg', 'isVeryShort']


In [5]:
# get max number of injuries: the largest numeric suffix found on a history column
def _suffix_number(column_name):
    """Return the integer after the last '.' in column_name, or 0 when there is none."""
    try:
        return int(column_name.split(".")[-1])
    except ValueError:
        # some columns don't have a number (e.g. 'isArms'); only that case is treated
        # as 0, any other error is allowed to surface instead of being swallowed
        return 0


# the leading 0 keeps the result at 0 when there are no history columns at all
max_num_injuries = max([0] + [_suffix_number(entry) for entry in inj_hist_data])
print("max number of injuries: {}".format(max_num_injuries))

max number of injuries: 14


In [6]:
# Convert every dataframe row into a plain dict: the phys_data values plus a
# 'history' list holding one {field: value} dict per injury slot.
all_data = []
# iterate over rows, convert into python dict objects
for index, row in data.iterrows():
    entry = {}
    # get physical attributes, position
    for c_name in phys_data:
        entry[c_name] = row[c_name]
    
    # get injury history and put into list
    ih = [] # players injury history
    # slot 0 reads the un-suffixed column names, slot i > 0 reads '<field>.<i>'
    # NOTE(review): max_num_injuries is the largest suffix (14 in the printed output), so
    # range() stops at slot 13 and the '.14' columns are never read -- confirm intent.
    for i in range(max_num_injuries):
        i_str = "." + str(i) if i > 0 else '' 
        injury = {}
        nonzero = False
        for i_f in injury_fields:
            injury[i_f] = row[i_f+i_str]
            # NOTE(review): `is not 0` is an identity test against the int literal 0, not a
            # value comparison (newer Python versions emit a SyntaxWarning for it). The
            # values read here print as floats (0.0 / 1.0), and a float object is never the
            # int object 0, so this is true for every field and all-zero slots are kept
            # too. A value test (`!= 0`) looks like the intent, but it would change the
            # length of every history and therefore the rows and columns produced by the
            # cells below, so it is flagged here rather than changed in isolation.
            if injury[i_f] is not 0:
                nonzero = True
        if nonzero:
            ih.append(injury)
    entry['history'] = ih
    all_data.append(entry)

print("number of entries: {}".format(len(all_data)))

number of entries: 1332


In [7]:
# take the first converted player as a sample record and show its structure
test_row = all_data[0] # for testing the next function
print(test_row)

{'Age': 30.0, 'Height': 193.0, 'Weight': 87.0, 'Centre-Back': 1.0, 'Right Winger': 0.0, 'Left Winger': 0.0, 'Goalkeeper': 0.0, 'Central Midfield': 0.0, 'Right-Back': 0.0, 'Left-Back': 0.0, 'Defensive Midfield': 0.0, 'Centre-Forward': 0.0, 'Attacking Midfield': 0.0, 'Left Midfield': 0.0, 'Right Midfield': 0.0, 'Second Striker': 0.0, 'history': [{'isArms': 0.0, 'isFootAnkle': 0.0, 'isHead': 0.0, 'isLong': 0.0, 'isLowerLeg': 0.0, 'isMedium': 0.0, 'isMinor': 0.0, 'isShort': 0.0, 'isUpperBody': 1.0, 'isUpperLeg': 0.0, 'isVeryShort': 1.0}, {'isArms': 0.0, 'isFootAnkle': 0.0, 'isHead': 0.0, 'isLong': 0.0, 'isLowerLeg': 0.0, 'isMedium': 0.0, 'isMinor': 0.0, 'isShort': 0.0, 'isUpperBody': 0.0, 'isUpperLeg': 0.0, 'isVeryShort': 1.0}, {'isArms': 0.0, 'isFootAnkle': 0.0, 'isHead': 0.0, 'isLong': 0.0, 'isLowerLeg': 0.0, 'isMedium': 0.0, 'isMinor': 0.0, 'isShort': 0.0, 'isUpperBody': 0.0, 'isUpperLeg': 0.0, 'isVeryShort': 1.0}, {'isArms': 0.0, 'isFootAnkle': 0.0, 'isHead': 0.0, 'isLong': 0.0, 'isLow

In [8]:
# read by unroll_row: when True, each player also gets a closing row whose labels are all 0.0
add_last = True

def unroll_row(row):
    """Unroll one player and their injury history into several training rows.

    If the player has 2 injuries (say a head and then an arm injury) there will be 3 rows:
      1) the physical traits, no history columns, and labels for a head and arm injury
      2) the physical traits, a history of the head injury, and a label for the arm injury
      3) the physical traits, a history of both injuries, and 0.0 for every label
         (only emitted when the module-level ``add_last`` flag is truthy; this last row
         is there to not bias the model towards always outputting an injury, and may
         be removed later)

    ``row`` is a dict with the ``phys_data`` traits and a ``'history'`` list of injury
    dicts; the list itself is left untouched. History values are stored under
    ``'<field>.<position>'`` keys, positions starting at 0. Returns the list of row dicts.
    """
    # label column -> injury field that switches it on
    label_sources = (
        ('foot', 'isFootAnkle'),
        ('lower_leg', 'isLowerLeg'),
        ('upper_leg', 'isUpperLeg'),
        ('upper_body', 'isUpperBody'),
        ('arms', 'isArms'),
        ('head', 'isHead'),
        ('minor', 'isMinor'),
    )
    timeline = row['history']

    def base_row():
        # physical traits come first in every row (TODO: come back and deal with age)
        return {trait: row[trait] for trait in phys_data}

    def add_history(target, past):
        # flatten the already-seen injuries into '<field>.<position>' entries
        for position, injury in enumerate(past):
            for field, value in injury.items():
                target[field + '.' + str(position)] = value

    rows = []
    for split in range(len(timeline)):
        past, upcoming = timeline[:split], timeline[split:]
        new_row = base_row()

        # labels describe every injury still to come, each limited to 1 or 0
        labels = dict.fromkeys(y_columns, 0)
        for injury in upcoming:
            for label, field in label_sources:
                labels[label] = max(injury[field], labels[label])
        labels['injury'] = min(sum(labels[label] for label, _ in label_sources), 1.0)
        new_row.update(labels)

        add_history(new_row, past)
        rows.append(new_row)

    if add_last:
        # closing row: complete history, no further injury
        new_row = base_row()
        for label in ('foot', 'lower_leg', 'upper_leg', 'upper_body',
                      'arms', 'head', 'minor', 'injury'):
            new_row[label] = 0.0
        add_history(new_row, timeline)
        rows.append(new_row)
    return rows
            
# try the unrolling on the sample player: show the generated rows, then how many there are
result = unroll_row(test_row)
print(result, len(result), sep='\n')

[{'Age': 30.0, 'Height': 193.0, 'Weight': 87.0, 'Centre-Back': 1.0, 'Right Winger': 0.0, 'Left Winger': 0.0, 'Goalkeeper': 0.0, 'Central Midfield': 0.0, 'Right-Back': 0.0, 'Left-Back': 0.0, 'Defensive Midfield': 0.0, 'Centre-Forward': 0.0, 'Attacking Midfield': 0.0, 'Left Midfield': 0.0, 'Right Midfield': 0.0, 'Second Striker': 0.0, 'foot': 1.0, 'lower_leg': 0.0, 'upper_leg': 1.0, 'upper_body': 1.0, 'arms': 1.0, 'head': 0.0, 'minor': 1.0, 'injury': 1.0}, {'Age': 30.0, 'Height': 193.0, 'Weight': 87.0, 'Centre-Back': 1.0, 'Right Winger': 0.0, 'Left Winger': 0.0, 'Goalkeeper': 0.0, 'Central Midfield': 0.0, 'Right-Back': 0.0, 'Left-Back': 0.0, 'Defensive Midfield': 0.0, 'Centre-Forward': 0.0, 'Attacking Midfield': 0.0, 'Left Midfield': 0.0, 'Right Midfield': 0.0, 'Second Striker': 0.0, 'foot': 1.0, 'lower_leg': 0.0, 'upper_leg': 1.0, 'upper_body': 1.0, 'arms': 1.0, 'head': 0.0, 'minor': 1.0, 'injury': 1.0, 'isArms.0': 0.0, 'isFootAnkle.0': 0.0, 'isHead.0': 0.0, 'isLong.0': 0.0, 'isLowerLeg

In [9]:
# create new dataframe: unroll every player and stack the per-player frames with a single
# concat. (DataFrame.append re-copied the growing frame on every iteration and no longer
# exists in pandas 2.x.) Each player's frame keeps its own 0..k index, as before.
player_frames = [pd.DataFrame(unroll_row(player)) for player in tqdm.tqdm(all_data)]
df = pd.concat(player_frames, sort=False)

# reorganize columns so labels are at end. The column set is the union over all players
# (in first-seen order) rather than the first player's columns, so a player with a shorter
# or longer history than the first one neither raises nor loses columns.
label_cols = [c for c in y_columns if c in df.columns]
new_cols = [c for c in df.columns if c not in label_cols] + label_cols
df = df[new_cols]

# history slots that a row does not have come out of the concat as NaN -> no injury
df = df.fillna(0)
# print(df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1331/1331 [00:46<00:00, 28.72it/s]


In [10]:
# display the unrolled dataset (labels are the trailing columns)
df

Unnamed: 0,Age,Height,Weight,Centre-Back,Right Winger,Left Winger,Goalkeeper,Central Midfield,Right-Back,Left-Back,...,isUpperLeg.13,isVeryShort.13,foot,lower_leg,upper_leg,upper_body,arms,head,minor,injury
0,30.0,193.0,87.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
1,30.0,193.0,87.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
2,30.0,193.0,87.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
3,30.0,193.0,87.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
4,30.0,193.0,87.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,28.0,194.0,90.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,28.0,194.0,90.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,28.0,194.0,90.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,28.0,194.0,90.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# save as csv
# NOTE: to_csv also writes the index as an unnamed first column; the index displayed
# above is not unique (it appears to restart for each player), so it is not a row id
df.to_csv('../data/reworked1.csv')

In [12]:
# player-only view of the raw data: keep the physical/position columns and the labels,
# drop every injury-history column. Everything in x_columns after the phys_data block is
# history, so it is sliced directly instead of dropping inj_hist_data and then patching
# the 8 columns that list leaves out with a hand-written set of names.
history_columns = x_columns[len(phys_data):]
df_nohist = data.drop(columns=history_columns)
df_nohist.to_csv('../data/data_nohist.csv')

In [50]:
# new structure
# player phys data, injury counts, then labels
def test_function(row):
    new_d = {}
    d = row.to_dict()
    for p in phys_data:
        new_d[p] = d[p]
    for inj_f in injury_fields:
        s = 0
        for m in range(max_num_injuries):
            s += d[inj_f + "." + str(m)]
        new_d[inj_f] = s
        
    for y in y_columns:
        new_d[y] = d[y]
    new_d['injury'] = min(new_d['injury'], 1)
    return new_d
#     print(new_d)
#     frame = pd.DataFrame.from_dict(new_d)
# #     print(frame)
#     return frame
       
# summarize every row of df with test_function and collect the results in a new frame
num_rows = len(df.index)
print(num_rows)

# row 0 is converted outside the progress bar, so the bar counts num_rows - 1 steps
new_data = [test_function(df.iloc[0])]
new_data.extend(test_function(df.iloc[pos]) for pos in tqdm.tqdm(range(1, num_rows)))

new_strat = pd.DataFrame(new_data)
print(list(new_strat.columns))

# these five totals are not kept in the final table
dropped_totals = ['isLong', 'isMedium', 'isMinor', 'isShort', 'isVeryShort']
new_strat = new_strat.drop(columns=dropped_totals)
print(list(new_strat.columns))

19980


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19979/19979 [00:10<00:00, 1946.38it/s]


['Age', 'Height', 'Weight', 'Centre-Back', 'Right Winger', 'Left Winger', 'Goalkeeper', 'Central Midfield', 'Right-Back', 'Left-Back', 'Defensive Midfield', 'Centre-Forward', 'Attacking Midfield', 'Left Midfield', 'Right Midfield', 'Second Striker', 'isArms', 'isFootAnkle', 'isHead', 'isLong', 'isLowerLeg', 'isMedium', 'isMinor', 'isShort', 'isUpperBody', 'isUpperLeg', 'isVeryShort', 'foot', 'lower_leg', 'upper_leg', 'upper_body', 'arms', 'head', 'minor', 'injury']
['Age', 'Height', 'Weight', 'Centre-Back', 'Right Winger', 'Left Winger', 'Goalkeeper', 'Central Midfield', 'Right-Back', 'Left-Back', 'Defensive Midfield', 'Centre-Forward', 'Attacking Midfield', 'Left Midfield', 'Right Midfield', 'Second Striker', 'isArms', 'isFootAnkle', 'isHead', 'isLowerLeg', 'isUpperBody', 'isUpperLeg', 'foot', 'lower_leg', 'upper_leg', 'upper_body', 'arms', 'head', 'minor', 'injury']


In [52]:
# forward slashes keep the path usable outside Windows and match the other csv exports
new_strat.to_csv("../data/new_strat.csv")