# **PREPROCESS CATAPULT DATA**

- Adding session id to raw catapult data
- Process catapult data
    1. Concatenating all data per player
    2. Cleaning the heart rate feature
    3. Split concatenated dataset by player
    4. Split each player dataset by session
    5. Get max session length for each player
    6. Pad and normalize session

In [1]:
import pandas as pd
import os
import json
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Adding session id to raw catapult data 
Here we start a new session whenever there is a gap of more than 1 hour between two consecutive Catapult records.

In [None]:
def process_files(directory, output_dir="data/test/catapult_raw_with_session"):
    """Add a ``session_id`` column to every raw Catapult CSV file of a folder.

    Each file is sorted by its ``hour-minutes-second`` column and a new
    session starts every time two consecutive records are more than one
    hour apart. Session ids start at 1 within each file.

    Args:
        directory: Folder containing the raw CSV files.
        output_dir: Folder where the annotated files are written under the
            same file names. It is created when missing. The default is the
            location this function historically always wrote to.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file in sorted(os.listdir(directory)):
        # Ignore anything that is not a CSV file (hidden files, sub-folders...)
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(directory, file)
        df = pd.read_csv(file_path)
        # Parse explicitly: the `date_parser` argument of read_csv is deprecated
        df['hour-minutes-second'] = pd.to_datetime(df['hour-minutes-second'], format='%H-%M-%S')

        # Sort by time of day
        df = df.sort_values(by='hour-minutes-second')

        # Start a new session on every gap longer than one hour.
        # total_seconds() is the full gap; `.dt.seconds` is only the seconds
        # component of the timedelta.
        gap_seconds = df['hour-minutes-second'].diff().dt.total_seconds().fillna(0)
        df['session_id'] = (gap_seconds > 3600).cumsum() + 1

        # Save the annotated file
        df.to_csv(os.path.join(output_dir, file), index=False)

In [None]:
# Tag the raw training files, then the raw test files, with session ids.
for raw_directory in (
    "../data/dataset_9/catapult_data",
    "../data/dataset_9/test_data/catapult_data",
):
    process_files(raw_directory)

# Useful functions

In [3]:
def drop_useless_column(df):
    """Return a copy of ``df`` without the columns the pipeline does not use.

    The input dataframe is left untouched; a ``KeyError`` is raised by pandas
    if one of the listed columns is absent.
    """
    unused_columns = [
        'hour-minutes-second',
        "Centiseconds",
        "x",
        "y",
        "Metabolic power",
        'Latitude',
        'Longitude',
    ]
    return df.drop(columns=unused_columns)

def concat_csv_files(folder_path):
    """Load every ``.csv`` file of ``folder_path`` into one dataframe.

    Each file is read, stripped of its unused columns with
    ``drop_useless_column`` and the results are stacked with a fresh
    0..n-1 index. Files are taken in the order returned by ``os.listdir``.
    """
    frames = [
        drop_useless_column(pd.read_csv(os.path.join(folder_path, filename)))
        for filename in os.listdir(folder_path)
        if filename.endswith(".csv")
    ]
    return pd.concat(frames, ignore_index=True)

def fill_missing_heart_rate_values(df):
    """Replace invalid ``Heart rate`` readings in place, device by device.

    A reading below 2.0 is considered invalid. It is replaced by the reading
    of the next row of the same ``catapult_id`` when that one is above 2.0,
    otherwise by the reading of the previous row of the same device. Values
    that are still missing afterwards are set to 0.0.

    Args:
        df: Dataframe with ``catapult_id`` and ``Heart rate`` columns. The
            ``Heart rate`` column is overwritten; nothing is returned.
    """
    # Work on a copy so the neighbours are always read from the raw values
    filled = df["Heart rate"].copy()

    # Cells to repair
    mask = filled < 2.0

    # Neighbouring readings within each device. `transform` always returns a
    # Series aligned on df.index, whereas `apply` returns a group-keyed
    # MultiIndex on recent pandas versions, which breaks the boolean mask.
    per_device = df.groupby('catapult_id')["Heart rate"]
    next_valid = per_device.transform(lambda x: x.shift(-1).ffill())
    prev_valid = per_device.transform(lambda x: x.shift(1).bfill())

    # Prefer the next reading, fall back to the previous one
    filled.loc[mask] = next_valid[mask].where(next_valid[mask] > 2.0, prev_valid[mask])

    # Cells without any usable neighbour end up at 0
    filled = filled.fillna(0.0)

    df['Heart rate'] = filled

def heart_rate(df):
    """Replace ``Heart rate`` by its ratio to the per-device maximum.

    For every ``catapult_id`` the highest ``Heart rate`` is looked up, and
    each reading is divided by it. The result is stored in a new
    ``Heart_rate_Ratio`` column and the raw ``Heart rate`` column is removed.
    The dataframe is modified in place and also returned.
    """
    # Per-row maximum of the device the row belongs to
    device_max = df.groupby('catapult_id')['Heart rate'].transform('max')

    df['Heart_rate_Ratio'] = df['Heart rate'] / device_max
    df.drop(columns=["Heart rate"], inplace=True)
    return df

def create_player_dfs(player_info_data_path, concatenated_data):
    """Split the concatenated data into one dataframe per player.

    The ``catapult_id`` values of ``concatenated_data`` are replaced (in the
    passed dataframe itself) by the matching ``player_id`` read from the
    player-info CSV. The rows are then grouped on that column.

    Returns:
        A dictionary keyed by player id; each value is that player's rows
        without the ``catapult_id`` column.
    """
    info = pd.read_csv(player_info_data_path)
    id_mapping = {
        catapult_id: player_id
        for catapult_id, player_id in zip(info['catapult_id'], info['player_id'])
    }
    concatenated_data['catapult_id'] = concatenated_data['catapult_id'].replace(id_mapping)

    frames_by_player = {}
    for player_id, player_rows in concatenated_data.groupby('catapult_id'):
        frames_by_player[player_id] = player_rows.drop(columns=["catapult_id"])
    return frames_by_player

def seperate_by_session(players_df, out_path):
    """Write one CSV file per player and per training session.

    A session is identified by the pair (``n_date``, ``session_id``). Files
    are written to ``<out_path>/<player_id>/<n_date>_session_<session_id>.csv``.

    Args:
        players_df: Dictionary mapping a player id to that player's dataframe,
            which must contain the ``n_date`` and ``session_id`` columns.
        out_path: Root output folder. It and the player sub-folders are
            created when they do not exist yet.
    """
    for player_id, df in players_df.items():
        player_folder = os.path.join(out_path, str(player_id))

        for (date, session_id), group in df.groupby(['n_date', 'session_id']):
            # makedirs also creates out_path itself, which os.mkdir cannot do
            os.makedirs(player_folder, exist_ok=True)
            group.to_csv(f'{player_folder}/{date}_session_{session_id}.csv', index=False)

def get_max_length_by_player(file_path, out_path):
    """Compute the padded session length of every player and save it as JSON.

    For each player sub-folder of ``file_path`` the longest session (number
    of rows of its CSV files) is found and converted to
    ``4000 * (length // 4000 + 1)``, i.e. the next multiple of 4000 strictly
    above the length.

    Args:
        file_path: Folder containing one sub-folder per player.
        out_path: Path of the JSON file to write.

    Returns:
        Dictionary mapping each player folder name to its padded length.
    """
    max_length_by_player = {}

    for player_file in os.listdir(file_path):
        player_file_path = os.path.join(file_path, player_file)

        # Skip plain files: the JSON written below may live in this very folder
        if not os.path.isdir(player_file_path):
            continue

        longest = 0
        for session_csv in os.listdir(player_file_path):
            if not session_csv.endswith(".csv"):
                continue
            n_rows = pd.read_csv(os.path.join(player_file_path, session_csv)).shape[0]
            longest = max(longest, n_rows)

        max_length_by_player[player_file] = 4000 * (int(longest // 4000) + 1)

    with open(out_path, 'w') as fichier_json:
        json.dump(max_length_by_player, fichier_json)

    return max_length_by_player

def pad_session(session, max_size_session):
    """Left-pad a session with zero rows up to ``max_size_session`` rows.

    The padding rows hold 0 in every column except ``n_date`` and
    ``session_id``, which repeat the values of the session's first row. The
    padding is placed before the real data and the index is renumbered.
    """
    missing_rows = max_size_session - len(session)
    padding = pd.DataFrame(0, index=range(missing_rows), columns=session.columns).assign(
        n_date=session["n_date"].iloc[0],
        session_id=session["session_id"].iloc[0],
    )
    return pd.concat([padding, session], ignore_index=True)

def pad_and_normalize_all_sessions(file_path, out_path, max_length_by_player):
    """Pad every session of every player, standardize, and save one CSV per player.

    Padding means adding rows of zeros at the beginning of each session so
    that all sessions of a player have ``max_length_by_player[player]`` rows.
    The padded sessions of a player are stacked, every feature column is
    standardized with a ``StandardScaler`` fitted on that stacked data, and
    the ``n_date`` / ``session_id`` columns are appended unscaled.

    Args:
        file_path: Folder containing one sub-folder of session CSVs per player.
        out_path: Output folder; ``<player>.csv`` is written inside it. It is
            created when missing.
        max_length_by_player: Dictionary mapping a player folder name to the
            padded session length.
    """
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    for player_file in os.listdir(file_path):
        player_file_path = os.path.join(file_path, player_file)

        # Skip plain files such as a max_length_by_player.json stored next to
        # the player folders; os.listdir would fail on them.
        if not os.path.isdir(player_file_path):
            continue

        session_df_list = [
            pad_session(
                pd.read_csv(os.path.join(player_file_path, half_day_csv)),
                max_length_by_player[player_file],
            )
            for half_day_csv in os.listdir(player_file_path)
        ]
        # pd.concat raises on an empty list, so skip players without sessions
        if not session_df_list:
            continue

        df_concat = pd.concat(session_df_list, axis=0, ignore_index=True)

        # Session identifiers are not features: keep them out of the scaler
        features = df_concat.drop(["n_date", "session_id"], axis=1)

        # NOTE(review): the scaler is fitted on the padding rows too, so the
        # zero padding is no longer zero after scaling - confirm this is intended.
        scaler = StandardScaler()
        normalized_df = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

        normalized_df["n_date"] = df_concat["n_date"]
        normalized_df["session_id"] = df_concat["session_id"]
        # os.path.join works whether or not out_path ends with a separator
        normalized_df.to_csv(os.path.join(out_path, player_file + ".csv"), index=False)

# Process catapult data

We separate the processing into 2 parts to avoid kernel crashes. If needed, there are commented-out lines that save checkpoints.
1. Training data (~2 hours)
2. Test data (~20 min)

### 1. Training data (~2 hours)

1. Concatenating all data per player
2. Cleaning the heart rate feature
3. Split concatenated dataset by player
4. Split each player dataset by session
5. Get max session length for each player
6. Pad and normalize session

In [14]:
# Input folder (raw files with session ids) and folder receiving the per-player sessions
raw_data_path = "data/catapult_raw_with_session/"
session_path = "data/catapult_by_player_by_session/"

# Load every raw CSV file into a single dataframe
concatenated_data = concat_csv_files(folder_path=raw_data_path)

In [15]:
# Series.fillna returns a new Series: assign it back, otherwise the call has no effect
concatenated_data["Heart rate"] = concatenated_data["Heart rate"].fillna(0)
concatenated_data = heart_rate(concatenated_data)
#concatenated_data.to_csv("data/checkpoints/concatenated_data.csv")

In [17]:
# Build a dictionary holding one catapult dataframe per player
players_df = create_player_dfs(
    player_info_data_path="../data/dataset_9/player_info.csv",
    concatenated_data=concatenated_data,
)
#for key in players_df:
#    players_df[key].to_csv(f"data/checkpoints/player_df/{key}.csv")

In [18]:
# Separate and save each session, for each player
seperate_by_session(players_df=players_df, out_path=session_path)

In [19]:
# Get the max length of training session for each player.
# The JSON summary is written inside session_path, next to the player folders.
max_length_json_path = session_path + 'max_length_by_player.json'
max_length_by_player = get_max_length_by_player(session_path, max_length_json_path)

In [None]:
# Pad and normalize each session, then save the final datasets
pad_and_normalize_all_sessions(
    file_path=session_path,
    out_path="data/catapult_by_player_normalized/",
    max_length_by_player=max_length_by_player,
)

### 2. Test data (~20 min)

In [None]:
# Input folder (raw test files with session ids) and folder receiving the per-player sessions
raw_data_path = "data/test/catapult_raw_with_session/"
session_path = "data/test/catapult_by_player_by_session/"

# Load every raw CSV file into a single dataframe
concatenated_data = concat_csv_files(folder_path=raw_data_path)

In [None]:
# Series.fillna returns a new Series: assign it back, otherwise the call has no effect
concatenated_data["Heart rate"] = concatenated_data["Heart rate"].fillna(0)
concatenated_data = heart_rate(concatenated_data)
#concatenated_data.to_csv("data/test/checkpoints/concatenated_data.csv")

In [None]:
# Build a dictionary holding one catapult dataframe per player
players_df = create_player_dfs(
    player_info_data_path="../data/dataset_9/player_info.csv",
    concatenated_data=concatenated_data,
)
#for key in players_df:
#    players_df[key].to_csv(f"data/test/checkpoints/player_df/{key}.csv")

In [None]:
# Separate and save each session, for each player
seperate_by_session(players_df=players_df, out_path=session_path)

In [None]:
# Get the max length of training session for each player.
# The JSON summary is written inside session_path, next to the player folders.
max_length_json_path = session_path + 'max_length_by_player.json'
max_length_by_player = get_max_length_by_player(session_path, max_length_json_path)

In [None]:
# Pad and normalize each session, then save the final datasets
pad_and_normalize_all_sessions(
    file_path=session_path,
    out_path="data/test/catapult_by_player_normalized/",
    max_length_by_player=max_length_by_player,
)