### I- Importations 

In [1]:
import pandas as pd
import os
from datetime import datetime

### II- Import our CSV

We give the directory where all our data is stockage

In [2]:
DIR = "../data/dataset_9/"

In [3]:
data_anthropo = pd.read_csv(DIR+"data_anthropo.csv")
meteo_data = pd.read_csv(DIR+"meteo_data.csv")
player_info = pd.read_csv(DIR+"player_info.csv")
catapult_dir =  DIR+"test_data/catapult_data/"
RPE_TEST_FINAL = pd.read_csv(DIR+"test_data/RPE_TEST_FINAL.csv")

### III- Work on Data_anthropo

We value of the column date to datetime objects and after that we extract only the date component.

In [4]:
data_anthropo["n_date"] = pd.to_datetime(data_anthropo["n_date"]).dt.date

### IV- Work on Meteo_data

We create a new function that differentiates between morning and afternoon depending on the time of day

In [5]:
def half_day(h):
    if h>=datetime.strptime("12:00",'%H:%M') and h<datetime.strptime("23:00",'%H:%M') :
        return "PM"
    
    elif h>=datetime.strptime("08:00",'%H:%M') and h<datetime.strptime("12:00",'%H:%M'):
        return "AM"

In [6]:
def preprocess_meteo(meteo_data):
    '''
    Function to add a column with AM and PM and drop column Heure and is.day().
    '''
    meteo_data['Heure'] = pd.to_datetime(meteo_data['Heure'], format='%H:%M')
    
    meteo_data['AM/PM'] = meteo_data['Heure'].apply(half_day)
    
    meteo_data_processed = meteo_data.drop(['Heure', 'is_day ()'], axis=1)
    
    meteo_data_processed = meteo_data_processed.groupby(['time', 'AM/PM']).mean()
    
    return meteo_data_processed

### V- Work on Catapult_data

In [7]:
def concat_csv_files(folder_path):
    '''
    Function to load all Catapult files and concatenate them.
    '''    
    # We create a list to stock dataframes of each CSV
    dfs = []

    # We go throught all files of the folder.
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            
            # We read the csv and we add the dataframe to the list.
            df = pd.read_csv(file_path)
            dfs.append(df)

    # Concatenate all DataFrames into a single one
    result_df = pd.concat(dfs, ignore_index=True)

    return result_df

In [8]:
def heart_rate(df):
    
    # Group by 'catapult_id' and calculate the maximum 'Heart rate' value for each group
    max_heart_rate = df.groupby('catapult_id')['Heart rate'].max()

    # Create a new 'Heart_rate_Max' column in the initial DataFrame
    df['Heart_rate_Max'] = df['catapult_id'].map(max_heart_rate)
    
    # Calculate the ratio of 'Heart rate' to 'Heart_rate_Max' and create a new 'Heart_rate_Ratio' column
    df['Heart_rate_Ratio'] = df['Heart rate'] / df['Heart_rate_Max']
    
    return df

In [9]:
def montpellier_or_not(df):
    """
    This function separates training sessions (and matches) that took place in Montpellier from those that did not.
    """
    # Create conditions to filter the DataFrame
    
    conditions = (
        (df['Latitude'] >= 43.37) & (df['Latitude'] <= 43.75) &
        (df['Longitude'] >= 3.52) & (df['Longitude'] <= 4.25)
    )

    # Filter the DataFrame and create a new DataFrame with the deleted rows
    montpellier_df = df[conditions]

    return montpellier_df

In [10]:
def create_trainning(data):
    """
    Function which formats time columns.
    """
    # Convert 'hour-minutes-second' column to 'H:M:S' format
    data['hour-minutes-second'] = data['hour-minutes-second'].astype(str).str.replace('-', ':')

    # Convert the 'hour-minutes-second' column into datetime format with separate hours, minutes and seconds
    data['hour-minutes-second'] = pd.to_datetime(data['hour-minutes-second'], format='%H:%M:%S', errors='coerce')

    # Ensure that the column is of type datetime64[ns].
    data['hour-minutes-second'] = pd.to_datetime(data['hour-minutes-second'], errors='coerce')

    data['Hour'] = data['hour-minutes-second'].dt.hour
    
    # Add column 'AM/PM'
    data['am-pm'] = 'AM' 
    data.loc[data['Hour'] >= 13, 'am-pm'] = 'PM'

    return data

In [11]:
def fill_missing_heart_rate_values(df):
    # Créer une série pour stocker les valeurs remplacées
    filled = df["Heart rate"].copy()

    # Identifier les cellules à remplir
    #mask = filled < 2.0
    mask = filled.reset_index(drop=True) < 2.0

    # Trouver les valeurs suivantes et précédentes valides
    next_valid = df.groupby('catapult_id')["Heart rate"].apply(lambda x: x.shift(-1).ffill())
    prev_valid = df.groupby('catapult_id')["Heart rate"].apply(lambda x: x.shift(1).bfill())

    # Choisir la valeur suivante ou précédente en fonction de la condition
    filled[mask] = next_valid[mask].where(next_valid[mask] > 2.0, prev_valid[mask])

    # Assurer que les cellules sans valeurs valides soient mises à 0
    filled.fillna(0.0, inplace=True)
    
    df['Heart rate'] = filled

In [12]:
def catapult_preprocess(folder_path):
    
    # Concatenate all catapults files  
    df = concat_csv_files(folder_path)
    
    # Drop column centisecond, x and y 
    df.drop(['Centiseconds','x','y'], axis=1, inplace=True)
    
    # Fill columns empty
    #fill_missing_heart_rate_values(df)

    # Create a new feature of Heart ratio, which will be more revelant 
    df_heart = heart_rate(df)
    
    # Drop column Heart rate Max, Heart rate, Latitude and Longitude
    df_heart.drop(['Heart_rate_Max', 'Heart rate', 'Latitude', 'Longitude'], axis=1, inplace=True)
    
    # Create a new feature of AM/PM to differentiate trainning of morning and evening
    df_train = create_trainning(df_heart)
    
    # Supprimer la colonne originale 'hour-minutes-second'
    df_train.drop(['Hour','hour-minutes-second'], axis=1, inplace=True)

    return df_train


In [13]:
folder_path = '../data/dataset_9/test_data/catapult_data/'

df = catapult_preprocess(folder_path)

In [14]:
df = df.groupby(['n_date', 'am-pm', 'catapult_id']).agg({
    'Odometer': 'last',
    'Velocity': 'mean',
    'Acceleration': 'mean',
    'PlayerLoad': 'last',
    'Metabolic power': 'mean',
    'Smooth Load': 'mean',
    'Heart_rate_Ratio': 'mean'
}).reset_index()

In [15]:
data_catapult_processed = pd.merge(df, player_info, on='catapult_id', how='inner')

In [16]:
RPE_TEST_FINAL["n_date"] = pd.to_datetime(RPE_TEST_FINAL["n_date"]).dt.date

In [17]:
RPE_TEST_FINAL["duration"].fillna(RPE_TEST_FINAL['duration'].mean(), inplace=True)

## Merging

In [18]:
RPE_TEST_FINAL['n_date'] = pd.to_datetime(RPE_TEST_FINAL["n_date"])

In [19]:
meteo_data_processed = preprocess_meteo(meteo_data)
meteo_data_processed = meteo_data_processed.reset_index()

In [20]:
meteo_data_processed['time'] = pd.to_datetime(meteo_data_processed['time'])

In [21]:
data_global = pd.merge(RPE_TEST_FINAL, meteo_data_processed, left_on=["n_date","am-pm"], right_on=["time","AM/PM"], how='left')

In [22]:
data_global = data_global.drop(["time", "AM/PM"], axis=1)

In [23]:
data_global['n_date'] = pd.to_datetime(data_global['n_date'])
data_anthropo['n_date'] = pd.to_datetime(data_anthropo['n_date'])
data_anthropo = data_anthropo.sort_values(['n_date', 'player_id'])

In [24]:
data_global = pd.merge_asof(data_global, data_anthropo, on='n_date', by='player_id', direction='forward')

In [25]:
data_catapult_processed['n_date'] = pd.to_datetime(data_catapult_processed['n_date'])

In [26]:
data_global = pd.merge(data_global, data_catapult_processed, on=['n_date', 'am-pm', 'player_id'], how='inner')

In [27]:
data_final = data_global.drop(["hour", "IS_TEST", 'team', 'catapult_id', 'am-pm', 'position_code', 'full_position_code'], axis=1)

### Previous RPE

In [28]:

def add_previous_rpe(rpe_data,n_previous):
    # Sort the DataFrame by 'player_id' and 'n_date' to ensure correct ordering
    rpe_data.sort_values(by=['player_id', 'n_date'], inplace=True)

    # Create new columns for n_previous previous RPE values
    for i in range(1, n_previous + 1):
        col_name = f'previous_RPE_{i}'
        rpe_data[col_name] = rpe_data.groupby('player_id')['RPE'].shift(i)
        rpe_data[col_name] = rpe_data.groupby('player_id')[col_name].fillna(method='ffill')

    return rpe_data



In [29]:
data_final.fillna(0, inplace=True)
data_final.replace('#DIV/0!', 0, inplace=True)

In [30]:
# Spécifiez le chemin du fichier CSV où vous souhaitez sauvegarder le DataFrame
output_csv_path = "/home/docker/code_RF/dataset_rf_test"
# Utilisez la méthode to_csv pour sauvegarder le DataFrame dans un fichier CSV
data_final.to_csv(output_csv_path, index=False)

In [31]:
add_previous_rpe(data_final,5)
# Spécifiez le chemin du fichier CSV où vous souhaitez sauvegarder le DataFrame
output_csv_path = "/home/docker/code_RF/dataset_rf_test_with_5_previous"
# Utilisez la méthode to_csv pour sauvegarder le DataFrame dans un fichier CSV
data_final.to_csv(output_csv_path, index=False)

  rpe_data[col_name] = rpe_data.groupby('player_id')[col_name].fillna(method='ffill')
  rpe_data[col_name] = rpe_data.groupby('player_id')[col_name].fillna(method='ffill')
  rpe_data[col_name] = rpe_data.groupby('player_id')[col_name].fillna(method='ffill')
  rpe_data[col_name] = rpe_data.groupby('player_id')[col_name].fillna(method='ffill')
  rpe_data[col_name] = rpe_data.groupby('player_id')[col_name].fillna(method='ffill')
