# **PREPROCESS JM (JOUEUR-METEO) DATA**

- Useful functions
- Merging and normalization datasets 
- Split by player_id
- Remove inconsistent date data

In [1]:
import pandas as pd
import os
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import defaultdict

### Useful functions

In [5]:
def half_day(h):
    if h >= datetime.strptime("12:00",'%H:%M') and h < datetime.strptime("23:00",'%H:%M') :
        return "PM"
    
    elif h >= datetime.strptime("08:00",'%H:%M') and h < datetime.strptime("12:00",'%H:%M'):
        return "AM"
    

def preprocess_meteo(meteo_data):

    meteo_data['Heure'] = pd.to_datetime(meteo_data['Heure'], format='%H:%M')
    
    meteo_data['AM/PM'] = meteo_data['Heure'].apply(half_day)
    
    meteo_data_processed = meteo_data.drop(['Heure', 'is_day ()'], axis=1)
    
    meteo_data_processed = meteo_data_processed.groupby(['time', 'AM/PM']).mean()
    
    return meteo_data_processed


def merge_datasets(data_rpe, data_meteo, data_anthropo):

    # set the right types and indexes
    data_rpe['n_date'] = pd.to_datetime(data_rpe["n_date"])
    data_meteo.reset_index(inplace=True)
    data_meteo['time'] = pd.to_datetime(data_meteo['time'])

    # merge rpe and meteo
    merged_data=pd.merge(data_rpe,data_meteo, left_on=["n_date","am-pm"], right_on=["time","AM/PM"], how='left')
    merged_data.drop(["time", "AM/PM"], axis=1, inplace=True)


    merged_data['n_date'] = pd.to_datetime(merged_data['n_date'])
    data_anthropo['n_date'] = pd.to_datetime(data_anthropo['n_date'])
    data_anthropo = data_anthropo.sort_values(['n_date', 'player_id'])

    # merge with anthropo (approximative on date, exact on player_id)
    merged_data = pd.merge_asof(merged_data, data_anthropo, on='n_date', by='player_id', direction='forward')

    return merged_data

## Merging and normailze datasets 

In [7]:
DIR = "../data/dataset_9/"

def merge_and_normalize(is_train=True):
    TEST_DATA_DIR = ""  
    TEST_DIR = ""
    if not is_train : 
        TEST_DATA_DIR = "/test_data" 
        TEST_DIR = "/test" 

    data_anthropo = pd.read_csv(DIR + "data_anthropo.csv")
    meteo_data = pd.read_csv(DIR + "meteo_data.csv")
    RPE_TRAIN_FINAL = pd.read_csv(DIR + TEST_DATA_DIR + "/RPE_TEST_FINAL.csv")

    # Process weather data
    meteo_processed = preprocess_meteo(meteo_data)

    # Merging
    merged_df = merge_datasets(data_rpe=RPE_TRAIN_FINAL, data_meteo=meteo_processed, data_anthropo=data_anthropo)
    merged_df = merged_df.drop(["hour", "IS_TEST", 'team'], axis=1)
    merged_df = merged_df.replace('#DIV/0!', np.nan)

    # Normalization
    scaler = StandardScaler()
    data_final_copy = merged_df
    normalized_df = pd.DataFrame(scaler.fit_transform(merged_df.drop(["RPE",'player_id','n_date','am-pm'], axis=1)), columns=merged_df.drop(['RPE','player_id','n_date','am-pm'], axis=1).columns)
    normalized_df["RPE"] = merged_df["RPE"]
    normalized_df['player_id'] = merged_df['player_id']
    normalized_df['n_date'] = merged_df['n_date']
    normalized_df['am-pm'] = merged_df['am-pm']

    # Save
    normalized_df.to_csv("data" + TEST_DATA_DIR + "/processed_JM.csv", index=False)

In [None]:
merge_and_normalize(is_train=True)
merge_and_normalize(is_train=False)

## Split by player ID

In [None]:
def create_session_id(df):
    df['session_id'] = df.groupby('n_date').cumcount() + 1
    df.drop(['am-pm', 'player_id'], axis=1, inplace=True)


def split_by_player(in_path, out_path):

    df = pd.read_csv(in_path).fillna(0)
    group_by_player = df.groupby(['player_id'])
    jm_by_player_dict = {}

    for player_id, jm_data in group_by_player:   
        player_id = player_id[0]
        create_session_id(jm_data)

        jm_by_player_dict[player_id] = jm_data
        player_folder = os.path.join(out_path, str(player_id))

        if not os.path.exists(player_folder):
            os.mkdir(player_folder)
        
        jm_data.to_csv(f'{player_folder}/{player_id}.csv', index=False)
        
        print(f'{player_id}: {jm_data.shape[0]}')
    return jm_by_player_dict

In [None]:
jm_by_player_dict = split_by_player("data/processed_JM.csv", "data/jm_by_player/")
jm_by_player_dict_test = split_by_player("data/test/processed_JM.csv", "data/test/jm_by_player/")

## Remove inconsistent date data

In [None]:
# Step 1: Convert the list of file name (2021-02-10_session_1) to dict {"2021-02-10": 1}
def list_to_dictionary(file_list):
    date_dictionary = defaultdict(int)
    for file in file_list:
        date, session = file.split('_session_')
        if session.startswith('1'):
            date_dictionary[date] += 1
    return dict(date_dictionary)

# Here we delete dates that are not in both Catapult and JM data
def adjust_dataframe_and_keep_dates(df, date_dictionary):
    to_delete_dates = []
    for date, expected_count in date_dictionary.items():
        actual_count = df[df['n_date'] == date].shape[0]
        if actual_count != expected_count:
            to_delete_dates.append(date)  # Keep the date in a list
    for date in df['n_date'].tolist():
        if date not in date_dictionary.keys():
            to_delete_dates.append(date)
    for date in to_delete_dates:
        df = df[df['n_date'] != date]  # Remove all rows corresponding to this date

    return df, to_delete_dates


['2020-12-04', '2020-12-28', '2021-02-02', '2021-02-10', '2021-02-17', '2021-03-03', '2021-03-16', '2021-03-23', '2021-03-31', '2021-04-02', '2021-04-06', '2021-01-05', '2021-02-05', '2021-02-09', '2021-02-25', '2021-04-20', '2021-04-20']
17


### TRAIN REMOVE 

In [None]:
player_folder = os.listdir("data/catapult_by_player_by_session")

for player_id in player_folder:
    print(player_id)
    session_files = os.listdir("data/catapult_by_player_by_session/" + str(player_id))
    try:
        df = pd.read_csv(f"data/jm_by_player/{player_id}/{player_id}.csv")
    except:
        print(f"No player found: {player_id}")

    dictionnaire_dates = list_to_dictionary(session_files)
    df_ajuste, to_delete_dates = adjust_dataframe_and_keep_dates(df, dictionnaire_dates)
    
    catapult = pd.read_csv("data/catapult_by_player_normalized/" + player_id + ".csv")

    catapult = catapult[~catapult['n_date'].isin(to_delete_dates)]
    
    df_ajuste.to_csv("data/jm_clean_by_player/" + player_id + ".csv")
    catapult.to_csv("data/catapult_by_player_final/" + player_id + ".csv")

### TEST REMOVE 

In [None]:
player_folder = os.listdir("data/test/catapult_by_player_by_session")

for player_id in player_folder:
    #if (player_id in already_done or player_id == '27943'):
    #    continue
    print(player_id)
    session_files = os.listdir("data/test/catapult_by_player_by_session/" + str(player_id))
    df = pd.read_csv(f"data/test/jm_by_player/{player_id}/{player_id}.csv")

    dictionnaire_dates = list_to_dictionary(session_files)
    df_ajuste, to_delete_dates = adjust_dataframe_and_keep_dates(df, dictionnaire_dates)
    print(to_delete_dates)
    
    catapult = pd.read_csv("data/test/catapult_by_player_normalized/" + player_id + ".csv")
    print(catapult.shape)

    catapult = catapult[~catapult['n_date'].isin(to_delete_dates)]
    print(catapult.shape)
    
    df_ajuste.to_csv("data/test/jm_clean_by_player/" + player_id + ".csv")
    catapult.to_csv("data/test/catapult_by_player_final/" + player_id + ".csv")