In [47]:
from pathlib import Path

import pandas as pd
import numpy as np

In [48]:
# Locations of the raw rider health CSV files, relative to the notebook's
# working directory. The paths are assembled from components rather than a
# backslash-separated string: "\" is a path separator only on Windows, so the
# previous literal resolved to a single oddly named file on Linux/macOS.
INJURIES_FILE_PATH = (Path("..") / "Data" / "riderInjuries.csv").resolve()
# NOTE: despite the *_NAME suffix this is a full resolved path, like the one above.
ILLNESSES_FILE_NAME = (Path("..") / "Data" / "riderIllnesses.csv").resolve()

In [49]:
"""This module is responsible to prepare the rider injuries and rider illness data"""
import pandas as pd
from pathlib import Path

def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
    """
    Replace missing 'disrupt' and 'score' values with 0.

    The input frame is not modified; a new DataFrame is returned, so calling
    this function never changes a frame that an earlier cell still refers to.

    Args:
        data: Frame that contains the 'disrupt' and 'score' columns.

    Returns:
        A new DataFrame with the same columns (and column order) as ``data``
        in which NaN entries of 'disrupt' and 'score' are replaced by 0.

    Raises:
        KeyError: If 'disrupt' or 'score' is not a column of ``data``.
    """
    # ``assign`` works on a copy, which keeps the caller's frame intact while
    # the explicit column look-ups still fail loudly when a column is absent.
    return data.assign(
        disrupt=data['disrupt'].fillna(0),
        score=data['score'].fillna(0),
    )


def prepare_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare raw rider health records for modeling.

    The steps, applied to a copy so that the caller's frame stays untouched:

    1. drop columns that contain no values at all,
    2. rename 'rider' to 'cyclist_id' and cast it to int64,
    3. encode 'disrupt' as 1 ("yes") / 0 ("no"); any other value becomes NaN,
    4. reduce 'date' to plain ``datetime.date`` objects,
    5. sort by cyclist and date and renumber the index.

    Args:
        data: Raw records with at least 'rider', 'date' and 'disrupt' columns.

    Returns:
        A new, sorted DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: If a required column is missing (including the case where
            'disrupt' is entirely empty and is therefore dropped in step 1).
        ValueError: If 'cyclist_id' holds values that cannot be cast to int64
            (for example NaN) or 'date' cannot be parsed.
    """
    prepared = (
        data
        # Remove empty columns (returns a new frame instead of editing ``data``)
        .dropna(axis=1, how='all')
        # rename 'rider' column to 'cyclist_id'
        .rename(columns={'rider': 'cyclist_id'})
        .assign(
            # change column 'disrupt' to int (i.e. "yes" to 1 and "no" to 0)
            disrupt=lambda frame: frame['disrupt'].map({'yes': 1, 'no': 0}),
            date=lambda frame: pd.to_datetime(frame['date']).dt.date,
            # change 'cyclist_id' to int64
            cyclist_id=lambda frame: frame['cyclist_id'].astype('int64'),
        )
        # sort the data by rider and date
        .sort_values(by=['cyclist_id', 'date'])
        .reset_index(drop=True)
    )
    return prepared


def aggregate_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the records to a single row per (cyclist_id, date) pair.

    Within each pair the 'score' values are summed and the largest 'disrupt'
    value is kept. The grouping keys come back as ordinary columns.
    """
    per_rider_day = data.groupby(['cyclist_id', 'date'])
    # Named aggregation: output column = (source column, reducer)
    daily_totals = per_rider_day.agg(
        score=('score', 'sum'),
        disrupt=('disrupt', 'max'),
    )
    return daily_totals.reset_index()


def add_missing_days(health_data: pd.DataFrame, agg_workouts: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the health records to one row per rider and calendar day.

    For every rider found in ``health_data`` that also appears in
    ``agg_workouts``, a continuous daily calendar is built from that rider's
    first to last workout date and the health records are left-joined onto it.
    Days without a health record therefore appear with NaN in the health
    columns. Health records dated outside the rider's workout span are dropped
    by the left join.

    Riders that have health records but no workouts are skipped, and riders
    that only appear in ``agg_workouts`` are not part of the result.

    Args:
        health_data: Frame with 'cyclist_id' and 'date' columns plus the
            health columns to carry over. 'date' must compare equal to
            ``datetime.date`` objects for the join to match.
        agg_workouts: Frame with 'cyclist_id' and 'date' columns; 'date' must
            be accepted by ``pd.date_range`` (dates or parseable strings).

    Returns:
        A DataFrame with columns 'date', 'cyclist_id' and the remaining
        ``health_data`` columns, indexed 0..n-1. When no rider qualifies, an
        empty frame with those columns is returned instead of raising.
    """
    # Split the workout dates by rider once, instead of re-scanning the whole
    # workouts frame (unique() + boolean filter) for every single rider.
    workout_dates_by_rider = {
        rider: dates
        for rider, dates in agg_workouts.groupby('cyclist_id')['date']
    }

    per_rider_frames = []
    for rider in health_data['cyclist_id'].unique():
        rider_dates = workout_dates_by_rider.get(rider)
        if rider_dates is None:
            # No workouts for this rider, so there is no calendar to build.
            continue
        date_range = pd.date_range(start=rider_dates.min(), end=rider_dates.max()).date
        complete_df = pd.DataFrame({'date': date_range,
                                    'cyclist_id': [rider] * len(date_range)})
        per_rider_frames.append(
            pd.merge(complete_df, health_data, on=['date', 'cyclist_id'], how='left')
        )

    if not per_rider_frames:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that still has the column layout callers expect.
        health_columns = [col for col in health_data.columns
                          if col not in ('date', 'cyclist_id')]
        return pd.DataFrame(columns=['date', 'cyclist_id', *health_columns])

    return pd.concat(per_rider_frames).reset_index(drop=True)

In [50]:
# Load the raw rider injury and illness records from the CSV paths set above.
injuries_data = pd.read_csv(INJURIES_FILE_PATH)
illnesses_data = pd.read_csv(ILLNESSES_FILE_NAME)

In [51]:
# Sanity check: compare a reference list of rider ids against the riders that
# occur in the raw injury / illness files.
# x is hard-coded; the printed messages call it "agg", so it is presumably the
# set of riders in the aggregated workouts file — TODO confirm and derive it
# from that data instead of maintaining the list by hand.
x = [653, 1054, 1116, 1304, 2239, 2434, 2460, 2581, 2671, 2748, 3666, 3982, 4533, 4865, 5393, 5428, 5799, 5832, 5925, 6163, 6312, 22629, 22630, 22631, 22632, 22633, 22634, 22635, 22636]

# Distinct rider ids present in each raw file ('rider' is the pre-rename column).
y = injuries_data['rider'].unique()
t = illnesses_data['rider'].unique()

# Reference riders that have no record at all in the respective file.
print(f"Missing riders in injuries data: {[z for z in x if z not in y]}")
print(f"Missing riders in illnesses data: {[z for z in x if z not in t]}")

# Riders with health records that are absent from the reference list.
print(f"in injuries but not in agg: {[z for z in y if z not in x]}")
print(f"in illnesses but not in agg: {[z for z in t if z not in x]}")

print("length of agg: ", len(x))

Missing riders in injuries data: [2239, 5799, 5925, 6312, 22630]
Missing riders in illnesses data: [6312]
in injuries but not in agg: []
in illnesses but not in agg: [2580.0]
length of agg:  29


In [52]:
# Clean both raw frames (rename 'rider' -> 'cyclist_id', encode 'disrupt', sort).
# NOTE: the names are rebound to the prepared frames, so this cell must run
# exactly once per load — running it again would pass already-encoded 0/1
# 'disrupt' values through the "yes"/"no" mapping, which turns them into NaN.
injuries_data = prepare_data(injuries_data)
illnesses_data = prepare_data(illnesses_data)

In [53]:
# Load the aggregated workouts; add_missing_days uses their 'cyclist_id' and
# 'date' columns. The path is built from components because the former
# "..\Data\..." literal was a non-raw string with invalid escape sequences
# (\D, \C) and only worked with the Windows path separator.
agg_data = pd.read_csv(Path("..") / "Data" / "Cleaned_Agg_Workouts_2023.csv")

In [54]:
# Collapse each frame to one row per (cyclist_id, date): scores are summed and
# the maximum 'disrupt' flag is kept.
injuries_data = aggregate_data(injuries_data)
illnesses_data = aggregate_data(illnesses_data)

In [55]:
# Expand to a continuous daily calendar per rider, spanning that rider's first
# to last workout date in agg_data; days without a health record get NaN.
injuries_data = add_missing_days(injuries_data, agg_data)
illnesses_data = add_missing_days(illnesses_data, agg_data)

In [56]:
# Days that had no health record (NaN after the calendar join) count as
# score 0 / disrupt 0.
injuries_data = handle_missing_values(injuries_data)
illnesses_data = handle_missing_values(illnesses_data)

In [57]:
# Show the final per-rider, per-day injuries table.
injuries_data

Unnamed: 0,date,cyclist_id,score,disrupt
0,2023-01-01,653,0.0,0.0
1,2023-01-02,653,0.0,0.0
2,2023-01-03,653,0.0,0.0
3,2023-01-04,653,0.0,0.0
4,2023-01-05,653,0.0,0.0
...,...,...,...,...
8372,2023-12-17,22636,0.0,0.0
8373,2023-12-18,22636,0.0,0.0
8374,2023-12-19,22636,0.0,0.0
8375,2023-12-20,22636,0.0,0.0
