In [None]:
import io
import boto3
from dotenv import dotenv_values
import re
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from ydata_profiling import ProfileReport

import sys
sys.path.append(str(Path().resolve().parent))
from utils.s3_upload import upload_dataframe_to_s3

In [None]:
# Lift the cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

# Local-file variant of the load step (currently disabled):
# DATA_PATH = Path('../data')
# PROCESSED_DATA_PATH = DATA_PATH / 'processed'
# halfmarathon_df = pd.read_csv(PROCESSED_DATA_PATH / 'halfmarathon_data.csv')

In [None]:
# Connection settings come from a .env file resolved relative to the
# notebook's working directory.
env = dotenv_values(".env")

# Report every missing setting at once instead of failing with a bare KeyError
# on whichever lookup happens to run first.
_REQUIRED_ENV_KEYS = ("AWS_ACCESS_KEY", "AWS_SECRET_KEY", "AWS_ENDPOINT_URL", "AWS_BUCKET_NAME")
_missing_env_keys = [key for key in _REQUIRED_ENV_KEYS if key not in env]
if _missing_env_keys:
    raise KeyError(f"Missing entries in .env: {', '.join(_missing_env_keys)}")

s3 = boto3.client(
    "s3",
    aws_access_key_id=env["AWS_ACCESS_KEY"],
    aws_secret_access_key=env["AWS_SECRET_KEY"],
    endpoint_url=env["AWS_ENDPOINT_URL"],
)

BUCKET_NAME = env["AWS_BUCKET_NAME"]

# Download the object fully into memory, then parse it as a ';'-separated CSV.
s3_key = 'data/processed/halfmarathon_data.csv'
response = s3.get_object(Bucket=BUCKET_NAME, Key=s3_key)
halfmarathon_df = pd.read_csv(io.BytesIO(response['Body'].read()), sep=';')

In [None]:
# Eyeball 10 random rows of the freshly loaded frame.
# No random_state is passed, so the rows differ on every run.
halfmarathon_df.sample(10)

In [None]:
# Polish source headers -> English snake_case names, assembled from three
# groups: runner/ranking fields, per-checkpoint splits, overall summary.
runner_fields = {
    'Miejsce': 'rank_overall',
    'Numer startowy': 'bib_number',
    'Imię': 'first_name',
    'Nazwisko': 'last_name',
    'Miasto': 'city',
    'Kraj': 'country',
    'Drużyna': 'team',
    'Płeć': 'gender',
    'Płeć Miejsce': 'rank_gender',
    'Kategoria wiekowa': 'age_category',
    'Kategoria wiekowa Miejsce': 'rank_age_category',
    'Rocznik': 'birth_year',
}

# Every checkpoint (5/10/15/20 km) carries the same three headers.
split_fields = {
    f'{km} km {label}': f'{prefix}_{km}k'
    for km in (5, 10, 15, 20)
    for label, prefix in (('Czas', 'time'), ('Miejsce Open', 'rank'), ('Tempo', 'pace'))
}

summary_fields = {
    'Tempo Stabilność': 'pace_stability',
    'Czas': 'finish_time',
    'Tempo': 'average_pace',
    'year': 'halfmarathon_year',
}

column_mapping = {**runner_fields, **split_fields, **summary_fields}

# Headers absent from the frame are ignored by rename().
halfmarathon_df.rename(columns=column_mapping, inplace=True)

In [None]:
# Keep only rows that have an overall rank
# (presumably unranked rows are runners who did not finish — confirm).
# .copy() detaches the result from the original frame so later column
# assignments do not raise SettingWithCopyWarning.
halfmarathon_df = halfmarathon_df[halfmarathon_df['rank_overall'].notna()].copy()

In [None]:
# Number of missing values per column after the rank filter.
halfmarathon_df.isnull().sum()

In [None]:
# Rows where both birth_year and age_category are missing, i.e. neither age field is available.
halfmarathon_df[halfmarathon_df['birth_year'].isnull() & halfmarathon_df['age_category'].isnull()]

In [None]:
def extract_age_range(category):
    """Turn an age-category label into a ten-year range string.

    The first run of two digits found in ``str(category)`` is used as the
    lower bound; the upper bound is that value plus 9.

    Args:
        category: Age-category label; any value is accepted because it is
            converted with ``str()`` first.

    Returns:
        ``'<lower>-<upper>'`` (for example ``'30-39'``), or ``None`` when the
        label contains no two-digit run (this includes ``None`` and NaN).
    """
    found = re.search(r'(\d{2})', str(category))
    if found is None:
        return None
    start = int(found.group(1))
    return f'{start}-{start + 9}'

# Derive a ten-year age_range string for each row from its age_category label
# (None where the label has no two-digit number).
halfmarathon_df['age_range'] = halfmarathon_df['age_category'].apply(extract_age_range)

In [None]:
# Keep only the rows for which an age range could be derived.
# To inspect the rejected rows instead:
# halfmarathon_df[halfmarathon_df['age_range'].isnull()]
halfmarathon_df = halfmarathon_df.loc[halfmarathon_df['age_range'].notna()]

In [None]:
# Remove the team column from the frame.
# (check per-column null counts first with: halfmarathon_df.isnull().sum())
halfmarathon_df = halfmarathon_df.drop(columns=['team'])

In [None]:
# Re-check missing values per column after the age_range filter and team drop.
halfmarathon_df.isnull().sum()

In [None]:
# Random 10-row preview (not seeded, so it changes between runs).
# Alternative structural overview: halfmarathon_df.info(max_cols=30)
halfmarathon_df.sample(10)

In [None]:
def float_minutes_to_seconds(pace):
    """Convert a pace expressed in decimal minutes to whole seconds.

    The integer part is treated as full minutes and the fractional part as a
    fraction of a minute (so 4.5 means 4 min 30 s).

    Args:
        pace: Numeric pace in minutes, or a missing value.

    Returns:
        The pace rounded to the nearest second as an ``int``, or ``np.nan``
        when ``pace`` is missing.
    """
    if not pd.isnull(pace):
        whole_minutes = int(pace)
        leftover_seconds = (pace - whole_minutes) * 60
        return round(whole_minutes * 60 + leftover_seconds)
    return np.nan

pace_columns = ['pace_5k', 'pace_10k', 'pace_15k', 'pace_20k', 'average_pace']

# First add a whole-second twin (<name>_sec) for every pace column ...
for col in pace_columns:
    halfmarathon_df[f'{col}_sec'] = halfmarathon_df[col].apply(float_minutes_to_seconds)

# ... then remove the decimal-minute originals in a single call.
halfmarathon_df.drop(columns=pace_columns, inplace=True)

In [None]:
# Random 10-row preview to check the new *_sec pace columns (unseeded sample).
halfmarathon_df.sample(10)
# Alternative structural overview: halfmarathon_df.info(max_cols=30)

In [None]:
time_cols = ['time_5k', 'time_10k', 'time_15k', 'time_20k', 'finish_time']

# Parse each column as HH:MM:SS and keep only the time-of-day part.
# errors='coerce' turns anything that does not match the format into a
# missing value instead of raising.
for col in time_cols:
    parsed = pd.to_datetime(halfmarathon_df[col], format='%H:%M:%S', errors='coerce')
    halfmarathon_df[col] = parsed.dt.time

In [None]:
def time_to_seconds(t):
    """Express a time-like value as a number of seconds.

    Args:
        t: Object exposing ``hour``, ``minute`` and ``second`` attributes
            (e.g. ``datetime.time``), or a missing value.

    Returns:
        ``hour * 3600 + minute * 60 + second``, or ``None`` when ``t`` is
        missing according to ``pd.isnull``.
    """
    return None if pd.isnull(t) else t.hour * 3600 + t.minute * 60 + t.second

# Add a <name>_sec column holding each time as a number of seconds ...
for col in time_cols:
    halfmarathon_df[f'{col}_sec'] = halfmarathon_df[col].apply(time_to_seconds)

# ... and drop the time-object originals afterwards in one call.
halfmarathon_df.drop(columns=time_cols, inplace=True)

In [None]:
# Distribution check of finish_time_sec, run ahead of the IQR outlier filter.
# Earlier exploration variants (summary table, matplotlib/seaborn box plots)
# are kept below, disabled:
# halfmarathon_df.describe().T

# plt.figure(figsize=(8, 6))
# sns.boxplot(x=halfmarathon_df['finish_time_sec'])

# plt.title('Boxplot of Finish Time (in seconds)')
# plt.xlabel('Finish Time (seconds)')
# plt.grid(True)
# plt.show()

# sns.boxplot(x='gender', y='finish_time_sec', data=halfmarathon_df)


# Plotly box plot of finish time in seconds; the legend is hidden because
# only a single series is drawn.
fig = px.box(halfmarathon_df, y='finish_time_sec', title='Boxplot of Finish Time (in seconds)')
fig.update_layout(
    yaxis_title='Finish Time (seconds)',
    showlegend=False
)
fig.show()

In [None]:
# Row count just before the outlier filter, for comparison afterwards.
len(halfmarathon_df)

In [None]:
# Outlier removal on finish time using the 1.5 * IQR rule.
Q1 = halfmarathon_df['finish_time_sec'].quantile(0.25)
Q3 = halfmarathon_df['finish_time_sec'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Both bounds are inclusive. Rows with a missing finish time fail both
# comparisons and are therefore dropped as well.
# .copy() detaches the filtered frame so that adding columns to it later does
# not raise SettingWithCopyWarning.
# NOTE: re-running this cell recomputes the quartiles on already-filtered data
# and can trim further rows.
halfmarathon_df = halfmarathon_df[
    (halfmarathon_df['finish_time_sec'] >= lower_bound) &
    (halfmarathon_df['finish_time_sec'] <= upper_bound)
].copy()

In [None]:
# Column dtypes and non-null counts after outlier removal.
halfmarathon_df.info()

In [None]:
# Age in the race year: event year minus birth year.
# The result is missing wherever birth_year (or the year) is missing.
halfmarathon_df['age'] = halfmarathon_df['halfmarathon_year'] - halfmarathon_df['birth_year']

In [None]:
# halfmarathon_df['age'].isnull().sum()
# halfmarathon_df['age_range'].unique()

def assign_age_from_range(age_range):
    """Return the midpoint age for a ten-year age-range label.

    Only the decade labels ``'20-29'`` through ``'80-89'`` are recognised;
    each maps to the middle of its range (``'20-29'`` -> ``24.5``).

    Args:
        age_range: Range label such as ``'30-39'``.

    Returns:
        The midpoint as a ``float``, or ``None`` for any other value.
    """
    for decade_start in range(20, 90, 10):
        if age_range == f'{decade_start}-{decade_start + 9}':
            return decade_start + 4.5
    return None

# Where age could not be computed (missing birth year), fall back to the
# midpoint of the row's age range. Rows whose range is not recognised by
# assign_age_from_range stay missing.
# Vectorized fill: known ages are kept untouched, only nulls are replaced
# (replaces a row-by-row DataFrame.apply over the whole frame).
halfmarathon_df['age'] = halfmarathon_df['age'].fillna(
    halfmarathon_df['age_range'].map(assign_age_from_range)
)

In [None]:
# Columns removed from the final frame, grouped by kind.
rank_columns = [
    'rank_overall', 'rank_gender', 'rank_age_category',
    'rank_5k', 'rank_10k', 'rank_15k', 'rank_20k',
]
identity_columns = ['bib_number', 'first_name', 'last_name', 'city', 'country']
# Inputs that were only needed to derive the age column.
age_source_columns = ['age_category', 'age_range', 'birth_year', 'halfmarathon_year']
# Average pace, pace stability and everything measured after the 5 km checkpoint.
later_split_columns = [
    'average_pace_sec',
    'pace_10k_sec', 'pace_15k_sec', 'pace_20k_sec',
    'time_10k_sec', 'time_15k_sec', 'time_20k_sec',
    'pace_stability',
]

columns_to_drop = [
    *rank_columns,
    *identity_columns,
    *age_source_columns,
    *later_split_columns,
]

halfmarathon_df = halfmarathon_df.drop(columns=columns_to_drop)

In [None]:
# Encode gender numerically: 'K' -> 0, 'M' -> 1
# (presumably Polish "kobieta"/"mężczyzna", i.e. female/male — confirm).
# Any value other than 'K' or 'M' becomes NaN, because Series.map with a dict
# leaves unmapped keys missing.
halfmarathon_df['gender'] = halfmarathon_df['gender'].map({'K': 0, 'M': 1})

In [None]:
# Create the output folder up front so a missing directory does not make the
# write fail after the (slow) report has been built.
report_path = Path("../reports/profiling/halfmarathon_cleaned_report.html")
report_path.parent.mkdir(parents=True, exist_ok=True)

# Build the profiling report of the cleaned frame and save it as HTML.
halfmarathon_profile = ProfileReport(halfmarathon_df, title="Halfmarathon Profiling Report", explorative=True)
halfmarathon_profile.to_file(report_path)

In [None]:
# halfmarathon_df.to_csv(PROCESSED_DATA_PATH / 'halfmarathon_cleaned.csv', index=False)

In [None]:
# Store the cleaned frame in the same bucket the raw data was read from,
# under a separate key.
# NOTE(review): the CSV layout (separator, index) is decided inside
# upload_dataframe_to_s3; the input file used ';' — confirm readers of the
# cleaned file expect whatever the helper writes.
upload_dataframe_to_s3(
    df=halfmarathon_df,
    bucket=BUCKET_NAME,
    key='data/processed/halfmarathon_cleaned.csv'
)