In [1]:
# Imports
import sqlite3 as lite
import pandas as pd
import re
import numpy as np
import ast

In [2]:
conn = lite.connect('cycling_big.db')

riders_df = pd.read_sql_query('SELECT * FROM riders;', conn)
races_df = pd.read_sql_query('SELECT * FROM race_results', conn)

print("Amount of rows in races_df: ", races_df[races_df.columns[0]].count())
print("Amount of rows in riders_df: ", riders_df[riders_df.columns[0]].count())

conn.close()

"""
Convert the 'Date' column to DateTime format
Regarding races_df dataframe!
"""

races_df['Date'] = pd.to_datetime(races_df['Date'], errors='coerce', format='%d %B %Y')

# Remove rows where 'Date' is NaT
races_df = races_df.dropna(subset=['Date'])

# Normalize 'Date' to strip out time if it's present (this keeps just the date part)
races_df['Date'] = races_df['Date'].dt.normalize()

# Extract the month and year from the column and put them in their own columns
races_df['Month'] = races_df['Date'].dt.month
races_df['Year'] = races_df['Date'].dt.year

print("Unique values for 'months': ", races_df['Month'].unique(), "\n")
print("Unique values for 'years': ",races_df['Year'].unique())

"""
Converting the timetable to total seconds
"""

def time_to_seconds(time_str):
    # Remove commas and any spaces
    time_str = time_str.replace(',', '').strip()

    # Check the consistency of the time format using regular expressions
    match = re.match(r'(\d{1,2}):(\d{2}):(\d{2})', time_str)
    if match:
        hours, minutes, seconds = map(int, match.groups())
        total_seconds = hours * 3600 + minutes * 60 + seconds
        if total_seconds == 0:
            return np.nan
        return total_seconds

    # Do the same as the loop above, but now for MM:SS format.
    match = re.match(r'(\d{1,2}):(\d{2})', time_str)
    if match:
        minutes, seconds = map(int, match.groups())
        total_seconds = minutes * 60 + seconds
        if total_seconds == 0:
            return np.nan  
        return total_seconds

    # Do the same but for 0:00, 0:01, etc.
    match = re.match(r'(\d{1,2}):(\d{1,2})', time_str)
    if match:
        minutes, seconds = map(int, match.groups())
        total_seconds = minutes * 60 + seconds
        if total_seconds == 0:
            return np.nan 
        return total_seconds
    
    # If format doesn't match, return NaN
    return np.nan

# Apply the conversion function to the 'Time' column
races_df['Time_seconds'] = races_df['Time'].apply(time_to_seconds)

# print(races_df[['Time', 'Time_seconds']].head(10))
# races_df.info()

"""
Converting timelag to total seconds
"""

def timelag_to_seconds(timelag_str):
    # Check if the timelag_str contains missing values
    if pd.isna(timelag_str):
        return np.nan

    timelag_str = timelag_str.lstrip('+').strip()

    match = re.match(r'(\d{1,2}):(\d{2}):(\d{2})', timelag_str)
    if match:
        hours, minutes, seconds = map(int, match.groups())
        return hours * 3600 + minutes * 60 + seconds

    match = re.match(r'(\d{1,2}):(\d{2})', timelag_str)
    if match:
        minutes, seconds = map(int, match.groups())
        return minutes * 60 + seconds
    
    match = re.match(r'(\d+):(\d+)', timelag_str)
    if match:
        minutes, seconds = map(int, match.groups())
        return minutes * 60 + seconds

    return np.nan

races_df['Timelag_seconds'] = races_df['Timelag'].apply(timelag_to_seconds)

races_df['Timelag_seconds'] = races_df['Timelag_seconds'].replace(0.0, np.nan)

print(races_df[['Timelag', 'Timelag_seconds']].head(10))

"""
Converting distance into single numerical value
This means stripping 'km' from string and converting the remaining values into float64
"""

races_df['Length'] = races_df['Length'].str.replace(' km', '', regex=False)

# Convert to numeric and replace 0 with NaN
races_df['Length'] = pd.to_numeric(races_df['Length'], errors='coerce')  # Convert to numeric and handle errors

# Replace 0 values with NaN
races_df['Length'] = races_df['Length'].replace(0.0, np.nan)

# print(races_df['Length'])

"""
Splitting values from 'rdr' and putting the split values into separate columns
"""

# Function to convert the 'rdr' string to separate ranking columns
def extract_rankings(rdr_str):
    try:
        # Converting str to dict
        rankings = ast.literal_eval(rdr_str)
        
        # Extracting ranks, while also handling missing keys
        pcs_rnk = rankings.get('PCS Ranking', np.nan)
        uci_rnk = rankings.get('UCI World Ranking', np.nan)
        alltime_rnk = rankings.get('Specials | All Time Ranking', np.nan)
        
        return pd.Series([pcs_rnk, uci_rnk, alltime_rnk])
    except:
        return pd.Series([np.nan, np.nan, np.nan])

# Apply the function to the 'rdr' column
riders_df[['PCS_Rnk', 'UCI_Rnk', 'AllTime_Rnk']] = riders_df['rdr'].apply(extract_rankings)

# Entries should be turned into numeric values, where errors get turned into NaN
riders_df['PCS_Rnk'] = pd.to_numeric(riders_df['PCS_Rnk'], errors='coerce')
riders_df['UCI_Rnk'] = pd.to_numeric(riders_df['UCI_Rnk'], errors='coerce')
riders_df['AllTime_Rnk'] = pd.to_numeric(riders_df['AllTime_Rnk'], errors='coerce')

# print(riders_df[['fullname', 'PCS_Rnk', 'UCI_Rnk', 'AllTime_Rnk']])

"""
Convert stage types to binary with label encoding
"""

races_df['Stage_Type_bin'] = races_df['Stage_Type'].map({'RR': 0, 'ITT': 1})

"""
Splitting values from 'pps' and putting those values into separate columns.
"""

def extract_points(pps_str):
    try:
        points = ast.literal_eval(pps_str)
        
        day_pnt = np.nan if points.get('One day races', '0') == '0' else points.get('One day races', np.nan)
        gc_pnt = np.nan if points.get('GC', '0') == '0' else points.get('GC', np.nan)
        tt_pnt = np.nan if points.get('Time trial', '0') == '0' else points.get('Time trial', np.nan)
        sprint_pnt = np.nan if points.get('Sprint', '0') == '0' else points.get('Sprint', np.nan)
        climb_pnt = np.nan if points.get('Climber', '0') == '0' else points.get('Climber', np.nan)
        
        return pd.Series([day_pnt, gc_pnt, tt_pnt, sprint_pnt, climb_pnt])
    except:
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])

riders_df[['Day_Pnt', 'GC_Pnt', 'TT_Pnt', 'Sprint_Pnt', 'Climb_Pnt']] = riders_df['pps'].apply(extract_points)

riders_df['Day_Pnt'] = pd.to_numeric(riders_df['Day_Pnt'], errors='coerce')
riders_df['GC_Pnt'] = pd.to_numeric(riders_df['GC_Pnt'], errors='coerce')
riders_df['TT_Pnt'] = pd.to_numeric(riders_df['TT_Pnt'], errors='coerce')
riders_df['Sprint_Pnt'] = pd.to_numeric(riders_df['Sprint_Pnt'], errors='coerce')
riders_df['Climb_Pnt'] = pd.to_numeric(riders_df['Climb_Pnt'], errors='coerce')


Amount of rows in races_df:  225918
Amount of rows in riders_df:  1042
Unique values for 'months':  [ 1  3  4  5  6  7  8  9 10] 

Unique values for 'years':  [2012 2014 2015 2017 2018 2020 2021]
  Timelag  Timelag_seconds
0   +0:00              NaN
1   +0:04              4.0
2   +0:06              6.0
3   +0:10             10.0
4   +0:10             10.0
5   +0:10             10.0
6   +0:10             10.0
7   +0:10             10.0
8   +0:10             10.0
9   +0:10             10.0


In [3]:
print(riders_df['fullname'])

0          BARDET Romain
1        DUMOULIN Samuel
2          GALLOPIN Tony
3          NAESEN Oliver
4          FRANK Mathias
              ...       
1037      TRONDSEN Trond
1038    VAN MELSEN Kévin
1039     BEULLENS Cédric
1040    DE WINTER Ludwig
1041      DELACROIX Théo
Name: fullname, Length: 1042, dtype: object


In [4]:
riders_df['fullname'] = riders_df['fullname'].str.replace(' ', '_') 
print(riders_df['fullname'])

0          BARDET_Romain
1        DUMOULIN_Samuel
2          GALLOPIN_Tony
3          NAESEN_Oliver
4          FRANK_Mathias
              ...       
1037      TRONDSEN_Trond
1038    VAN_MELSEN_Kévin
1039     BEULLENS_Cédric
1040    DE_WINTER_Ludwig
1041      DELACROIX_Théo
Name: fullname, Length: 1042, dtype: object


In [5]:
"""
Join the rider and race tables together, using the rider_id as an index
"""
print("Amount of rows in races_df post cleanup: ", races_df[races_df.columns[0]].count())
print("Amount of rows in riders_df post cleanup: ", riders_df[riders_df.columns[0]].count())

df = races_df.set_index('rider_id').join(riders_df.set_index('rider_id'), how = 'inner')

print("Amount of rows in df: ", df[df.columns[0]].count())

"""
Dropping columns that are not needed for analysis
"""

# Note: fix the long list
df.drop(['Time', 'Timelag', 'rdr', 'pps', 'birthdate', 'rider_url', 'Race_url', 'Stage_url', 'Circuit', 'Race_Name', 'Stage_Name', 'Start', 'Finish', 'Category', 'Stage_Type'], axis=1, inplace=True)

# Additional drops:
df = df.drop(columns=['id', 'Team', 'Date', 'Race_ID', 'Stage_Number', 'Team'])
# Replace all remaining zero values with NaN
df = df.replace(0, np.nan)

print(df.isna().sum())
print("Amount of rows in df after cleanup: ", df[df.columns[0]].count())
print("\ndone")

Amount of rows in races_df post cleanup:  80174
Amount of rows in riders_df post cleanup:  1042
Amount of rows in df:  45966
Rnk                    0
GC                  5600
BiB                    0
Rider                  0
Age                    0
UCI                44040
Pnt                39107
Length              5484
Month                  0
Year                   0
Time_seconds       10650
Timelag_seconds     6257
Stage_Type_bin     41132
fullname               0
team                   0
country                0
height                52
weight                58
PCS_Rnk            11634
UCI_Rnk            12596
AllTime_Rnk        19092
Day_Pnt                8
GC_Pnt                 0
TT_Pnt               423
Sprint_Pnt           134
Climb_Pnt            157
dtype: int64
Amount of rows in df after cleanup:  45966

done


### PI 7: Part 3


#### 1.1 Appropriate Machine Learning Models
In this section, the selection of appropriate machine learning models in the context of the current dataset will be discussed.

#### 1.2.1 Categorical and Regression Trees
For the first model, the use of Categorical and Regression Trees will be discussed. As stated in *part 2* of the assignment, CART is a decision tree, that classifies records based on the conditions in the *decision nodes*, where the final classification, or regression, is determined in the *leaf nodes*. See the code from <code>pi7-2.ipynb</code> for a more in-depth explanation of decision trees.

#### 1.2.2 Argumentation
A regression tree will be implemented due to the following reasons:
- Handling possible non-linear relationships.
- It does not require any normalization/standardization, making outcomes more interpretable.
- It can handle outliers and missing values well.

Arguments against implementing regression trees:
- The more complex a tree becomes, the more prone it becomes to overfitting. There are methods for handling scenarios where overfitting can become an issue, like *pruning*.
- Sensitive to hyperparameters.
- Unsuitable for datasets with a large number of classes (see the arguments above).

#### 1.2.3 Implementation
Below is am implementation of a regression tree on the current dataset.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45966 entries, 659ed585810c65fe22255a5e4a9b7838 to 0292146b9196ec7a98903cb50dae48cd
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Rnk              45966 non-null  object 
 1   GC               40366 non-null  float64
 2   BiB              45966 non-null  object 
 3   Rider            45966 non-null  object 
 4   Age              45966 non-null  int64  
 5   UCI              1926 non-null   float64
 6   Pnt              6859 non-null   float64
 7   Length           40482 non-null  float64
 8   Month            45966 non-null  int32  
 9   Year             45966 non-null  int32  
 10  Time_seconds     35316 non-null  float64
 11  Timelag_seconds  39709 non-null  float64
 12  Stage_Type_bin   4834 non-null   float64
 13  fullname         45966 non-null  object 
 14  team             45966 non-null  object 
 15  country          45966 non-null  object 
 16  heigh

In [7]:
df_dtc = df.drop(columns = ['UCI', 'Pnt', 'Stage_Type_bin'])
df_dtc = df_dtc.dropna()

In [8]:
df_dtc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12187 entries, a73d590113699f02caf57566c20a2ae7 to 4b35e4a129ae080c4ccec66ea79e0be3
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Rnk              12187 non-null  object 
 1   GC               12187 non-null  float64
 2   BiB              12187 non-null  object 
 3   Rider            12187 non-null  object 
 4   Age              12187 non-null  int64  
 5   Length           12187 non-null  float64
 6   Month            12187 non-null  int32  
 7   Year             12187 non-null  int32  
 8   Time_seconds     12187 non-null  float64
 9   Timelag_seconds  12187 non-null  float64
 10  fullname         12187 non-null  object 
 11  team             12187 non-null  object 
 12  country          12187 non-null  object 
 13  height           12187 non-null  float64
 14  weight           12187 non-null  float64
 15  PCS_Rnk          12187 non-null  float64
 16  UCI_R

In [9]:
X = df_dtc.iloc[:, 0:22] # adjust row val
y = df_dtc.iloc[:, 22]

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17, test_size=0.2)

In [11]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [12]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [13]:
print(df_dtc['fullname'])

rider_id
a73d590113699f02caf57566c20a2ae7         MORENO_Javier
9be030c6ab77e67c18c3ddc46be4036c    ROJAS_José_Joaquín
f0e3e679300f9caf7c477c1d3614a33b         MEYER_Cameron
4485ac2bf252ae4395f1b850fa9e2c76        DURBRIDGE_Luke
8b2ba2d2ad59c160774fc929f7f8a635        TRENTIN_Matteo
                                           ...        
8e8a5de5574e54c9a8ae5436158244ac           HAAS_Nathan
f91fee92ec50e1574bd4c5fe3e2e18d2          DENNIS_Rohan
b2a6130da47efd8e9946728441f7f16f          BAUHAUS_Phil
5dbb77082695488e013574823562ec57       HIVERT_Jonathan
4b35e4a129ae080c4ccec66ea79e0be3         BODNAR_Maciej
Name: fullname, Length: 12187, dtype: object


In [14]:
dtc.fit(X_train, y_train)

ValueError: could not convert string to float: 'BURGHARDT Marcus'

#### 1.2.4 Boosted Forest

#### 1.3 Gradient Boosting

#### 1.4 Support Vector Regression (SVR)

#### 2.1 Metrics for Models

#### 3.1 Cross-validation

#### 4.1 Ideal Hyperparameters

#### 5.1 Conclusions

#### Bibliography