In [134]:
# Imports
import sqlite3 as lite
import pandas as pd
import re
import numpy as np
import ast

In [135]:
conn = lite.connect('cycling_big.db')

riders_df = pd.read_sql_query('SELECT * FROM riders;', conn)
races_df = pd.read_sql_query('SELECT * FROM race_results', conn)

print("Amount of rows in races_df: ", races_df[races_df.columns[0]].count())
print("Amount of rows in riders_df: ", riders_df[riders_df.columns[0]].count())

conn.close()

"""
Convert the 'Date' column to DateTime format
Regarding races_df dataframe!
"""

races_df['Date'] = pd.to_datetime(races_df['Date'], errors='coerce', format='%d %B %Y')

# Remove rows where 'Date' is NaT
races_df = races_df.dropna(subset=['Date'])

# Normalize 'Date' to strip out time if it's present (this keeps just the date part)
races_df['Date'] = races_df['Date'].dt.normalize()

# Extract the month and year from the column and put them in their own columns
races_df['Month'] = races_df['Date'].dt.month
races_df['Year'] = races_df['Date'].dt.year

print("Unique values for 'months': ", races_df['Month'].unique(), "\n")
print("Unique values for 'years': ",races_df['Year'].unique())

"""
Converting the timetable to total seconds
"""

def time_to_seconds(time_str):
    # Remove commas and any spaces
    time_str = time_str.replace(',', '').strip()

    # Check the consistency of the time format using regular expressions
    match = re.match(r'(\d{1,2}):(\d{2}):(\d{2})', time_str)
    if match:
        hours, minutes, seconds = map(int, match.groups())
        total_seconds = hours * 3600 + minutes * 60 + seconds
        if total_seconds == 0:
            return np.nan
        return total_seconds

    # Do the same as the loop above, but now for MM:SS format.
    match = re.match(r'(\d{1,2}):(\d{2})', time_str)
    if match:
        minutes, seconds = map(int, match.groups())
        total_seconds = minutes * 60 + seconds
        if total_seconds == 0:
            return np.nan  
        return total_seconds

    # Do the same but for 0:00, 0:01, etc.
    match = re.match(r'(\d{1,2}):(\d{1,2})', time_str)
    if match:
        minutes, seconds = map(int, match.groups())
        total_seconds = minutes * 60 + seconds
        if total_seconds == 0:
            return np.nan 
        return total_seconds
    
    # If format doesn't match, return NaN
    return np.nan

# Apply the conversion function to the 'Time' column
races_df['Time_seconds'] = races_df['Time'].apply(time_to_seconds)

# print(races_df[['Time', 'Time_seconds']].head(10))
# races_df.info()

"""
Converting timelag to total seconds
"""

def timelag_to_seconds(timelag_str):
    # Check if the timelag_str contains missing values
    if pd.isna(timelag_str):
        return np.nan

    timelag_str = timelag_str.lstrip('+').strip()

    match = re.match(r'(\d{1,2}):(\d{2}):(\d{2})', timelag_str)
    if match:
        hours, minutes, seconds = map(int, match.groups())
        return hours * 3600 + minutes * 60 + seconds

    match = re.match(r'(\d{1,2}):(\d{2})', timelag_str)
    if match:
        minutes, seconds = map(int, match.groups())
        return minutes * 60 + seconds
    
    match = re.match(r'(\d+):(\d+)', timelag_str)
    if match:
        minutes, seconds = map(int, match.groups())
        return minutes * 60 + seconds

    return np.nan

races_df['Timelag_seconds'] = races_df['Timelag'].apply(timelag_to_seconds)

races_df['Timelag_seconds'] = races_df['Timelag_seconds'].replace(0.0, np.nan)

print(races_df[['Timelag', 'Timelag_seconds']].head(10))

"""
Converting distance into single numerical value
This means stripping 'km' from string and converting the remaining values into float64
"""

races_df['Length'] = races_df['Length'].str.replace(' km', '', regex=False)

# Convert to numeric and replace 0 with NaN
races_df['Length'] = pd.to_numeric(races_df['Length'], errors='coerce')  # Convert to numeric and handle errors

# Replace 0 values with NaN
races_df['Length'] = races_df['Length'].replace(0.0, np.nan)

# print(races_df['Length'])

"""
Splitting values from 'rdr' and putting the split values into separate columns
"""

# Function to convert the 'rdr' string to separate ranking columns
def extract_rankings(rdr_str):
    try:
        # Converting str to dict
        rankings = ast.literal_eval(rdr_str)
        
        # Extracting ranks, while also handling missing keys
        pcs_rnk = rankings.get('PCS Ranking', np.nan)
        uci_rnk = rankings.get('UCI World Ranking', np.nan)
        alltime_rnk = rankings.get('Specials | All Time Ranking', np.nan)
        
        return pd.Series([pcs_rnk, uci_rnk, alltime_rnk])
    except:
        return pd.Series([np.nan, np.nan, np.nan])

# Apply the function to the 'rdr' column
riders_df[['PCS_Rnk', 'UCI_Rnk', 'AllTime_Rnk']] = riders_df['rdr'].apply(extract_rankings)

# Entries should be turned into numeric values, where errors get turned into NaN
riders_df['PCS_Rnk'] = pd.to_numeric(riders_df['PCS_Rnk'], errors='coerce')
riders_df['UCI_Rnk'] = pd.to_numeric(riders_df['UCI_Rnk'], errors='coerce')
riders_df['AllTime_Rnk'] = pd.to_numeric(riders_df['AllTime_Rnk'], errors='coerce')

# print(riders_df[['fullname', 'PCS_Rnk', 'UCI_Rnk', 'AllTime_Rnk']])

"""
Convert stage types to binary with label encoding
"""

races_df['Stage_Type_bin'] = races_df['Stage_Type'].map({'RR': 0, 'ITT': 1})

"""
Splitting values from 'pps' and putting those values into separate columns.
"""

def extract_points(pps_str):
    try:
        points = ast.literal_eval(pps_str)
        
        day_pnt = np.nan if points.get('One day races', '0') == '0' else points.get('One day races', np.nan)
        gc_pnt = np.nan if points.get('GC', '0') == '0' else points.get('GC', np.nan)
        tt_pnt = np.nan if points.get('Time trial', '0') == '0' else points.get('Time trial', np.nan)
        sprint_pnt = np.nan if points.get('Sprint', '0') == '0' else points.get('Sprint', np.nan)
        climb_pnt = np.nan if points.get('Climber', '0') == '0' else points.get('Climber', np.nan)
        
        return pd.Series([day_pnt, gc_pnt, tt_pnt, sprint_pnt, climb_pnt])
    except:
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])

riders_df[['Day_Pnt', 'GC_Pnt', 'TT_Pnt', 'Sprint_Pnt', 'Climb_Pnt']] = riders_df['pps'].apply(extract_points)

riders_df['Day_Pnt'] = pd.to_numeric(riders_df['Day_Pnt'], errors='coerce')
riders_df['GC_Pnt'] = pd.to_numeric(riders_df['GC_Pnt'], errors='coerce')
riders_df['TT_Pnt'] = pd.to_numeric(riders_df['TT_Pnt'], errors='coerce')
riders_df['Sprint_Pnt'] = pd.to_numeric(riders_df['Sprint_Pnt'], errors='coerce')
riders_df['Climb_Pnt'] = pd.to_numeric(riders_df['Climb_Pnt'], errors='coerce')


Amount of rows in races_df:  225918
Amount of rows in riders_df:  1042
Unique values for 'months':  [ 1  3  4  5  6  7  8  9 10] 

Unique values for 'years':  [2012 2014 2015 2017 2018 2020 2021]
  Timelag  Timelag_seconds
0   +0:00              NaN
1   +0:04              4.0
2   +0:06              6.0
3   +0:10             10.0
4   +0:10             10.0
5   +0:10             10.0
6   +0:10             10.0
7   +0:10             10.0
8   +0:10             10.0
9   +0:10             10.0


In [136]:
print(riders_df['fullname'])

0          BARDET Romain
1        DUMOULIN Samuel
2          GALLOPIN Tony
3          NAESEN Oliver
4          FRANK Mathias
              ...       
1037      TRONDSEN Trond
1038    VAN MELSEN Kévin
1039     BEULLENS Cédric
1040    DE WINTER Ludwig
1041      DELACROIX Théo
Name: fullname, Length: 1042, dtype: object


In [137]:
riders_df['fullname'] = riders_df['fullname'].str.replace(' ', '_') 
print(riders_df['fullname'])

0          BARDET_Romain
1        DUMOULIN_Samuel
2          GALLOPIN_Tony
3          NAESEN_Oliver
4          FRANK_Mathias
              ...       
1037      TRONDSEN_Trond
1038    VAN_MELSEN_Kévin
1039     BEULLENS_Cédric
1040    DE_WINTER_Ludwig
1041      DELACROIX_Théo
Name: fullname, Length: 1042, dtype: object


In [138]:
"""
Join the rider and race tables together, using the rider_id as an index
"""
print("Amount of rows in races_df post cleanup: ", races_df[races_df.columns[0]].count())
print("Amount of rows in riders_df post cleanup: ", riders_df[riders_df.columns[0]].count())

df = races_df.set_index('rider_id').join(riders_df.set_index('rider_id'), how = 'left')

print("Amount of rows in df: ", df[df.columns[0]].count())

"""
Dropping columns that are not needed for analysis
"""

# Note: fix the long list
df.drop(['Time', 'Timelag', 'rdr', 'pps', 'birthdate', 'rider_url', 'Race_url', 'Stage_url', 'Circuit', 'Race_Name', 'Stage_Name', 'Start', 'Finish', 'Category', 'Stage_Type'], axis=1, inplace=True)

# Additional drops:
df = df.drop(columns=['id', 'Team', 'Date', 'Race_ID', 'Stage_Number', 'Team'])
# Replace all remaining zero values with NaN
df = df.replace(0, np.nan)

print(df.isna().sum())
print("Amount of rows in df after cleanup: ", df[df.columns[0]].count())
print("\ndone")

Amount of rows in races_df post cleanup:  80174
Amount of rows in riders_df post cleanup:  1042
Amount of rows in df:  80174
Rnk                    0
GC                 10059
BiB                    0
Rider                  0
Age                    0
UCI                77527
Pnt                69767
Length              6495
Month                  0
Year                   0
Time_seconds       18435
Timelag_seconds    11212
Stage_Type_bin     72484
fullname           34208
team               34208
country            34208
height             34260
weight             34266
PCS_Rnk            45842
UCI_Rnk            46804
AllTime_Rnk        53300
Day_Pnt            34216
GC_Pnt             34208
TT_Pnt             34631
Sprint_Pnt         34342
Climb_Pnt          34365
dtype: int64
Amount of rows in df after cleanup:  80174

done


### PI 7: Part 3


#### 1.1 Appropriate Machine Learning Models
In this section, the selection of appropriate machine learning models in the context of the current dataset will be discussed.

#### 1.2.1 Categorical and Regression Trees
For the first model, the use of Categorical and Regression Trees will be discussed. As stated in *part 2* of the assignment, CART is a decision tree, that classifies records based on the conditions in the *decision nodes*, where the final classification, or regression, is determined in the *leaf nodes*. See the code from <code>pi7-2.ipynb</code> for a more in-depth explanation of decision trees.

#### 1.2.2 Argumentation
A regression tree will be implemented due to the following reasons:
- Handling possible non-linear relationships.
- It does not require any normalization/standardization, making outcomes more interpretable.
- It can handle outliers and missing values well.

Arguments against implementing regression trees:
- The more complex a tree becomes, the more prone it becomes to overfitting. There are methods for handling scenarios where overfitting can become an issue, like *pruning*.
- Sensitive to hyperparameters.
- Unsuitable for datasets with a large number of classes (see the arguments above).

#### 1.2.3 Implementation
Below is am implementation of a regression tree on the current dataset.

In [139]:
df.isnull().sum()

Rnk                    0
GC                 10059
BiB                    0
Rider                  0
Age                    0
UCI                77527
Pnt                69767
Length              6495
Month                  0
Year                   0
Time_seconds       18435
Timelag_seconds    11212
Stage_Type_bin     72484
fullname           34208
team               34208
country            34208
height             34260
weight             34266
PCS_Rnk            45842
UCI_Rnk            46804
AllTime_Rnk        53300
Day_Pnt            34216
GC_Pnt             34208
TT_Pnt             34631
Sprint_Pnt         34342
Climb_Pnt          34365
dtype: int64

In [140]:
df_dtc = df.drop(columns = ['Rnk', 'GC', 'BiB', 'UCI', 'Pnt', 'Stage_Type_bin', 'Rider', 'fullname', 'team', 'country'])
df_dtc.isnull().sum()
df_dtc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80174 entries, 659ed585810c65fe22255a5e4a9b7838 to 0292146b9196ec7a98903cb50dae48cd
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              80174 non-null  int64  
 1   Length           73679 non-null  float64
 2   Month            80174 non-null  int32  
 3   Year             80174 non-null  int32  
 4   Time_seconds     61739 non-null  float64
 5   Timelag_seconds  68962 non-null  float64
 6   height           45914 non-null  float64
 7   weight           45908 non-null  float64
 8   PCS_Rnk          34332 non-null  float64
 9   UCI_Rnk          33370 non-null  float64
 10  AllTime_Rnk      26874 non-null  float64
 11  Day_Pnt          45958 non-null  float64
 12  GC_Pnt           45966 non-null  float64
 13  TT_Pnt           45543 non-null  float64
 14  Sprint_Pnt       45832 non-null  float64
 15  Climb_Pnt        45809 non-null  float64
dtypes: fl

In [141]:
df_dtc = df_dtc.dropna()
df_dtc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12187 entries, a73d590113699f02caf57566c20a2ae7 to 4b35e4a129ae080c4ccec66ea79e0be3
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              12187 non-null  int64  
 1   Length           12187 non-null  float64
 2   Month            12187 non-null  int32  
 3   Year             12187 non-null  int32  
 4   Time_seconds     12187 non-null  float64
 5   Timelag_seconds  12187 non-null  float64
 6   height           12187 non-null  float64
 7   weight           12187 non-null  float64
 8   PCS_Rnk          12187 non-null  float64
 9   UCI_Rnk          12187 non-null  float64
 10  AllTime_Rnk      12187 non-null  float64
 11  Day_Pnt          12187 non-null  float64
 12  GC_Pnt           12187 non-null  float64
 13  TT_Pnt           12187 non-null  float64
 14  Sprint_Pnt       12187 non-null  float64
 15  Climb_Pnt        12187 non-null  float64
dtypes: fl

In [142]:
X = df_dtc.iloc[:, 0:15] # adjust row val
y = df_dtc.iloc[:, 15]

In [143]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17, test_size=0.2)

In [144]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [145]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [146]:
dtc.fit(X_train, y_train)

In [147]:
y_pred = dtc.predict(X_test)

In [148]:
from sklearn.metrics import confusion_matrix 

print(confusion_matrix(y_test, y_pred))

[[ 7  0  0 ...  0  0  0]
 [ 0 21  0 ...  0  0  0]
 [ 0  0  8 ...  0  0  0]
 ...
 [ 0  0  0 ... 24  0  0]
 [ 0  0  0 ...  0 14  0]
 [ 0  0  0 ...  0  0 20]]


In [149]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

# Due to the large number of objects, overall accuracy score is used
# print(classification_report(y_test, y_pred, zero_division=0)) 

accuracy:  0.9979491386382281


In [150]:
features = pd.DataFrame(dtc.feature_importances_, index= X.columns)

features.head(15)

Unnamed: 0,0
Age,0.00764
Length,0.0
Month,0.000546
Year,0.000198
Time_seconds,0.0
Timelag_seconds,0.0
height,0.051884
weight,0.102835
PCS_Rnk,0.066195
UCI_Rnk,0.13393


In [151]:
dtc2 = DecisionTreeClassifier(criterion= 'entropy', ccp_alpha = 0.04)

dtc2.fit(X_train, y_train)

In [152]:
y_pred2 = dtc2.predict(X_test)
print(confusion_matrix(y_test, y_pred2))

[[ 0  0  0 ...  0  0  0]
 [ 0 21  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0 20]]


In [153]:
#print(classification_report(y_test, y_pred2, zero_division=0))
accuracy = accuracy_score(y_test, y_pred2)
print("accuracy: ", accuracy)

accuracy:  0.27891714520098443


In [154]:
features2 = (pd.DataFrame(dtc2.feature_importances_, index=X.columns))
features2.head(15)

Unnamed: 0,0
Age,0.0
Length,0.0
Month,0.0
Year,0.0
Time_seconds,0.0
Timelag_seconds,0.0
height,0.012532
weight,0.01255
PCS_Rnk,0.18762
UCI_Rnk,0.062487


In [155]:
new_data = pd.DataFrame({'Age': [25], 'Length': [100], 'Month': [6], 'Year': [2022], 'Time_seconds': [3600], 'Timelag_seconds': [0], 'height': [180], 'weight': [70], 'PCS_Rnk': [10], 'UCI_Rnk': [20], 'AllTime_Rnk': [30], 'Day_Pnt': [40], 'GC_Pnt': [50], 'TT_Pnt': [60], 'Sprint_Pnt': [70]})

# Use the DTC model to make a prediction
prediction = dtc.predict(new_data)

# Print the prediction
print(prediction)

[4263.]


#### 1.2.4 Boosted Forest

#### 1.3 Gradient Boosting


#### 1.4 Support Vector Regression (SVR)

#### 2.1 Metrics for Models

#### 3.1 Cross-validation

#### 4.1 Ideal Hyperparameters

#### 5.1 Conclusions

#### Bibliography