In [1]:
# pip install duckdb==0.8.0

In [31]:
import duckdb 
connection = duckdb.connect()

In [32]:
import os
data_folder = 'data'

csv_files = [f for f in os.listdir(data_folder) if f.endswith('.csv')]

relations = {}

for file in csv_files:
    file_path = os.path.join(data_folder, file)
    rel = duckdb.read_csv(file_path)
    relations[file] = rel

Now that all the csv files are properly loaded, raw data will be processed into useful tabular data.

### Data profiling

In [33]:
import pandas as pd

'''
This function extracts summary statistics 
and other descriptive values for datasets
'''

def data_profiling(dataset_path):
    # Load CSV into a Pandas DataFrame
    df = pd.read_csv(dataset_path)
    # Features (columns)
    print("Features:\n", df.columns.tolist())
    
    # Number of rows
    print(f"Number of rows: {len(df)}")
    
    # Summary statistics for each column
    for col in df.columns:
        print(f"\nFeature: {col}\n{df[col].describe()}")
    
    # Missing values
    print("\nMissing values per column:\n", df.isnull().sum())
    
    # Unique values
    print("\nUnique values per column:")
    for col in df.columns:
        print(f"{col}: {df[col].nunique()} unique values")
        
    # Duplicate rows
    print("\nDuplicate rows:", df.duplicated().sum())

In [17]:
datasets = [dataset for dataset in relations]
datasets

['circuits.csv',
 'status.csv',
 'lap_times.csv',
 'sprint_results.csv',
 'drivers.csv',
 'races.csv',
 'constructors.csv',
 'constructor_standings.csv',
 'qualifying.csv',
 'driver_standings.csv',
 'constructor_results.csv',
 'pit_stops.csv',
 'seasons.csv',
 'results.csv']

### Circuits

In [18]:
circuits = pd.read_csv("data/circuits.csv")
data_profiling("data/circuits.csv")

Features:
 ['circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng', 'alt', 'url']
Number of rows: 77

Feature: circuitId
count    77.000000
mean     39.883117
std      23.001701
min       1.000000
25%      20.000000
50%      40.000000
75%      59.000000
max      80.000000
Name: circuitId, dtype: float64

Feature: circuitRef
count              77
unique             77
top       albert_park
freq                1
Name: circuitRef, dtype: object

Feature: name
count                                 77
unique                                77
top       Albert Park Grand Prix Circuit
freq                                   1
Name: name, dtype: object

Feature: location
count            77
unique           75
top       Barcelona
freq              2
Name: location, dtype: object

Feature: country
count      77
unique     35
top       USA
freq       11
Name: country, dtype: object

Feature: lat
count    77.000000
mean     33.442925
std      22.808866
min     -37.849700
25%      32

Every circuit has a unique id and name, location, and its own latitude, longitude and altitude, the latter which is aerodinamically important and contextualizes the model.

All values seem to be within reasonable boundaries, with correct datatypes and consistent counts.

Some altitude and location values are shared, which makes sense since many tracks share similar altitudes due to proximity.

In [19]:
alt_counts = circuits['alt'].value_counts()
for alt_value in alt_counts.index:
    count = alt_counts[alt_value]
    if count > 1:
        matching_rows = circuits[circuits['alt'] == alt_value][['name', 'country']]
        for _, row in matching_rows.iterrows():
            print(f"{count} {row['name']} ({row['country']})")

3 Sepang International Circuit (Malaysia)
3 Marina Bay Street Circuit (Singapore)
3 Sebring International Raceway (USA)
2 Autodromo Enzo e Dino Ferrari (Italy)
2 Circuito de Jerez (Spain)
2 Silverstone Circuit (UK)
2 Scandinavian Raceway (Sweden)
2 Korean International Circuit (Korea)
2 Miami International Autodrome (USA)
2 Donington Park (UK)
2 Reims-Gueux (France)
2 Prince George Circuit (South Africa)
2 Jeddah Corniche Circuit (Saudi Arabia)
2 Fair Park (USA)
2 Nivelles-Baulers (Belgium)
2 Long Beach (USA)
2 Losail International Circuit (Qatar)
2 Bahrain International Circuit (Bahrain)
2 Circuit de Monaco (Monaco)
2 Istanbul Park (Turkey)
2 Autódromo do Estoril (Portugal)


### Laptimes

In [20]:
laptimes = pd.read_csv("data/lap_times.csv")
data_profiling("data/lap_times.csv")

Features:
 ['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds']
Number of rows: 575989

Feature: raceId
count    575989.000000
mean        588.321155
std         431.564765
min           1.000000
25%         137.000000
50%         856.000000
75%         994.000000
max        1132.000000
Name: raceId, dtype: float64

Feature: driverId
count    575989.000000
mean        316.035923
std         384.567928
min           1.000000
25%          16.000000
50%          44.000000
75%         821.000000
max         860.000000
Name: driverId, dtype: float64

Feature: lap
count    575989.000000
mean         30.014316
std          18.412948
min           1.000000
25%          14.000000
50%          29.000000
75%          44.000000
max          87.000000
Name: lap, dtype: float64

Feature: position
count    575989.000000
mean          9.659827
std           5.531333
min           1.000000
25%           5.000000
50%           9.000000
75%          14.000000
max          24.000000
Name: pos

In [21]:
threshold = laptimes["milliseconds"].mean() + 3 * laptimes["milliseconds"].std()  
outlier_count = (laptimes["milliseconds"] > threshold).sum()
print(f"There are {outlier_count} outliers in this dataset")

There are 681 outliers in this dataset


This dataset includes the time for each lap for each driver for every single race in history. The most meaningful features is time, both in M:S:MS and MS, which seems to be within a reasonable range, with some outliers on the high end. There are 681 outliers in this dataset, which represents 0.12% of the total count. 

Laptime information might be aggregated to compute a more meaningful driver metric (average pace, degradation), this outliers are not going to be of harm for the analysis, and some might go away after the 2014 and forward cutoff is applied. 

### Drivers

In [22]:
drivers = pd.read_csv("data/drivers.csv")
data_profiling("data/drivers.csv")

Features:
 ['driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob', 'nationality', 'url']
Number of rows: 859

Feature: driverId
count    859.000000
mean     430.059371
std      248.213115
min        1.000000
25%      215.500000
50%      430.000000
75%      644.500000
max      860.000000
Name: driverId, dtype: float64

Feature: driverRef
count          859
unique         859
top       hamilton
freq             1
Name: driverRef, dtype: object

Feature: number
count     859
unique     47
top        \N
freq      802
Name: number, dtype: object

Feature: code
count     859
unique     97
top        \N
freq      757
Name: code, dtype: object

Feature: forename
count      859
unique     478
top       John
freq        14
Name: forename, dtype: object

Feature: surname
count        859
unique       800
top       Taylor
freq           5
Name: surname, dtype: object

Feature: dob
count            859
unique           841
top       1918-10-06
freq               2
Name: dob, dtype

This dataset includes driver id, reference name, code, and demographics related to the driver. Datatypes, counts and frequencies are coherent. The number and code columns have missing data in the form of “\N”, these instances will be dealt with after the 2014 cutoff is applied, if necessary. Once the mentioned cut off is applied, the number of drivers will greatly decrease. Out of all the variables, age of the driver might be one of the most meaningful features of the dataset, but apart from that, this dataset’s purpose will be to link the driver to each of their performance once aggregation is performed between the datasets, using the driverid identifier.  

### Races

In [23]:
races = pd.read_csv("data/races.csv")
data_profiling("data/races.csv")

Features:
 ['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time']
Number of rows: 1125

Feature: raceId
count    1125.000000
mean      565.710222
std       328.813817
min         1.000000
25%       282.000000
50%       563.000000
75%       845.000000
max      1144.000000
Name: raceId, dtype: float64

Feature: year
count    1125.000000
mean     1992.703111
std        20.603848
min      1950.000000
25%      1977.000000
50%      1994.000000
75%      2011.000000
max      2024.000000
Name: year, dtype: float64

Feature: round
count    1125.000000
mean        8.579556
std         5.159910
min         1.000000
25%         4.000000
50%         8.000000
75%        13.000000
max        24.000000
Name: round, dtype: float64

Feature: circuitId
count    1125.000000
mean       23.889778
std        19.633527
min         1.000000
25%         9.000000
50%  

This dataset includes raceid, year, round, circuit id, and other race specific columns. Datatypes, counts and frequencies are coherent. The date and time columns for the races have some values in the form “\N”, these instances will be dealt with after the 2014 cutoff is applied, if necessary, since it is likely that date and time was not recorded for these old races. Once the mentioned cut off is applied. Out of all the variables, raceId is a very important one since it will be used to link the races to each driver, year, round and circuit id will be useful too. The missing values in the FP1/FP2/FP3 times and dates sprint session fields the race “time” column most likely will not be included in the analysis, and their "missigness" does not represent data corruption, which is something good. URL field is a useless column.

### Constructors

In [24]:
constructors = pd.read_csv("data/constructors.csv")
data_profiling("data/constructors.csv")

Features:
 ['constructorId', 'constructorRef', 'name', 'nationality', 'url']
Number of rows: 212

Feature: constructorId
count    212.000000
mean     107.547170
std       61.952685
min        1.000000
25%       54.750000
50%      107.500000
75%      160.250000
max      215.000000
Name: constructorId, dtype: float64

Feature: constructorRef
count         212
unique        212
top       mclaren
freq            1
Name: constructorRef, dtype: object

Feature: name
count         212
unique        212
top       McLaren
freq            1
Name: name, dtype: object

Feature: nationality
count         212
unique         24
top       British
freq           86
Name: nationality, dtype: object

Feature: url
count                                                 212
unique                                                175
top       http://en.wikipedia.org/wiki/Cooper_Car_Company
freq                                                   11
Name: url, dtype: object

Missing values per column:
 constructo

The datatypes in this datasets are coherent, everything seems to be in order. The constructorID column might be the only useful variable in this dataset, this dataset can be used to understand patterns of constructors, and its main purpose will be a connective one. After the 2014 cutoff, many of the constructors will go away 

### Qualifying

In [25]:
quali = pd.read_csv("data/qualifying.csv") 
data_profiling("data/qualifying.csv")

Features:
 ['qualifyId', 'raceId', 'driverId', 'constructorId', 'number', 'position', 'q1', 'q2', 'q3']
Number of rows: 10274

Feature: qualifyId
count    10274.000000
mean      5151.583122
std       2982.227118
min          1.000000
25%       2570.250000
50%       5139.500000
75%       7728.750000
max      10331.000000
Name: qualifyId, dtype: float64

Feature: raceId
count    10274.000000
mean       613.585945
std        426.121490
min          1.000000
25%        140.000000
50%        866.000000
75%        998.000000
max       1133.000000
Name: raceId, dtype: float64

Feature: driverId
count    10274.000000
mean       334.164201
std        387.230090
min          1.000000
25%         16.000000
50%         56.000000
75%        822.000000
max        860.000000
Name: driverId, dtype: float64

Feature: constructorId
count    10274.000000
mean        46.972357
std         72.500393
min          1.000000
25%          4.000000
50%          9.000000
75%         30.000000
max        215.00000

This dataset contains all the link columns for races, constructors and drivers. The values in each column are coherent. The position column is promising since position can be derived into a driver average starting position, which is a determining factor in a race. The missing values in the form “\N” in q2 and q3 are likely from the 2006 and forward since this is the year where this type of qualifying procedure started. The position variable in laptimes and the position here overlap, so that is something that will have to be dealt with.

### Results

In [26]:
results = pd.read_csv("data/results.csv") 
data_profiling("data/results.csv")

Features:
 ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionText', 'positionOrder', 'points', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId']
Number of rows: 26539

Feature: resultId
count    26539.000000
mean     13270.944045
std       7662.590055
min          1.000000
25%       6635.500000
50%      13270.000000
75%      19904.500000
max      26544.000000
Name: resultId, dtype: float64

Feature: raceId
count    26539.000000
mean       546.818644
std        309.943887
min          1.000000
25%        298.000000
50%        527.000000
75%        804.000000
max       1133.000000
Name: raceId, dtype: float64

Feature: driverId
count    26539.000000
mean       274.718791
std        279.565408
min          1.000000
25%         57.000000
50%        170.000000
75%        386.000000
max        860.000000
Name: driverId, dtype: float64

Feature: constructorId
count    26539.000000
mean        49.8330

This dataset contains all the results for every race, it has id columns that will make it possible to link them to each constructor and driver. It includes important variables such as position, points, laps, time, fastest lap, these columns will be of importance for our analysis since they can be derived into more technical characteristics of a driver. Some columns overlap, which will be dealt with too. Missing values in the form of “\N” are present in some of the columns, the qualifying missing values are expected as explained before.

### Pit stops

In [27]:
results = pd.read_csv("data/pit_stops.csv") 
data_profiling("data/pit_stops.csv")

Features:
 ['raceId', 'driverId', 'stop', 'lap', 'time', 'duration', 'milliseconds']
Number of rows: 11030

Feature: raceId
count    11030.000000
mean       976.301995
std         89.381612
min        841.000000
25%        894.000000
50%        967.000000
75%       1056.000000
max       1133.000000
Name: raceId, dtype: float64

Feature: driverId
count    11030.000000
mean       543.368359
std        385.362455
min          1.000000
25%         20.000000
50%        817.000000
75%        832.000000
max        860.000000
Name: driverId, dtype: float64

Feature: stop
count    11030.000000
mean         1.796283
std          1.538376
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max         70.000000
Name: stop, dtype: float64

Feature: lap
count    11030.000000
mean        25.327561
std         14.904086
min          1.000000
25%         13.000000
50%         25.000000
75%         36.000000
max         78.000000
Name: lap, dtype: float64

Feature: t

This dataset contains pitstops for every race, it has id columns that will make it possible to link them to each constructor and driver. It contains the stops per race, the lap in which it was done, the duration  and no missing values are included, this dataset has some variables of interest that will be of use for our analysis. After the 2014 cutoff, pit stop strategies became more standardized due to stable regulations, this will make data more symmetrical. Time variables need cleaning.

### Every dataset of interest will now be merged into a single dataset before doing feature engineering. The results dataset will be used as the anchor dataset since it is the one that includes all the identifications.

In [41]:
for file in csv_files:
    file_path = os.path.join(data_folder, file)
    table_name = file.replace('.csv', '')  
    duckdb.register(table_name, duckdb.read_csv(file_path))
    duckdb.sql(f"SELECT * FROM {table_name} LIMIT 5").show()

┌───────────┬─────────────┬──────────────────────┬───┬──────────┬─────────┬───────┬──────────────────────┐
│ circuitId │ circuitRef  │         name         │ … │   lat    │   lng   │  alt  │         url          │
│   int64   │   varchar   │       varchar        │   │  double  │ double  │ int64 │       varchar        │
├───────────┼─────────────┼──────────────────────┼───┼──────────┼─────────┼───────┼──────────────────────┤
│         1 │ albert_park │ Albert Park Grand …  │ … │ -37.8497 │ 144.968 │    10 │ http://en.wikipedi…  │
│         2 │ sepang      │ Sepang Internation…  │ … │  2.76083 │ 101.738 │    18 │ http://en.wikipedi…  │
│         3 │ bahrain     │ Bahrain Internatio…  │ … │  26.0325 │ 50.5106 │     7 │ http://en.wikipedi…  │
│         4 │ catalunya   │ Circuit de Barcelo…  │ … │    41.57 │ 2.26111 │   109 │ http://en.wikipedi…  │
│         5 │ istanbul    │ Istanbul Park        │ … │  40.9517 │  29.405 │   130 │ http://en.wikipedi…  │
├───────────┴─────────────┴──────────

Verifying that the id counts are coherent

In [55]:
duckdb.sql("SELECT COUNT(*) - COUNT(DISTINCT raceId) FROM races;")

┌─────────────────────────────────────────┐
│ (count_star() - count(DISTINCT raceId)) │
│                  int64                  │
├─────────────────────────────────────────┤
│                                       0 │
└─────────────────────────────────────────┘

In [56]:
duckdb.sql("SELECT COUNT(*) - COUNT(DISTINCT driverId) FROM drivers;")


┌───────────────────────────────────────────┐
│ (count_star() - count(DISTINCT driverId)) │
│                   int64                   │
├───────────────────────────────────────────┤
│                                         0 │
└───────────────────────────────────────────┘

In [57]:
duckdb.sql("SELECT COUNT(*) - COUNT(DISTINCT constructorId) FROM constructors;")

┌────────────────────────────────────────────────┐
│ (count_star() - count(DISTINCT constructorId)) │
│                     int64                      │
├────────────────────────────────────────────────┤
│                                              0 │
└────────────────────────────────────────────────┘

results

      → join drivers
      
      → join constructors
      
      → join races
      
          → join circuits

      LATER
      → join pit stops
      
      → join lap times


In [68]:
duckdb.sql("DESCRIBE drivers")

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│ column_name │ column_type │  null   │   key   │ default │ extra │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ driverId    │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ driverRef   │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ number      │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ code        │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ forename    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ surname     │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ dob         │ DATE        │ YES     │ NULL    │ NULL    │  NULL │
│ nationality │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ url         │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴───────┘

In [69]:
duckdb.sql("DESCRIBE results")

┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│   column_name   │ column_type │  null   │   key   │ default │ extra │
│     varchar     │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ resultId        │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ raceId          │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ driverId        │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ constructorId   │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ number          │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ grid            │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ position        │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ positionText    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ positionOrder   │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ points          │ DOUBLE      │ YES     │ NULL    │ NULL    │ 

In [None]:
#driverId is the link between these two
