## F1 

### 1. Importing And Cleaning 

In [228]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "pranay13257/f1-dnf-classification",
    "f1_dnf.csv" 
)

In [229]:
# First look at data
df.head()
df.info()
# target_finish == 1 means the driver finished the race and 0 means DNF 
df.target_finish.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 31 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         10000 non-null  int64  
 1   raceId           10000 non-null  int64  
 2   year             10000 non-null  int64  
 3   round            10000 non-null  int64  
 4   grid             10000 non-null  int64  
 5   positionOrder    10000 non-null  int64  
 6   points           9029 non-null   float64
 7   laps             9022 non-null   float64
 8   milliseconds     8982 non-null   object 
 9   fastestLap       10000 non-null  object 
 10  rank             10000 non-null  object 
 11  fastestLapTime   10000 non-null  object 
 12  fastestLapSpeed  9047 non-null   object 
 13  driverRef        10000 non-null  object 
 14  surname          10000 non-null  object 
 15  forename         10000 non-null  object 
 16  dob              10000 non-null  object 
 17  nationality_x

target_finish
0    7105
1    2895
Name: count, dtype: int64

In [230]:
df[['fastestLap','fastestLapSpeed','fastestLapTime']].sample(5)

Unnamed: 0,fastestLap,fastestLapSpeed,fastestLapTime
457,\N,,\N
7838,\N,,\N
4251,\N,\N,\N
697,\N,\N,\N
259,\N,\N,\N


In [231]:
# Before converting, see what string values are hiding
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    unique_vals = df[col].unique()
    print(f"\n{col}: {len(unique_vals)} unique values")
    print(f"Sample values: {unique_vals[:10]}")
    
    # Check for suspicious strings
    suspicious = df[df[col].isin(['\\N', 'NaN', 'nan', 'NULL', 'null', '', ' '])][col].value_counts()
    if len(suspicious) > 0:
        print(f"Found suspicious values in {col}:")
        print(suspicious)


milliseconds: 2600 unique values
Sample values: [nan '\\N' '4754232' '5513145' '6523887' '5300227' '6243700' '6918000'
 '9989700' '5251638']
Found suspicious values in milliseconds:
milliseconds
\N    6375
Name: count, dtype: int64

fastestLap: 80 unique values
Sample values: ['\\N' '57' '61' '53' '64' '55' '54' '63' '46' '29']
Found suspicious values in fastestLap:
fastestLap
\N    6895
Name: count, dtype: int64

rank: 26 unique values
Sample values: ['\\N' '16' '17' '9' '3' '7' '2' '13' '1' '11']
Found suspicious values in rank:
rank
\N    6798
Name: count, dtype: int64

fastestLapTime: 2990 unique values
Sample values: ['\\N' '1:10.005' '1:16.992' '1:27.092' '1:17.472' '1:40.315' '1:17.776'
 '1:19.491' '1:21.370' '1:12.268']
Found suspicious values in fastestLapTime:
fastestLapTime
\N    6895
Name: count, dtype: int64

fastestLapSpeed: 2743 unique values
Sample values: ['\\N' '222.052' '156.031' '239.457' '202.648' nan '197.844' '215.557'
 '190.419' '214.651']
Found suspicious valu

In [232]:
# They are objects so we first need to convert them to numeric
df['points'] = pd.to_numeric(df['points'], errors='coerce')
df['laps'] = pd.to_numeric(df['laps'], errors='coerce')
df['milliseconds'] = pd.to_numeric(df['milliseconds'], errors='coerce')
df['fastestLapSpeed'] = pd.to_numeric(df['fastestLapSpeed'], errors='coerce')
df['rank'] = pd.to_numeric(df['rank'], errors='coerce')
df['fastestLap'] = pd.to_numeric(df['fastestLap'], errors='coerce')
df['fastestLapTime'] = pd.to_numeric(df['fastestLapTime'], errors='coerce')
df['fastestLapSpeed'] = pd.to_numeric(df['fastestLapSpeed'], errors='coerce')

In [233]:
# Drops columns with too many NaNs
df = df.drop(columns=['milliseconds','fastestLap','rank','fastestLapTime','fastestLapSpeed'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   resultId        10000 non-null  int64  
 1   raceId          10000 non-null  int64  
 2   year            10000 non-null  int64  
 3   round           10000 non-null  int64  
 4   grid            10000 non-null  int64  
 5   positionOrder   10000 non-null  int64  
 6   points          9029 non-null   float64
 7   laps            9022 non-null   float64
 8   driverRef       10000 non-null  object 
 9   surname         10000 non-null  object 
 10  forename        10000 non-null  object 
 11  dob             10000 non-null  object 
 12  nationality_x   10000 non-null  object 
 13  constructorRef  10000 non-null  object 
 14  name            10000 non-null  object 
 15  nationality_y   10000 non-null  object 
 16  circuitRef      10000 non-null  object 
 17  circuitId       10000 non-null  

In [234]:
# Checking NaNs
print(df.isna().sum())
print(df.points.unique(),"\n",df.laps.unique(),"\n")

resultId            0
raceId              0
year                0
round               0
grid                0
positionOrder       0
points            971
laps              978
driverRef           0
surname             0
forename            0
dob                 0
nationality_x       0
constructorRef      0
name                0
nationality_y       0
circuitRef          0
circuitId           0
name_y              0
location            0
country             0
lat                 0
lng                 0
alt                 0
date                0
target_finish       0
dtype: int64
[ 3.    0.     nan 25.    1.    4.   10.    9.   12.    5.    6.    2.
  8.   11.   18.   26.   15.   30.    8.5  13.   19.    1.5   7.    6.5
  0.5   3.5   8.14 16.    4.5  36.  ] 
 [ 77.  16.   0.  69.   9.  53.  70.  79.  81.  51.  75.  55.  nan  66.
  67.  26.  27.  32.  44.  38.  78.  58.   8.  68.  13. 100.  34.  56.
  71.   7.  64.  30.  61.  62.  52.  65.  14.  57.   1.  76.  60.  31.
  54.  12.  41.  35

In [235]:
for col in df.columns:
    print(f"Column: {col}")
    print(df[col].nunique())
    print("\n")

Column: resultId
10000


Column: raceId
1125


Column: year
75


Column: round
24


Column: grid
35


Column: positionOrder
39


Column: points
29


Column: laps
138


Column: driverRef
677


Column: surname
639


Column: forename
407


Column: dob
668


Column: nationality_x
40


Column: constructorRef
174


Column: name
77


Column: nationality_y
21


Column: circuitRef
77


Column: circuitId
77


Column: name_y
174


Column: location
75


Column: country
35


Column: lat
77


Column: lng
77


Column: alt
66


Column: date
1125


Column: target_finish
2




In [242]:
# Filling NaNs in 'points' 
# Use race-specific median for all rows
df['points'] = df.groupby('raceId')['points'].transform(lambda x: x.fillna(x.median()))
# Fill any remaining NaNs with overall median
df['points'].fillna(df['points'].median(), inplace=True)

print(df['points'].isna().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['points'].fillna(df['points'].median(), inplace=True)


In [243]:
#Show what are still NaNs
print(df[df.isna().any(axis=1)])

      resultId  raceId  year  round  grid  positionOrder  points  laps  \
43        3216     171  2000     14     1              1    10.0   NaN   
399      11175     470  1982      4     5              3     4.0   NaN   
846      17154     707  1965      9     7              3     4.0   NaN   
1054     20118     837  1950      5     2              1     0.0   NaN   
1337      2976     160  2000      3     1              2     6.0   NaN   
1730      9794     419  1986     16     4              1     0.0   NaN   
2687     13584     555  1977     13     2              2     6.0   NaN   
3872      2116     120  2003     13     7              2     8.0   NaN   
3987      8679     378  1988      7     1              1     9.0   NaN   
4452     15695     635  1971      4     1              1     9.0   NaN   
5045     25366    1072  2021     21     1              1    26.0   NaN   
5108     12326     510  1980     13    22              5     2.0   NaN   
5171     11188     471  1982      5   

In [244]:
# Filling NaNs in 'laps'
df['laps'] = df.groupby('raceId')['laps'].transform(lambda x: x.fillna(x.median()))
df['laps'].fillna(df['laps'].median(), inplace=True)

print(df['laps'].isna().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['laps'].fillna(df['laps'].median(), inplace=True)


In [245]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   resultId        10000 non-null  int64  
 1   raceId          10000 non-null  int64  
 2   year            10000 non-null  int64  
 3   round           10000 non-null  int64  
 4   grid            10000 non-null  int64  
 5   positionOrder   10000 non-null  int64  
 6   points          10000 non-null  float64
 7   laps            10000 non-null  float64
 8   driverRef       10000 non-null  object 
 9   surname         10000 non-null  object 
 10  forename        10000 non-null  object 
 11  dob             10000 non-null  object 
 12  nationality_x   10000 non-null  object 
 13  constructorRef  10000 non-null  object 
 14  name            10000 non-null  object 
 15  nationality_y   10000 non-null  object 
 16  circuitRef      10000 non-null  object 
 17  circuitId       10000 non-null  

In [269]:
df.sample(5)

Unnamed: 0,resultId,raceId,year,round,grid,positionOrder,points,laps,driverRef,surname,...,circuitRef,circuitId,name_y,location,country,lat,lng,alt,date,target_finish
297,5213,259,1994,3,25,19,0.0,23.0,gachot,Gachot,...,imola,21,Pacific,Imola,Italy,44.3439,11.7167,37,1994-05-01,0
5583,24269,1013,2019,4,9,7,6.0,51.0,sainz,Sainz,...,baku,73,McLaren,Baku,Azerbaijan,40.3725,49.8533,-7,2019-04-28,1
7918,683,50,2007,15,9,7,4.8,67.0,kubica,Kubica,...,fuji,16,BMW Sauber,Oyama,Japan,35.3717,138.927,583,2007-09-30,1
9857,6657,310,1991,6,19,15,0.0,48.0,suzuki,Suzuki,...,rodriguez,32,Lola,Mexico City,Mexico,19.4042,-99.0907,2227,1991-06-16,0
4982,1011,65,2006,13,12,5,4.0,69.0,coulthard,Coulthard,...,hungaroring,11,Red Bull,Budapest,Hungary,47.5789,19.2486,264,2006-08-06,0


## Feature Engineering