In [6]:
import pandas as pd
import requests
import time

### STINTS - TYRES

In [7]:
# Load the race dataset from CSV into `data`; the feature merges further down
# in this notebook are built on top of this frame.

data = pd.read_csv('../../datasets/thesis_f1_race_data_all.csv')
data.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,country_name,country_code_y,circuit_short_name,qualifying_position_x,starting_position,qualifying_position_y,final_position,wins_before,pit_stops_count,avg_pit_stop_duration
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,Saudi Arabia,KSA,Jeddah,15,15,1,2,1,0,0.0
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,Saudi Arabia,KSA,Jeddah,20,20,16,16,0,0,0.0
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,Saudi Arabia,KSA,Jeddah,19,19,7,17,0,0,0.0
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,Saudi Arabia,KSA,Jeddah,10,9,9,9,0,0,0.0
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,Saudi Arabia,KSA,Jeddah,1,1,2,1,0,0,0.0


In [8]:
data.shape

(2154, 29)

In [9]:
# Take only the session_key column from `data` and keep its unique values in all_sessions_df
all_sessions_df = data[['session_key']].drop_duplicates().reset_index(drop=True)
all_sessions_df.head()

Unnamed: 0,session_key
0,7779
1,7787
2,7953
3,9069
4,9070


In [10]:
# Collect the unique (driver_number, broadcast_name) pairs from `data` into all_drivers_df
all_drivers_df = data[['driver_number','broadcast_name']].drop_duplicates().reset_index(drop=True)
all_drivers_df.head()

Unnamed: 0,driver_number,broadcast_name
0,1,M VERSTAPPEN
1,2,L SARGEANT
2,4,L NORRIS
3,10,P GASLY
4,11,S PEREZ


In [11]:
# Fetch the tyre compounds (one record per stint)
# https://api.openf1.org/v1/stints?session_key={session_key}
# Example fields of a record returned by the endpoint:
#     "compound": "SOFT",
#     "driver_number": 16,
#     "lap_end": 20,
#     "lap_start": 1,
#     "meeting_key": 1219,
#     "session_key": 9165,
#     "stint_number": 1,
#     "tyre_age_at_start": 3

def get_tyre_stints(session_keys, timeout=30, pause=0.5):
    """Download tyre-stint records from the OpenF1 API for the given sessions.

    Parameters
    ----------
    session_keys : iterable
        Session keys to query, one HTTP request per key.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up on that session.
    pause : float, optional
        Seconds to sleep after each request.

    Returns
    -------
    pandas.DataFrame
        One row per stint with the columns session_key, driver_number,
        stint_number, compound, lap_start, lap_end, tyre_age_at_start and
        meeting_key. The columns are present even when nothing was fetched,
        so downstream groupby/merge code does not fail on an empty result.

    Sessions whose request fails (network error, timeout, non-200 status) or
    returns no records are reported with a printed message and skipped.
    """
    columns = [
        'session_key', 'driver_number', 'stint_number', 'compound',
        'lap_start', 'lap_end', 'tyre_age_at_start', 'meeting_key',
    ]
    all_tyres = []

    for session_key in session_keys:
        url = f"https://api.openf1.org/v1/stints?session_key={session_key}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Network failure or timeout: report it and move on, like a non-200 reply.
            print(f"Error fetching tyre stints for session {session_key}: {exc}")
            time.sleep(pause)
            continue

        if response.status_code == 200:
            stints = response.json()

            if isinstance(stints, list) and len(stints) > 0:
                for stint in stints:
                    all_tyres.append({
                        # The requested key is stored, not the one echoed by the API.
                        'session_key': session_key,
                        'driver_number': stint.get('driver_number'),
                        'stint_number': stint.get('stint_number'),
                        'compound': stint.get('compound'),
                        'lap_start': stint.get('lap_start'),
                        'lap_end': stint.get('lap_end'),
                        'tyre_age_at_start': stint.get('tyre_age_at_start'),
                        'meeting_key': stint.get('meeting_key')
                    })

                print(f"Session {session_key}: Pobrano {len(stints)} stints")
            else:
                print(f"Session {session_key}: Brak danych o oponach")
        else:
            print(f"Error fetching tyre stints for session {session_key}: {response.status_code}")

        # Use the `time` module from the import cell so this function does not
        # depend on a `sleep` name imported in a later cell.
        time.sleep(pause)

    return pd.DataFrame(all_tyres, columns=columns)

In [12]:
# get tyres stints for all sessions
# NOTE: `sleep` is imported here rather than in the import cell at the top;
# any helper that calls the bare name `sleep` needs this line to have run first.
from time import sleep
tyres_stints_df = get_tyre_stints(all_sessions_df['session_key'].tolist())
tyres_stints_df.head()

Session 7779: Pobrano 44 stints
Session 7787: Pobrano 85 stints
Session 7953: Pobrano 70 stints
Session 9069: Pobrano 23 stints
Session 9070: Pobrano 43 stints
Session 9078: Pobrano 40 stints
Session 9094: Pobrano 57 stints
Session 9102: Pobrano 63 stints
Session 9110: Pobrano 53 stints
Session 9117: Pobrano 31 stints
Session 9118: Pobrano 83 stints
Session 9126: Pobrano 44 stints
Session 9133: Pobrano 56 stints
Session 9140: Pobrano 42 stints
Session 9141: Pobrano 57 stints
Session 9149: Pobrano 121 stints
Session 9157: Pobrano 45 stints
Session 9165: Pobrano 44 stints
Session 9173: Pobrano 63 stints
Session 9181: Pobrano 58 stints
Session 9189: Pobrano 51 stints
Session 9197: Pobrano 57 stints
Session 9204: Pobrano 20 stints
Session 9205: Pobrano 87 stints
Session 9212: Pobrano 20 stints
Session 9213: Pobrano 56 stints
Session 9220: Pobrano 23 stints
Session 9221: Pobrano 73 stints
Session 9472: Pobrano 63 stints
Session 9480: Pobrano 39 stints
Session 9488: Pobrano 55 stints
Session

Unnamed: 0,session_key,driver_number,stint_number,compound,lap_start,lap_end,tyre_age_at_start,meeting_key
0,7779,77,1,MEDIUM,1.0,9.0,0,1142
1,7779,63,1,MEDIUM,1.0,18.0,0,1142
2,7779,27,1,MEDIUM,1.0,11.0,0,1142
3,7779,2,1,HARD,1.0,18.0,0,1142
4,7779,23,1,MEDIUM,1.0,17.0,0,1142


In [13]:
# For every driver in every session, count how many stints they had
# (tyres_stints_df holds one row per stint, so the group size is the stint count).
tyres_stints_count_df = tyres_stints_df.groupby(['session_key', 'driver_number']).size().reset_index(name='num_stints')
tyres_stints_count_df.head()

Unnamed: 0,session_key,driver_number,num_stints
0,7779,1,2
1,7779,2,2
2,7779,4,3
3,7779,10,2
4,7779,11,2


In [14]:
# For each driver and session, gather the tyre compounds of their stints into one list column.
# NOTE(review): the list follows the row order of tyres_stints_df; it is not explicitly
# sorted by stint_number — confirm if the order matters.
tyres_stints_list_df = tyres_stints_df.groupby(['session_key', 'driver_number'])['compound'].apply(list).reset_index(name='tyre_compounds')
tyres_stints_list_df.head()

Unnamed: 0,session_key,driver_number,tyre_compounds
0,7779,1,"[MEDIUM, HARD]"
1,7779,2,"[HARD, MEDIUM]"
2,7779,4,"[SOFT, HARD, MEDIUM]"
3,7779,10,"[MEDIUM, HARD]"
4,7779,11,"[MEDIUM, HARD]"


In [15]:
# Starting from tyres_stints_list_df and its 'tyre_compounds' list column, derive one
# 0/1 indicator per compound (used_soft, used_medium, used_hard), then drop the list
# column because only the indicators are needed afterwards.
compound_flags = {
    f'used_{name.lower()}': tyres_stints_list_df['tyre_compounds'].map(lambda fitted: int(name in fitted))
    for name in ('SOFT', 'MEDIUM', 'HARD')
}
tyres_stints_list_df = (
    tyres_stints_list_df
    .assign(**compound_flags)
    .drop(columns=['tyre_compounds'])
)
tyres_stints_list_df.head()

Unnamed: 0,session_key,driver_number,used_soft,used_medium,used_hard
0,7779,1,0,1,1
1,7779,2,0,1,1
2,7779,4,1,1,1
3,7779,10,0,1,1
4,7779,11,0,1,1


In [16]:
# Left-merge the stint count and then the compound-usage flags onto `data`,
# matching on session_key and driver_number.
data_stints = (
    data
    .merge(tyres_stints_count_df, on=['session_key', 'driver_number'], how='left')
    .merge(tyres_stints_list_df, on=['session_key', 'driver_number'], how='left')
)
data_stints.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,starting_position,qualifying_position_y,final_position,wins_before,pit_stops_count,avg_pit_stop_duration,num_stints,used_soft,used_medium,used_hard
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,15,1,2,1,0,0.0,2.0,0.0,1.0,1.0
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,20,16,16,0,0,0.0,2.0,0.0,1.0,1.0
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,19,7,17,0,0,0.0,3.0,1.0,1.0,1.0
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,9,9,9,0,0,0.0,2.0,0.0,1.0,1.0
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,1,2,1,0,0,0.0,2.0,0.0,1.0,1.0


In [17]:
# Rows with no matching stint data hold NaN after the left merge:
# replace those with 0 and cast the four stint columns to int.
for stint_column in ('used_soft', 'used_medium', 'used_hard', 'num_stints'):
    data_stints[stint_column] = data_stints[stint_column].fillna(0).astype(int)
data_stints.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,starting_position,qualifying_position_y,final_position,wins_before,pit_stops_count,avg_pit_stop_duration,num_stints,used_soft,used_medium,used_hard
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,15,1,2,1,0,0.0,2,0,1,1
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,20,16,16,0,0,0.0,2,0,1,1
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,19,7,17,0,0,0.0,3,1,1,1
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,9,9,9,0,0,0.0,2,0,1,1
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,1,2,1,0,0,0.0,2,0,1,1


In [18]:
# Verify that everything still lines up after the merges.
# (a note dated 21.09.2021 recorded the shape as 1635, 33 — presumably from an earlier run; confirm)
data_stints.shape

(2154, 33)

In [19]:
# format the date and time fields

from datetime import datetime

def format_datetime_fields(datetime_string):
    """Split an ISO-8601 timestamp string into separate date and time strings.

    A trailing 'Z' is rewritten as '+00:00' before parsing.

    Returns a ``(date, time)`` tuple of ISO-formatted strings. The time part
    comes from ``datetime.time()``, so it carries no UTC offset.
    """
    normalized = datetime_string.replace('Z', '+00:00')
    parsed = datetime.fromisoformat(normalized)
    date_part = parsed.date().isoformat()
    time_part = parsed.time().isoformat()
    return date_part, time_part

### RACE CONTROL (RF, YF, YYF, SC, PEN)

In [20]:
# https://api.openf1.org/v1/race_control
# get all race control for all session in meetings

def get_race_control(session_keys, timeout=30, pause=0.5):
    """
    Download all race-control messages from the OpenF1 API for the given sessions.

    Parameters
    ----------
    session_keys : iterable
        Session keys to query, one HTTP request per key.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up on that session.
    pause : float, optional
        Seconds to sleep after each request.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns session_key, date, time,
        category, flag, lap_number and message. The columns are present even
        when nothing was fetched, so later filters such as
        ``race_controls['flag']`` do not raise KeyError on an empty result.

    Sessions whose request fails (network error, timeout, non-200 status) or
    returns no records are reported with a printed message and skipped.
    """
    columns = ['session_key', 'date', 'time', 'category', 'flag', 'lap_number', 'message']
    all_race_control = []

    for session_key in session_keys:
        url = f"https://api.openf1.org/v1/race_control?session_key={session_key}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Network failure or timeout: report it and move on, like a non-200 reply.
            print(f"Error fetching race control for session {session_key}: {exc}")
            time.sleep(pause)
            continue

        if response.status_code == 200:
            messages = response.json()

            if isinstance(messages, list) and len(messages) > 0:
                for control in messages:
                    # Local names deliberately avoid `time`, which would shadow
                    # the `time` module used for the pause below.
                    date_part, time_part = format_datetime_fields(control['date'])

                    all_race_control.append({
                        'session_key': session_key,
                        'date': date_part,
                        'time': time_part,
                        'category': control.get('category'),
                        'flag': control.get('flag'),
                        'lap_number': control.get('lap_number'),
                        'message': control.get('message')
                    })

                print(f"Session {session_key}: Pobrano {len(messages)} wiadomości race control")
            else:
                print(f"Session {session_key}: Brak danych race control")
        else:
            print(f"Error fetching race control for session {session_key}: {response.status_code}")

        time.sleep(pause)

    return pd.DataFrame(all_race_control, columns=columns)

In [21]:
# Fetch the race-control messages for every session key in all_sessions_df
race_controls = get_race_control(all_sessions_df['session_key'].tolist())
race_controls.head()

Session 7779: Pobrano 56 wiadomości race control
Session 7787: Pobrano 103 wiadomości race control
Session 7953: Pobrano 73 wiadomości race control
Session 9069: Pobrano 48 wiadomości race control
Session 9070: Pobrano 52 wiadomości race control
Session 9078: Pobrano 38 wiadomości race control
Session 9094: Pobrano 242 wiadomości race control
Session 9102: Pobrano 47 wiadomości race control
Session 9110: Pobrano 75 wiadomości race control
Session 9117: Pobrano 44 wiadomości race control
Session 9118: Pobrano 210 wiadomości race control
Session 9126: Pobrano 44 wiadomości race control
Session 9133: Pobrano 85 wiadomości race control
Session 9140: Pobrano 84 wiadomości race control
Session 9141: Pobrano 54 wiadomości race control
Session 9149: Pobrano 74 wiadomości race control
Session 9157: Pobrano 43 wiadomości race control
Session 9165: Pobrano 74 wiadomości race control
Session 9173: Pobrano 71 wiadomości race control
Session 9181: Pobrano 96 wiadomości race control
Session 9189: Pob

Unnamed: 0,session_key,date,time,category,flag,lap_number,message
0,7779,2023-03-19,16:10:31,Other,,1.0,PINK HEAD PADDING MATERIAL MUST BE USED
1,7779,2023-03-19,16:20:00,Flag,GREEN,1.0,GREEN LIGHT - PIT EXIT OPEN
2,7779,2023-03-19,16:30:00,Other,,1.0,PIT EXIT CLOSED
3,7779,2023-03-19,16:45:27,Other,,1.0,RISK OF RAIN FOR F1 RACE IS 0%
4,7779,2023-03-19,16:57:03,Drs,,1.0,DRS DISABLED


In [22]:
# Save the fetched race-control messages to CSV (index column not written)
race_controls.to_csv('../../datasets/race_controls.csv', index=False)

In [23]:
# Count, for each session, the race-control messages whose flag is 'RED'.
# NOTE(review): this counts messages, so a single red-flag period may be counted
# more than once if it produced several RED messages — confirm.
is_red_flag = race_controls['flag'] == 'RED'
red_flags_count_df = (
    race_controls.loc[is_red_flag]
    .groupby('session_key')
    .size()
    .reset_index(name='num_red_flags')
)
red_flags_count_df.head()

Unnamed: 0,session_key,num_red_flags
0,7787,4
1,9149,1
2,9181,1
3,9205,1
4,9496,1


In [24]:
data_stints.shape

(2154, 33)

In [25]:
# Attach the per-session red-flag count on session_key; sessions with no RED
# message have no match in the left merge and are set to 0.
data_stints_rf = (
    data_stints
    .merge(red_flags_count_df, on='session_key', how='left')
    .assign(num_red_flags=lambda merged: merged['num_red_flags'].fillna(0).astype(int))
)
data_stints_rf.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,qualifying_position_y,final_position,wins_before,pit_stops_count,avg_pit_stop_duration,num_stints,used_soft,used_medium,used_hard,num_red_flags
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,1,2,1,0,0.0,2,0,1,1,0
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,16,16,0,0,0.0,2,0,1,1,0
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,7,17,0,0,0.0,3,1,1,1,0
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,9,9,0,0,0.0,2,0,1,1,0
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,2,1,0,0,0.0,2,0,1,1,0


In [26]:
data_stints_rf.shape

(2154, 34)

In [27]:
# Count safety-car deployments per session: messages with category 'SafetyCar'
# whose text contains "deployed" (case-insensitive).
# Both conditions are combined into one mask over the full frame. Chaining two
# [] selections would apply a full-length mask to an already filtered frame,
# which pandas only tolerates by reindexing the mask and emitting a warning.
is_safety_car = race_controls['category'] == 'SafetyCar'
# na=False treats a missing message as "no match" instead of propagating NaN into the mask.
mentions_deployed = race_controls['message'].str.contains('deployed', case=False, na=False)
safety_car_count_df = (
    race_controls[is_safety_car & mentions_deployed]
    .groupby('session_key')
    .size()
    .reset_index(name='num_safety_car_deployed')
)
safety_car_count_df.head()

  race_controls[race_controls['category'] == 'SafetyCar']


Unnamed: 0,session_key,num_safety_car_deployed
0,7779,1
1,7787,4
2,7953,1
3,9069,2
4,9070,1


In [28]:
# Attach the per-session safety-car deployment count on session_key;
# sessions without a match in the left merge are set to 0.
data_stints_rf_sc = (
    data_stints_rf
    .merge(safety_car_count_df, on='session_key', how='left')
    .assign(num_safety_car_deployed=lambda merged: merged['num_safety_car_deployed'].fillna(0).astype(int))
)
data_stints_rf_sc.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,final_position,wins_before,pit_stops_count,avg_pit_stop_duration,num_stints,used_soft,used_medium,used_hard,num_red_flags,num_safety_car_deployed
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,2,1,0,0.0,2,0,1,1,0,1
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,16,0,0,0.0,2,0,1,1,0,1
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,17,0,0,0.0,3,1,1,1,0,1
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,9,0,0,0.0,2,0,1,1,0,1
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,1,0,0,0.0,2,0,1,1,0,1


In [29]:
data_stints_rf_sc.shape

(2154, 35)

In [30]:
# Count, for each session, the race-control messages whose flag is 'YELLOW'
is_yellow_flag = race_controls['flag'] == 'YELLOW'
yellow_flags_count_df = (
    race_controls.loc[is_yellow_flag]
    .groupby('session_key')
    .size()
    .reset_index(name='num_yellow_flags')
)
yellow_flags_count_df.head()

Unnamed: 0,session_key,num_yellow_flags
0,7779,3
1,7787,8
2,7953,4
3,9069,5
4,9070,7


In [31]:
# Count, for each session, the race-control messages whose flag is 'DOUBLE YELLOW'
is_double_yellow_flag = race_controls['flag'] == 'DOUBLE YELLOW'
double_yellow_flags_count_df = (
    race_controls.loc[is_double_yellow_flag]
    .groupby('session_key')
    .size()
    .reset_index(name='num_double_yellow_flags')
)
double_yellow_flags_count_df.head()

Unnamed: 0,session_key,num_double_yellow_flags
0,7779,2
1,7787,4
2,9069,7
3,9070,5
4,9078,1


In [32]:
# Attach the per-session yellow-flag count on session_key (the double-yellow count
# is merged in a separate cell); sessions without a match are set to 0.
data_stints_rf_sc_ry = (
    data_stints_rf_sc
    .merge(yellow_flags_count_df, on='session_key', how='left')
    .assign(num_yellow_flags=lambda merged: merged['num_yellow_flags'].fillna(0).astype(int))
)
data_stints_rf_sc_ry.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,wins_before,pit_stops_count,avg_pit_stop_duration,num_stints,used_soft,used_medium,used_hard,num_red_flags,num_safety_car_deployed,num_yellow_flags
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,1,0,0.0,2,0,1,1,0,1,3
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,0,0,0.0,2,0,1,1,0,1,3
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,0,0,0.0,3,1,1,1,0,1,3
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,0,0,0.0,2,0,1,1,0,1,3
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,0,0,0.0,2,0,1,1,0,1,3


In [33]:
data_stints_rf_sc_ry.shape

(2154, 36)

In [34]:
# Attach the per-session double-yellow-flag count on session_key;
# sessions without a match in the left merge are set to 0.
data_stints_rf_sc_ry_ryy = (
    data_stints_rf_sc_ry
    .merge(double_yellow_flags_count_df, on='session_key', how='left')
    .assign(num_double_yellow_flags=lambda merged: merged['num_double_yellow_flags'].fillna(0).astype(int))
)
data_stints_rf_sc_ry_ryy.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,pit_stops_count,avg_pit_stop_duration,num_stints,used_soft,used_medium,used_hard,num_red_flags,num_safety_car_deployed,num_yellow_flags,num_double_yellow_flags
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,0,0.0,2,0,1,1,0,1,3,2
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,0,0.0,2,0,1,1,0,1,3,2
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,0,0.0,3,1,1,1,0,1,3,2
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,0,0.0,2,0,1,1,0,1,3,2
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,0,0.0,2,0,1,1,0,1,3,2


In [35]:
data_stints_rf_sc_ry_ryy.shape

(2154, 37)

In [36]:
# penalty extraction
import re

def extract_penalty_seconds(row):
    """Parse a "<N> SECOND TIME PENALTY FOR CAR <M>" race-control message.

    `row` must support ``row['message']``. The message is converted to an
    upper-case string before matching, so the match is case-insensitive.

    Returns a pandas Series with the entries 'driver_number' and
    'penalty_seconds'. When the pattern is not found, 'driver_number' is
    missing and 'penalty_seconds' is 0.
    """
    message_text = str(row['message']).upper()
    found = re.search(r'(\d+)\s*SECOND TIME PENALTY FOR CAR (\d+)', message_text)
    if found is None:
        return pd.Series({'driver_number': None, 'penalty_seconds': 0})
    # Group 1 is the number of seconds, group 2 is the car (driver) number.
    seconds_text, car_text = found.groups()
    return pd.Series({'driver_number': int(car_text), 'penalty_seconds': int(seconds_text)})

# Pull the penalty seconds and driver numbers out of the race-control messages:
# keep only messages mentioning "SECOND TIME PENALTY" (case-insensitive, missing
# messages excluded), then parse each remaining row.
is_time_penalty = race_controls['message'].str.contains('SECOND TIME PENALTY', case=False, na=False)
penalties = race_controls.loc[is_time_penalty].copy()
penalties[['driver_number', 'penalty_seconds']] = penalties.apply(extract_penalty_seconds, axis=1)

In [37]:
# Keep only the columns session_key, driver_number, penalty_seconds
penalties = penalties[['session_key', 'driver_number', 'penalty_seconds']]
penalties.head()

Unnamed: 0,session_key,driver_number,penalty_seconds
10,7779,14,5
55,7779,14,10
150,7787,55,5
182,7953,31,5
187,7953,31,10


In [38]:
# Sum the penalties per (session_key, driver_number) before the merge, so each
# pair appears only once on the right-hand side.
penalties_sum = (
    penalties
    .groupby(['session_key', 'driver_number'])['penalty_seconds']
    .sum()
    .reset_index()
)

In [39]:
# Attach the summed penalty seconds on session_key and driver_number;
# drivers without a match in the left merge are set to 0.
data_stints_rf_sc_ry_ryy_pen = (
    data_stints_rf_sc_ry_ryy
    .merge(penalties_sum, on=['session_key', 'driver_number'], how='left')
    .assign(penalty_seconds=lambda merged: merged['penalty_seconds'].fillna(0).astype(int))
)
data_stints_rf_sc_ry_ryy_pen.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,avg_pit_stop_duration,num_stints,used_soft,used_medium,used_hard,num_red_flags,num_safety_car_deployed,num_yellow_flags,num_double_yellow_flags,penalty_seconds
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,0.0,2,0,1,1,0,1,3,2,0
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,0.0,2,0,1,1,0,1,3,2,0
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,0.0,3,1,1,1,0,1,3,2,0
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,0.0,2,0,1,1,0,1,3,2,0
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,0.0,2,0,1,1,0,1,3,2,0


In [40]:
data_stints_rf_sc_ry_ryy_pen.shape

(2154, 38)

In [41]:
# Rename: give the accumulated frame a shorter name
# (plain assignment, so both names refer to the same object, not a copy)
data_rc = data_stints_rf_sc_ry_ryy_pen
data_rc.shape

(2154, 38)

### WEATHER

In [42]:
# Fetch the weather data for every session
# https://api.openf1.org/v1/weather
# Example fields of a record returned by the endpoint:
    # "air_temperature": 27.8,
    # "date": "2023-05-07T18:42:25.233000+00:00",
    # "humidity": 58,
    # "meeting_key": 1208,
    # "pressure": 1018.7,
    # "rainfall": 0,
    # "session_key": 9078,
    # "track_temperature": 52.5,
    # "wind_direction": 136,
    # "wind_speed": 2.4

def get_weather_data(session_keys, timeout=30, pause=0.5):
    """Download weather samples from the OpenF1 API for the given sessions.

    Parameters
    ----------
    session_keys : iterable
        Session keys to query, one HTTP request per key.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up on that session.
    pause : float, optional
        Seconds to sleep after each request.

    Returns
    -------
    pandas.DataFrame
        One row per weather sample with the columns session_key, meeting_key,
        date, time, air_temperature, track_temperature, humidity, pressure,
        rainfall, wind_direction and wind_speed. The columns are present even
        when nothing was fetched, so the later groupby on session_key does not
        fail on an empty result.

    Sessions whose request fails (network error, timeout, non-200 status) or
    returns no records are reported with a printed message and skipped.
    """
    columns = [
        'session_key', 'meeting_key', 'date', 'time',
        'air_temperature', 'track_temperature', 'humidity', 'pressure',
        'rainfall', 'wind_direction', 'wind_speed',
    ]
    all_weather = []

    for session_key in session_keys:
        url = f"https://api.openf1.org/v1/weather?session_key={session_key}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Network failure or timeout: report it and move on, like a non-200 reply.
            print(f"Error fetching weather data for session {session_key}: {exc}")
            time.sleep(pause)
            continue

        if response.status_code == 200:
            samples = response.json()

            if isinstance(samples, list) and len(samples) > 0:
                for weather in samples:
                    date_w, time_w = format_datetime_fields(weather['date'])
                    all_weather.append({
                        'session_key': session_key,
                        'meeting_key': weather.get('meeting_key'),
                        'date': date_w,
                        'time': time_w,
                        'air_temperature': weather.get('air_temperature'),
                        'track_temperature': weather.get('track_temperature'),
                        'humidity': weather.get('humidity'),
                        'pressure': weather.get('pressure'),
                        'rainfall': weather.get('rainfall'),
                        'wind_direction': weather.get('wind_direction'),
                        'wind_speed': weather.get('wind_speed')
                    })

                print(f"Session {session_key}: Pobrano {len(samples)} rekordów pogody")
            else:
                print(f"Session {session_key}: Brak danych o pogodzie")
        else:
            print(f"Error fetching weather data for session {session_key}: {response.status_code}")

        # Use the `time` module from the import cell so this function does not
        # depend on a `sleep` name imported in a later cell.
        time.sleep(pause)

    return pd.DataFrame(all_weather, columns=columns)

In [43]:
# Fetch the weather samples for every session key in all_sessions_df
weather = get_weather_data(all_sessions_df['session_key'].tolist())
weather.head()

Session 7779: Pobrano 148 rekordów pogody
Session 7787: Pobrano 222 rekordów pogody
Session 7953: Pobrano 161 rekordów pogody
Session 9069: Pobrano 82 rekordów pogody
Session 9070: Pobrano 160 rekordów pogody
Session 9078: Pobrano 155 rekordów pogody
Session 9094: Pobrano 176 rekordów pogody
Session 9102: Pobrano 154 rekordów pogody
Session 9110: Pobrano 162 rekordów pogody
Session 9117: Pobrano 97 rekordów pogody
Session 9118: Pobrano 153 rekordów pogody
Session 9126: Pobrano 151 rekordów pogody
Session 9133: Pobrano 164 rekordów pogody
Session 9140: Pobrano 120 rekordów pogody
Session 9141: Pobrano 150 rekordów pogody
Session 9149: Pobrano 209 rekordów pogody
Session 9157: Pobrano 156 rekordów pogody
Session 9165: Pobrano 176 rekordów pogody
Session 9173: Pobrano 158 rekordów pogody
Session 9181: Pobrano 190 rekordów pogody
Session 9189: Pobrano 108 rekordów pogody
Session 9197: Pobrano 156 rekordów pogody
Session 9204: Pobrano 90 rekordów pogody
Session 9205: Pobrano 183 rekordów po

Unnamed: 0,session_key,meeting_key,date,time,air_temperature,track_temperature,humidity,pressure,rainfall,wind_direction,wind_speed
0,7779,1142,2023-03-19,16:01:59.956000,26.0,34.0,62.0,1010.5,0,315,2.7
1,7779,1142,2023-03-19,16:02:59.955000,26.0,34.1,61.0,1010.5,0,305,2.9
2,7779,1142,2023-03-19,16:03:59.954000,26.0,34.2,61.0,1010.5,0,325,2.7
3,7779,1142,2023-03-19,16:04:59.953000,26.0,34.2,61.0,1010.5,0,325,2.7
4,7779,1142,2023-03-19,16:05:59.952000,26.0,34.3,60.0,1010.5,0,322,2.7


In [44]:
# wind direction is probably not needed from the weather data
# the data has to be aggregated to mean values for each session

# group the weather data by session and compute the statistics
# rainfall is true or false

def summarize_weather(group):
    """Reduce one group of weather samples to a Series of summary statistics.

    `group` is a DataFrame with the columns air_temperature, track_temperature,
    humidity, rainfall, wind_speed and pressure.

    Returns a pandas Series with mean/min/max of the air temperature, the mean
    of track temperature, humidity, wind speed and pressure, and the maximum
    of rainfall.
    """
    # (output label, source column, Series method to call)
    plan = [
        ('air_temp_mean', 'air_temperature', 'mean'),
        ('air_temp_min', 'air_temperature', 'min'),
        ('air_temp_max', 'air_temperature', 'max'),
        ('track_temp_mean', 'track_temperature', 'mean'),
        ('humidity_mean', 'humidity', 'mean'),
        ('rainfall_max', 'rainfall', 'max'),
        ('wind_speed_mean', 'wind_speed', 'mean'),
        ('pressure_mean', 'pressure', 'mean'),
    ]
    summary = {}
    for label, column, statistic in plan:
        summary[label] = getattr(group[column], statistic)()
    return pd.Series(summary)


In [45]:
# Aggregate the weather samples to one summary row per session.
# Only the columns that summarize_weather reads are selected before .apply, so the
# grouping column session_key is not passed into the function; newer pandas
# versions warn when groupby.apply operates on the grouping columns.
weather_value_columns = [
    'air_temperature', 'track_temperature', 'humidity',
    'rainfall', 'wind_speed', 'pressure',
]
sumarized_weather = (
    weather
    .groupby('session_key')[weather_value_columns]
    .apply(summarize_weather)
    .reset_index()
)
sumarized_weather.head()

  sumarized_weather = weather.groupby('session_key').apply(summarize_weather).reset_index()


Unnamed: 0,session_key,air_temp_mean,air_temp_min,air_temp_max,track_temp_mean,humidity_mean,rainfall_max,wind_speed_mean,pressure_mean
0,7779,26.091892,25.8,26.4,31.792568,57.790541,0.0,1.772297,1010.92973
1,7787,17.44955,16.8,18.1,30.13964,54.157658,0.0,1.127027,1018.366667
2,7953,27.431677,26.2,29.8,31.011801,21.496894,0.0,0.68323,1016.863975
3,9069,22.20122,21.8,22.6,36.831707,58.280488,1.0,0.730488,1008.90122
4,9070,24.860625,23.9,25.6,41.21,49.225,0.0,1.083125,1008.64625


In [46]:
# Left-merge the per-session weather summary onto data_rc (key: session_key)
data_rc_weather = data_rc.merge(sumarized_weather, on='session_key', how='left')
data_rc_weather.head()  

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,num_double_yellow_flags,penalty_seconds,air_temp_mean,air_temp_min,air_temp_max,track_temp_mean,humidity_mean,rainfall_max,wind_speed_mean,pressure_mean
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,2,0,26.091892,25.8,26.4,31.792568,57.790541,0.0,1.772297,1010.92973
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,2,0,26.091892,25.8,26.4,31.792568,57.790541,0.0,1.772297,1010.92973
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,2,0,26.091892,25.8,26.4,31.792568,57.790541,0.0,1.772297,1010.92973
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,2,0,26.091892,25.8,26.4,31.792568,57.790541,0.0,1.772297,1010.92973
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,2,0,26.091892,25.8,26.4,31.792568,57.790541,0.0,1.772297,1010.92973


In [47]:
data_rc_weather.shape

(2154, 46)

### FINAL DATA TO CSV WITH STINTS_RC, WEATHER

In [48]:
# Final name for the combined frame (stints, race control, weather);
# plain assignment, so it refers to the same object as data_rc_weather.
data_st_rc_we = data_rc_weather
data_st_rc_we.shape

(2154, 46)

In [49]:
# Write the full combined frame (all columns) to CSV, without the index column
data_rc_weather.to_csv('../../datasets/thesis_f1_data_st_rc_we_all.csv', index=False)

In [50]:
# only the specific columns needed for the model
# the ones from the previous model (16 columns):
# driver_number,broadcast_name,country_code_x,year,session_name,date_start,
# time_start,date_end,time_end,gmt_offset,starting_position,wins_before,
# pit_stops_count,avg_pit_stop_duration,qualifying_position,position_category

# new columns for the model:
# num_stints, used_soft, used_medium, used_hard, 
# num_red_flags, num_safety_car_deployed,num_yellow_flags, num_double_yellow_flags, penalty_seconds,
# air_temp_mean, track_temp_mean, humidity_mean, pressure_mean, 
# rainfall_max (true/false), wind_speed_mean, wind_direction (may be removed)

# final columns for the model
final_columns = [
    'driver_number', 'broadcast_name', 'country_code_x', 'year', 'session_name',
    'date_start', 'time_start', 'date_end', 'time_end', 'gmt_offset',
    'starting_position', 'wins_before', 'pit_stops_count', 'avg_pit_stop_duration',
    'qualifying_position_y',
    'num_stints', 'used_soft', 'used_medium', 'used_hard',
    'num_red_flags', 'num_safety_car_deployed', 'num_yellow_flags',
    'num_double_yellow_flags', 'penalty_seconds',
    'air_temp_mean', 'track_temp_mean', 'humidity_mean', 'pressure_mean',
    'rainfall_max', 'wind_speed_mean', 'final_position'
]

# show the length of final_columns
len(final_columns)

31

In [51]:
# Select only the final_columns from data_st_rc_we
data_final = data_st_rc_we[final_columns]
data_final.head()

Unnamed: 0,driver_number,broadcast_name,country_code_x,year,session_name,date_start,time_start,date_end,time_end,gmt_offset,...,num_yellow_flags,num_double_yellow_flags,penalty_seconds,air_temp_mean,track_temp_mean,humidity_mean,pressure_mean,rainfall_max,wind_speed_mean,final_position
0,1,M VERSTAPPEN,NED,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,...,3,2,0,26.091892,31.792568,57.790541,1010.92973,0.0,1.772297,2
1,2,L SARGEANT,USA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,...,3,2,0,26.091892,31.792568,57.790541,1010.92973,0.0,1.772297,16
2,4,L NORRIS,GBR,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,...,3,2,0,26.091892,31.792568,57.790541,1010.92973,0.0,1.772297,17
3,10,P GASLY,FRA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,...,3,2,0,26.091892,31.792568,57.790541,1010.92973,0.0,1.772297,9
4,11,S PEREZ,MEX,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,...,3,2,0,26.091892,31.792568,57.790541,1010.92973,0.0,1.772297,1


In [None]:
# Shape noted on 22.09.2025: (2154, 31)
data_final.shape

(2154, 31)

In [53]:
# Save the model-ready frame to CSV, without the index column
data_final.to_csv('../../datasets/thesis_final_model_f1_data.csv', index=False)