In [9]:
# Na bazie OpenF1Api via https://openf1.org

In [10]:
# wyznaczenie liczby sezonów - lata od tzw "ery hybrydowej" 2014-2025
def seasons_count(end_year=2025):
    start_year = 2014
    seasons = list()
    for year in range(start_year, end_year + 1):
        seasons.append(year)
    return seasons

seasons = seasons_count(2025)
print(seasons)

[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]


In [11]:
# formatowanie pól daty i czasu

from datetime import datetime

def format_datetime_fields(datetime_string):
    dt = datetime.fromisoformat(datetime_string.replace('Z', '+00:00'))
    return dt.date().isoformat(), dt.time().isoformat()

In [12]:
# pobiernie meetiings - race week 
import requests
import pandas as pd

def get_all_meetings(start_season=2014, end_season=2025):
    meetings = []
    
    for year in range(start_season, end_season + 1):
        url = f"https://api.openf1.org/v1/meetings?year={year}"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            if isinstance(data, list) and len(data) > 0:
                for meeting in data:
                    date_start, time_start = format_datetime_fields(meeting['date_start'])
                    meetings.append({
                        "circuit_key": meeting['circuit_key'],
                        "circuit_short_name": meeting['circuit_short_name'],
                        "country_code": meeting['country_code'],
                        "country_key": meeting['country_key'],
                        "country_name": meeting['country_name'],
                        "date_start": date_start,
                        "time_start": time_start,
                        "gmt_offset": meeting['gmt_offset'],
                        "location": meeting['location'],
                        "meeting_key": meeting['meeting_key'],
                        "meeting_name": meeting['meeting_name'],
                        "meeting_official_name": meeting['meeting_official_name'],
                        "year": meeting['year'],
                    })
                print(f"Fetched {len(data)} meetings for {year}")
            else:
                print(f"No meetings found for {year}")
        else:
            print(f"Error fetching data for {year}: {response.status_code}")
    
    return pd.DataFrame(meetings)

In [13]:
all_meetings_df = get_all_meetings(2014, 2025)
all_meetings_df.head()

No meetings found for 2014
No meetings found for 2015
No meetings found for 2016
Error fetching data for 2017: 429
No meetings found for 2018
Error fetching data for 2019: 429
Error fetching data for 2020: 429
No meetings found for 2021
No meetings found for 2022
Fetched 23 meetings for 2023
Fetched 25 meetings for 2024
Fetched 18 meetings for 2025


Unnamed: 0,circuit_key,circuit_short_name,country_code,country_key,country_name,date_start,time_start,gmt_offset,location,meeting_key,meeting_name,meeting_official_name,year
0,63,Sakhir,BRN,36,Bahrain,2023-02-23,07:00:00,03:00:00,Sakhir,1140,Pre-Season Testing,FORMULA 1 ARAMCO PRE-SEASON TESTING 2023,2023
1,63,Sakhir,BRN,36,Bahrain,2023-03-03,11:30:00,03:00:00,Sakhir,1141,Bahrain Grand Prix,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2023,2023
2,149,Jeddah,KSA,153,Saudi Arabia,2023-03-17,13:30:00,03:00:00,Jeddah,1142,Saudi Arabian Grand Prix,FORMULA 1 STC SAUDI ARABIAN GRAND PRIX 2023,2023
3,10,Melbourne,AUS,5,Australia,2023-03-31,01:30:00,11:00:00,Melbourne,1143,Australian Grand Prix,FORMULA 1 ROLEX AUSTRALIAN GRAND PRIX 2023,2023
4,144,Baku,AZE,30,Azerbaijan,2023-04-28,09:30:00,04:00:00,Baku,1207,Azerbaijan Grand Prix,FORMULA 1 AZERBAIJAN GRAND PRIX 2023,2023


In [14]:
# pobierz kolumny z tabeli meetings
meetings_columns = all_meetings_df.columns.tolist()
print("Meetings columns:", meetings_columns)

Meetings columns: ['circuit_key', 'circuit_short_name', 'country_code', 'country_key', 'country_name', 'date_start', 'time_start', 'gmt_offset', 'location', 'meeting_key', 'meeting_name', 'meeting_official_name', 'year']


In [15]:
# unikalne wartości w kolumnie meeting_key
meetings_key_unique = all_meetings_df['meeting_key'].unique()
print(meetings_key_unique)

[1140 1141 1142 1143 1207 1208 1210 1211 1212 1213 1214 1215 1216 1217
 1218 1219 1220 1221 1222 1223 1224 1225 1226 1228 1229 1230 1231 1232
 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246
 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260
 1261 1262 1263 1264 1277 1265 1266 1267 1268 1269]


In [16]:
# pobranie poszczególnych wyścigów w określonych sezonach - sessions
import requests
import pandas as pd
from time import sleep

def get_all_sessions(meetings):
    sessions = []
    
    for meeting in meetings_key_unique:
        url = f"https://api.openf1.org/v1/sessions?meeting_key={meeting}"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            if isinstance(data, list) and len(data) > 0:
                for session in data:
                    date_start, time_start = format_datetime_fields(session['date_start'])
                    date_end, time_end = format_datetime_fields(session['date_end'])
                    
                    sessions.append({
                        'meeting_key': session['meeting_key'],
                        'year': session['year'],
                        'session_key': session['session_key'],
                        'session_type': session['session_type'],
                        'session_name': session['session_name'],
                        'date_start': date_start,
                        'time_start': time_start,
                        'date_end': date_end,
                        'time_end': time_end,
                        'gmt_offset': session['gmt_offset'],
                        'location': session['location'],
                        'country_name': session['country_name'],
                        'country_code': session['country_code'],
                        'circuit_short_name': session['circuit_short_name'],
                    })
                
                print(f"Fetched {len(data)} sessions for {meeting}")
            else:
                print(f"No sessions found for {meeting}")
        else:
            print(f"Error fetching data for {meeting}: {response.status_code}")
        sleep(1)
    return pd.DataFrame(sessions)

# Pobierz wszystkie sesje 2014–2024
all_sessions_df = get_all_sessions(meetings_key_unique)

Fetched 3 sessions for 1140
Fetched 5 sessions for 1141
Fetched 5 sessions for 1142
Fetched 5 sessions for 1143
Fetched 5 sessions for 1207
Fetched 5 sessions for 1208
Fetched 5 sessions for 1210
Fetched 5 sessions for 1211
Fetched 5 sessions for 1212
Fetched 5 sessions for 1213
Fetched 5 sessions for 1214
Fetched 5 sessions for 1215
Fetched 5 sessions for 1216
Fetched 5 sessions for 1217
Fetched 5 sessions for 1218
Fetched 5 sessions for 1219
Fetched 5 sessions for 1220
Fetched 5 sessions for 1221
Fetched 5 sessions for 1222
Fetched 5 sessions for 1223
Fetched 5 sessions for 1224
Fetched 5 sessions for 1225
Fetched 5 sessions for 1226
Fetched 3 sessions for 1228
Fetched 5 sessions for 1229
Fetched 5 sessions for 1230
Fetched 5 sessions for 1231
Fetched 5 sessions for 1232
Fetched 5 sessions for 1233
Fetched 5 sessions for 1234
Fetched 5 sessions for 1235
Fetched 5 sessions for 1236
Fetched 5 sessions for 1237
Fetched 5 sessions for 1238
Fetched 5 sessions for 1239
Fetched 5 sessions f

In [17]:
all_sessions_df.head()

Unnamed: 0,meeting_key,year,session_key,session_type,session_name,date_start,time_start,date_end,time_end,gmt_offset,location,country_name,country_code,circuit_short_name
0,1140,2023,9222,Practice,Practice 1,2023-02-23,07:00:00,2023-02-23,16:30:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
1,1140,2023,7763,Practice,Practice 2,2023-02-24,07:00:00,2023-02-24,16:30:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
2,1140,2023,7764,Practice,Practice 3,2023-02-25,07:00:00,2023-02-25,16:30:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
3,1141,2023,7765,Practice,Practice 1,2023-03-03,11:30:00,2023-03-03,12:30:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
4,1141,2023,7766,Practice,Practice 2,2023-03-03,15:00:00,2023-03-03,16:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir


In [18]:
all_sessions_df.tail(100)

Unnamed: 0,meeting_key,year,session_key,session_type,session_name,date_start,time_start,date_end,time_end,gmt_offset,location,country_name,country_code,circuit_short_name
223,1250,2024,9639,Practice,Practice 3,2024-11-23,02:30:00,2024-11-23,03:30:00,-08:00:00,Las Vegas,United States,USA,Las Vegas
224,1250,2024,9640,Qualifying,Qualifying,2024-11-23,06:00:00,2024-11-23,07:00:00,-08:00:00,Las Vegas,United States,USA,Las Vegas
225,1250,2024,9644,Race,Race,2024-11-24,06:00:00,2024-11-24,08:00:00,-08:00:00,Las Vegas,United States,USA,Las Vegas
226,1251,2024,9645,Practice,Practice 1,2024-11-29,13:30:00,2024-11-29,14:30:00,03:00:00,Lusail,Qatar,QAT,Lusail
227,1251,2024,9650,Qualifying,Sprint Qualifying,2024-11-29,17:30:00,2024-11-29,18:14:00,03:00:00,Lusail,Qatar,QAT,Lusail
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,1268,2025,9912,Race,Race,2025-09-07,13:00:00,2025-09-07,15:00:00,02:00:00,Monza,Italy,ITA,Monza
319,1269,2025,9897,Practice,Practice 1,2025-09-19,08:30:00,2025-09-19,09:30:00,04:00:00,Baku,Azerbaijan,AZE,Baku
320,1269,2025,9898,Practice,Practice 2,2025-09-19,12:00:00,2025-09-19,13:00:00,04:00:00,Baku,Azerbaijan,AZE,Baku
321,1269,2025,9899,Practice,Practice 3,2025-09-20,08:30:00,2025-09-20,09:30:00,04:00:00,Baku,Azerbaijan,AZE,Baku


In [19]:
# pobranie kolumn z tabeli sessions
sessions_columns = all_sessions_df.columns.tolist()
print("Sessions columns:", sessions_columns)

Sessions columns: ['meeting_key', 'year', 'session_key', 'session_type', 'session_name', 'date_start', 'time_start', 'date_end', 'time_end', 'gmt_offset', 'location', 'country_name', 'country_code', 'circuit_short_name']


In [20]:
# unikalne wartości w kolumnie session_key
sessions_key_unique= all_sessions_df['session_key'].unique()
print(sessions_key_unique)

[ 9222  7763  7764  7765  7766  7767  7768  7953  7772  7773  7774  7775
  7779  7780  7781  7782  7783  7787  9063  9064  9278  9069  9070  9071
  9072  9073  9074  9078  9087  9088  9089  9090  9094  9095  9096  9097
  9098  9102  9103  9104  9105  9106  9110  9111  9112  9282  9117  9118
  9119  9120  9121  9122  9126  9223  9127  9128  9129  9133  9134  9135
  9286  9140  9141  9142  9143  9144  9145  9149  9150  9151  9152  9153
  9157  9158  9159  9160  9161  9165  9166  9167  9168  9169  9173  9214
  9215  9298  9220  9221  9206  9207  9294  9212  9213  9174  9175  9176
  9177  9181  9198  9304  9308  9204  9205  9182  9183  9184  9314  9189
  9190  9191  9192  9193  9197  9462  9463  9464  9465  9466  9467  9468
  9472  9473  9474  9475  9476  9480  9481  9482  9483  9484  9488  9489
  9490  9491  9492  9496  9663  9668  9672  9664  9673  9497  9502  9506
  9498  9507  9508  9509  9510  9511  9515  9516  9517  9518  9519  9523
  9524  9525  9526  9527  9531  9532  9533  9534  9

In [21]:
# pobranie kierowcow poszczegolnych zespołów dka kazdego z meeting w sessions

def get_all_drivers(unique_meetings_keys):
    drivers = []
    for meeting_key in unique_meetings_keys:
        url = f"https://api.openf1.org/v1/drivers?meeting_key={meeting_key}"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            if isinstance(data, list) and len(data) > 0:
                for driver in data:
                    drivers.append({
                        'driver_number': driver.get('driver_number'),
                        'broadcast_name': driver.get('broadcast_name'),
                        'full_name': driver.get('full_name'),
                        'first_name': driver.get('first_name'),
                        'last_name': driver.get('last_name'),
                        'name_acronym': driver.get('name_acronym'),
                        'country_code': driver.get('country_code'),
                        'team_name': driver.get('team_name'),
                        'meeting_key': meeting_key,
                        'session_key': driver.get('session_key')
                    })
                print(f"Fetched {len(data)} drivers for meeting {meeting_key}")
            else:
                print(f"No drivers found for meeting {meeting_key}")
        else:
            print(f"Error fetching data for meeting {meeting_key}: {response.status_code}")
        sleep(1)
    return pd.DataFrame(drivers)

In [22]:
drivers_df = get_all_drivers(meetings_key_unique)

Fetched 53 drivers for meeting 1140
Fetched 100 drivers for meeting 1141
Fetched 100 drivers for meeting 1142
Fetched 100 drivers for meeting 1143
Fetched 99 drivers for meeting 1207
Fetched 100 drivers for meeting 1208
Fetched 100 drivers for meeting 1210
Fetched 100 drivers for meeting 1211
Fetched 100 drivers for meeting 1212
Fetched 100 drivers for meeting 1213
Fetched 100 drivers for meeting 1214
Fetched 115 drivers for meeting 1215
Fetched 100 drivers for meeting 1216
Fetched 100 drivers for meeting 1217
Fetched 100 drivers for meeting 1218
Fetched 99 drivers for meeting 1219
Fetched 100 drivers for meeting 1220
Fetched 100 drivers for meeting 1221
Fetched 100 drivers for meeting 1222
Fetched 100 drivers for meeting 1223
Fetched 100 drivers for meeting 1224
Fetched 100 drivers for meeting 1225
Fetched 100 drivers for meeting 1226
Fetched 60 drivers for meeting 1228
Fetched 100 drivers for meeting 1229
Fetched 100 drivers for meeting 1230
Fetched 97 drivers for meeting 1231
Fetche

In [23]:
drivers_df.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code,team_name,meeting_key,session_key
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1140,7763
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1140,7763
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1140,7763
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1140,7763
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1140,7763


In [24]:
# pobranie kolumn z tabeli drivers
drivers_columns = drivers_df.columns.tolist()
print("Drivers columns:", drivers_columns)

Drivers columns: ['driver_number', 'broadcast_name', 'full_name', 'first_name', 'last_name', 'name_acronym', 'country_code', 'team_name', 'meeting_key', 'session_key']


In [25]:
unique_drivers = drivers_df['broadcast_name'].unique()

In [26]:
print(unique_drivers)

['M VERSTAPPEN' 'L SARGEANT' 'L NORRIS' 'P GASLY' 'S PEREZ' 'F ALONSO'
 'C LECLERC' 'K MAGNUSSEN' 'N DE VRIES' 'Y TSUNODA' 'G ZHOU'
 'N HULKENBERG' 'E OCON' 'L HAMILTON' 'C SAINZ' 'G RUSSELL' 'O PIASTRI'
 'A ALBON' 'F DRUGOVICH' 'V BOTTAS' 'L STROLL' 'D RICCIARDO' 'P ARON'
 'D BEGANOVIC' "Z O'SULLIVAN" 'L FORNAROLI' 'F COLAPINTO' 'M BOYA'
 'S MONTOYA' 'L BROWNING' 'R VILLAGOMEZ' 'O GRAY' 'I COHEN' 'J MARTI'
 'C MANSELL' 'T BARNARD' 'W SHIN' 'G BORTOLETO' 'O GOETHE' 'K FREDERICK'
 'G SAUCY' 'N TSOLOV' 'J EDGAR' 'G MINI' 'C COLLET' 'T SMITH' 'M ESTERSON'
 'H BARTER' 'N BEDRIN' 'A GARCIA' 'S FLOERSCH' 'R FARIA' 'R SHWARTZMAN'
 'L LAWSON' 'I HADJAR' 'F VESTI' 'O BEARMAN' 'J DOOHAN' 'T POURCHAIRE'
 "P O'WARD" 'J DENNIS' 'A IWASA' 'A ANTONELLI' 'R HIRAKAWA' 'A LECLERC'
 'K ANTONELLI' 'V MARTINS' 'A DUNNE' 'A LINDBLAD']


In [27]:
# pobranie końcowej tabeli wyników dla sesji
def get_final_session_results(session_keys, include_driver_details=True):
    all_results = []
    
    for session_key in session_keys:
        url = f"https://api.openf1.org/v1/position?session_key={session_key}"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            if isinstance(data, list) and len(data) > 0:
                # najnowsze na końcu
                data_sorted = sorted(data, key=lambda x: x.get('date', ''))
                
                # ostatnie pozycje dla każdego kierowcy (pozycja koncowa)
                latest_positions = {}
                for record in data_sorted:
                    driver_number = record.get('driver_number')
                    if driver_number:
                        latest_positions[driver_number] = record
                
                date, time = format_datetime_fields(data_sorted[-1]['date'])
                # wszystkie końcowe pozycje do wyników
                for driver_number, record in latest_positions.items():
                    result = {
                        'session_key': session_key,
                        'driver_number': driver_number,
                        'position': record.get('position'),
                        'date': date,
                        'time': time,
                        'meeting_key': record.get('meeting_key')
                    }
                    all_results.append(result)
                
                print(f"Session {session_key}: Pobrano {len(latest_positions)} końcowych pozycji")
            else:
                print(f"Session {session_key}: Brak danych pozycji")
        else:
            print(f"Error fetching data for session {session_key}: {response.status_code}")
        
        sleep(0.5)
    
    results_df = pd.DataFrame(all_results)
    
    if not results_df.empty and include_driver_details:
        # merge z danymi o sesjach
        results_df = results_df.merge(
            all_sessions_df[['session_key', 'session_name', 'year', 'location', 'country_name', 'circuit_short_name']], 
            on='session_key', 
            how='left'
        )
        
        # merge z danymi o kierowcach używając session_key
        results_df = results_df.merge(
            drivers_df[['driver_number', 'session_key', 'broadcast_name', 'full_name', 'team_name']].drop_duplicates(), 
            on=['driver_number', 'session_key'], 
            how='left'
        )
    
    return results_df

In [28]:
results_df = get_final_session_results(sessions_key_unique, include_driver_details=True)
results_df.head()

Session 9222: Pobrano 19 końcowych pozycji
Session 7763: Pobrano 17 końcowych pozycji
Session 7764: Pobrano 17 końcowych pozycji
Session 7765: Pobrano 20 końcowych pozycji
Session 7766: Pobrano 20 końcowych pozycji
Session 7767: Pobrano 20 końcowych pozycji
Session 7768: Pobrano 20 końcowych pozycji
Session 7953: Pobrano 20 końcowych pozycji
Session 7772: Pobrano 20 końcowych pozycji
Session 7773: Pobrano 20 końcowych pozycji
Session 7774: Pobrano 20 końcowych pozycji
Session 7775: Pobrano 20 końcowych pozycji
Session 7779: Pobrano 20 końcowych pozycji
Session 7780: Pobrano 20 końcowych pozycji
Session 7781: Pobrano 20 końcowych pozycji
Session 7782: Pobrano 20 końcowych pozycji
Session 7783: Pobrano 20 końcowych pozycji
Session 7787: Pobrano 20 końcowych pozycji
Session 9063: Pobrano 20 końcowych pozycji
Session 9064: Pobrano 20 końcowych pozycji
Session 9278: Pobrano 20 końcowych pozycji
Session 9069: Pobrano 19 końcowych pozycji
Session 9070: Pobrano 20 końcowych pozycji
Session 907

Unnamed: 0,session_key,driver_number,position,date,time,meeting_key,session_name,year,location,country_name,circuit_short_name,broadcast_name,full_name,team_name
0,9222,1,1,2023-02-23,16:13:05.619000,1140,Practice 1,2023,Sakhir,Bahrain,Sakhir,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
1,9222,2,10,2023-02-23,16:13:05.619000,1140,Practice 1,2023,Sakhir,Bahrain,Sakhir,L SARGEANT,Logan SARGEANT,Williams
2,9222,4,5,2023-02-23,16:13:05.619000,1140,Practice 1,2023,Sakhir,Bahrain,Sakhir,L NORRIS,Lando NORRIS,McLaren
3,9222,10,16,2023-02-23,16:13:05.619000,1140,Practice 1,2023,Sakhir,Bahrain,Sakhir,P GASLY,Pierre GASLY,Alpine
4,9222,14,2,2023-02-23,16:13:05.619000,1140,Practice 1,2023,Sakhir,Bahrain,Sakhir,F ALONSO,Fernando ALONSO,Aston Martin


In [29]:
# posortuj wyniki według sesji i pozycji
def sort_results_by_session_and_position(results_df):
    return results_df.sort_values(by=['session_key', 'position'])

sorted_results_df = sort_results_by_session_and_position(results_df)
sorted_results_df.head()

Unnamed: 0,session_key,driver_number,position,date,time,meeting_key,session_name,year,location,country_name,circuit_short_name,broadcast_name,full_name,team_name
29,7763,24,1,2023-02-24,16:00:22.519000,1140,Practice 2,2023,Sakhir,Bahrain,Sakhir,G ZHOU,ZHOU Guanyu,Alfa Romeo
19,7763,1,2,2023-02-24,16:00:22.519000,1140,Practice 2,2023,Sakhir,Bahrain,Sakhir,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
24,7763,14,3,2023-02-24,16:00:22.519000,1140,Practice 2,2023,Sakhir,Bahrain,Sakhir,F ALONSO,Fernando ALONSO,Aston Martin
27,7763,21,4,2023-02-24,16:00:22.519000,1140,Practice 2,2023,Sakhir,Bahrain,Sakhir,N DE VRIES,Nyck DE VRIES,AlphaTauri
30,7763,27,5,2023-02-24,16:00:22.519000,1140,Practice 2,2023,Sakhir,Bahrain,Sakhir,N HULKENBERG,Nico HULKENBERG,Haas F1 Team


In [30]:
# analiza końcowych wyników sesji
def get_session_type_results(session_type, max_sessions=None):
    
    sessions = all_sessions_df[all_sessions_df['session_name'] == session_type]
    
    if max_sessions:
        sessions = sessions.head(max_sessions)
    
    session_keys = sessions['session_key'].tolist()
    
    print(f"Pobieranie wyników dla {len(session_keys)} sesji typu '{session_type}'")
    
    return get_final_session_results(session_keys)


In [31]:
# pobranie rankingu dla wszystkich dostępnych sesji każdego typu

def get_all_rankings_by_session_type():

    session_types = all_sessions_df['session_name'].unique()
    all_rankings = {}
    
    
    for session_type in session_types:
        print(f"Przetwarzanie: {session_type}")
        
        sessions_of_type = all_sessions_df[all_sessions_df['session_name'] == session_type]
        session_keys = sessions_of_type['session_key'].tolist()
        print(f"   Znaleziono {len(session_keys)} sesji typu '{session_type}'")
        
        rankings = get_final_session_results(session_keys, include_driver_details=True)
        
        if not rankings.empty:
            rankings_sorted = rankings.sort_values(['session_key', 'position'])
            all_rankings[session_type] = rankings_sorted
            
            print(f"Pobrano {len(rankings)} wyników z {rankings['session_key'].nunique()} sesji")
            print(f"Lata: {rankings['year'].min()}-{rankings['year'].max()}")
            print(f"Pozycje: {rankings['position'].min()}-{rankings['position'].max()}")
        else:
            print(f"Brak wyników dla {session_type}")
        
        print()
    
    return all_rankings

In [32]:
# zwraca ramkę danych dla każdego meeting ze wszystkimi sesjami i końcowymi pozycjami kierowców
def get_meeting_all_sessions_positions():

    all_session_keys = all_sessions_df['session_key'].tolist()
    
    all_results = get_final_session_results(all_session_keys, include_driver_details=True)
    
    meeting_sessions = all_meetings_df.merge(
        all_sessions_df[['meeting_key', 'session_key', 'session_name', 'session_type']], 
        on='meeting_key', 
        how='inner'
    )
    
    # merge wynikami wszystkich sesji
    final_data = meeting_sessions.merge(
        all_results[['session_key', 'driver_number', 'position', 'broadcast_name', 'full_name', 'team_name']], 
        on='session_key', 
        how='inner'  # inner join żeby mieć tylko te z pozycjami
    )
    
    return final_data

In [33]:
meeting_all_sessions_positions = get_meeting_all_sessions_positions()
meeting_all_sessions_positions.head()

Session 9222: Pobrano 19 końcowych pozycji
Session 7763: Pobrano 17 końcowych pozycji
Session 7764: Pobrano 17 końcowych pozycji
Session 7765: Pobrano 20 końcowych pozycji
Session 7766: Pobrano 20 końcowych pozycji
Session 7767: Pobrano 20 końcowych pozycji
Session 7768: Pobrano 20 końcowych pozycji
Session 7953: Pobrano 20 końcowych pozycji
Session 7772: Pobrano 20 końcowych pozycji
Session 7773: Pobrano 20 końcowych pozycji
Session 7774: Pobrano 20 końcowych pozycji
Session 7775: Pobrano 20 końcowych pozycji
Session 7779: Pobrano 20 końcowych pozycji
Session 7780: Pobrano 20 końcowych pozycji
Session 7781: Pobrano 20 końcowych pozycji
Session 7782: Pobrano 20 końcowych pozycji
Session 7783: Pobrano 20 końcowych pozycji
Session 7787: Pobrano 20 końcowych pozycji
Session 9063: Pobrano 20 końcowych pozycji
Session 9064: Pobrano 20 końcowych pozycji
Session 9278: Pobrano 20 końcowych pozycji
Session 9069: Pobrano 19 końcowych pozycji
Session 9070: Pobrano 20 końcowych pozycji
Session 907

Unnamed: 0,circuit_key,circuit_short_name,country_code,country_key,country_name,date_start,time_start,gmt_offset,location,meeting_key,...,meeting_official_name,year,session_key,session_name,session_type,driver_number,position,broadcast_name,full_name,team_name
0,63,Sakhir,BRN,36,Bahrain,2023-02-23,07:00:00,03:00:00,Sakhir,1140,...,FORMULA 1 ARAMCO PRE-SEASON TESTING 2023,2023,9222,Practice 1,Practice,1,1,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
1,63,Sakhir,BRN,36,Bahrain,2023-02-23,07:00:00,03:00:00,Sakhir,1140,...,FORMULA 1 ARAMCO PRE-SEASON TESTING 2023,2023,9222,Practice 1,Practice,2,10,L SARGEANT,Logan SARGEANT,Williams
2,63,Sakhir,BRN,36,Bahrain,2023-02-23,07:00:00,03:00:00,Sakhir,1140,...,FORMULA 1 ARAMCO PRE-SEASON TESTING 2023,2023,9222,Practice 1,Practice,4,5,L NORRIS,Lando NORRIS,McLaren
3,63,Sakhir,BRN,36,Bahrain,2023-02-23,07:00:00,03:00:00,Sakhir,1140,...,FORMULA 1 ARAMCO PRE-SEASON TESTING 2023,2023,9222,Practice 1,Practice,10,16,P GASLY,Pierre GASLY,Alpine
4,63,Sakhir,BRN,36,Bahrain,2023-02-23,07:00:00,03:00:00,Sakhir,1140,...,FORMULA 1 ARAMCO PRE-SEASON TESTING 2023,2023,9222,Practice 1,Practice,14,2,F ALONSO,Fernando ALONSO,Aston Martin


In [34]:
# pobierz początkową pozycję dla każdego z kierowców w meetings dla każdej sesji
def get_starting_session_position(session_keys, include_driver_details=True):
    all_results = []
    
    for session_key in session_keys:
        url = f"https://api.openf1.org/v1/position?session_key={session_key}"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            if isinstance(data, list) and len(data) > 0:
                # najstarsze na początku
                data_sorted = sorted(data, key=lambda x: x.get('date', ''))
                
                # pierwsze pozycje dla każdego kierowcy
                first_positions = {}
                for record in data_sorted:
                    driver_number = record.get('driver_number')
                    if driver_number and driver_number not in first_positions:
                        first_positions[driver_number] = record
                
                date, time = format_datetime_fields(data_sorted[0]['date'])
                for driver_number, record in first_positions.items():
                    result = {
                        'session_key': session_key,
                        'driver_number': driver_number,
                        'position': record.get('position'),
                        'date': date,
                        'time': time,
                        'meeting_key': record.get('meeting_key')
                    }
                    all_results.append(result)
                
                print(f"Session {session_key}: Pobrano {len(first_positions)} początkowych pozycji")
            else:
                print(f"Session {session_key}: Brak danych pozycji")
        else:
            print(f"Error fetching data for session {session_key}: {response.status_code}")
        
        sleep(0.5)
    
    results_df = pd.DataFrame(all_results)
    
    if not results_df.empty and include_driver_details:
        # merge z danymi o sesjach
        results_df = results_df.merge(
            all_sessions_df[['session_key', 'session_name', 'year', 'location', 'country_name']], 
            on='session_key', 
            how='left'
        )

        # merge z danymi o kierowcach używając session_key
        results_df = results_df.merge(
            drivers_df[['driver_number', 'session_key', 'broadcast_name', 'full_name', 'team_name']].drop_duplicates(), 
            on=['driver_number', 'session_key'], 
            how='left'
        )
    
    return results_df


In [35]:
starting_position_df = get_starting_session_position(sessions_key_unique, include_driver_details=True)
starting_position_df.head()


Session 9222: Pobrano 19 początkowych pozycji
Session 7763: Pobrano 17 początkowych pozycji
Session 7764: Pobrano 17 początkowych pozycji
Session 7765: Pobrano 20 początkowych pozycji
Session 7766: Pobrano 20 początkowych pozycji
Session 7767: Pobrano 20 początkowych pozycji
Session 7768: Pobrano 20 początkowych pozycji
Session 7953: Pobrano 20 początkowych pozycji
Session 7772: Pobrano 20 początkowych pozycji
Session 7773: Pobrano 20 początkowych pozycji
Session 7774: Pobrano 20 początkowych pozycji
Session 7775: Pobrano 20 początkowych pozycji
Session 7779: Pobrano 20 początkowych pozycji
Session 7780: Pobrano 20 początkowych pozycji
Session 7781: Pobrano 20 początkowych pozycji
Session 7782: Pobrano 20 początkowych pozycji
Session 7783: Pobrano 20 początkowych pozycji
Session 7787: Pobrano 20 początkowych pozycji
Session 9063: Pobrano 20 początkowych pozycji
Session 9064: Pobrano 20 początkowych pozycji
Session 9278: Pobrano 20 początkowych pozycji
Session 9069: Pobrano 19 początkow

Unnamed: 0,session_key,driver_number,position,date,time,meeting_key,session_name,year,location,country_name,broadcast_name,full_name,team_name
0,9222,1,1,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
1,9222,2,2,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,L SARGEANT,Logan SARGEANT,Williams
2,9222,4,3,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,L NORRIS,Lando NORRIS,McLaren
3,9222,10,4,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,P GASLY,Pierre GASLY,Alpine
4,9222,14,5,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,F ALONSO,Fernando ALONSO,Aston Martin


In [36]:
# dla kazdego meeting w session pobierz pit stop dla kazdego kierowcy gdzie pit duration jest rozny od null
def get_pit_stops_for_sessions(session_key):
    pit_stops = []
    
    for session_key in session_key:
        url = f"https://api.openf1.org/v1/pit?session_key={session_key}&pit_duration>0"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            pit_date, pit_time = format_datetime_fields(data[0]['date']) if data else (None, None)

            if isinstance(data, list) and len(data) > 0:
                for pit in data:
                    pit_stops.append({
                        'meeting_key': pit.get('meeting_key'),
                        'session_key': session_key,
                        'driver_number': pit.get('driver_number'),
                        'lap': pit.get('lap'),
                        'pit_duration': pit.get('pit_duration'),
                        'date': pit_date,
                        'time': pit_time,
                    })
                print(f"Fetched {len(data)} pit stops for session {session_key}")
            else:
                print(f"No pit stops found for session {session_key}")
        else:
            print(f"Error fetching data for session {session_key}: {response.status_code}")
        
        sleep(0.5)
    
    return pd.DataFrame(pit_stops)


In [37]:
pit_stops_df = get_pit_stops_for_sessions(sessions_key_unique)
pit_stops_df.head()

No pit stops found for session 9222
No pit stops found for session 7763
No pit stops found for session 7764
No pit stops found for session 7765
No pit stops found for session 7766
No pit stops found for session 7767
No pit stops found for session 7768
No pit stops found for session 7953
No pit stops found for session 7772
No pit stops found for session 7773
No pit stops found for session 7774
No pit stops found for session 7775
No pit stops found for session 7779
No pit stops found for session 7780
No pit stops found for session 7781
No pit stops found for session 7782
No pit stops found for session 7783
No pit stops found for session 7787
No pit stops found for session 9063
No pit stops found for session 9064
No pit stops found for session 9278
No pit stops found for session 9069
No pit stops found for session 9070
No pit stops found for session 9071
No pit stops found for session 9072
No pit stops found for session 9073
No pit stops found for session 9074
No pit stops found for sessi

Unnamed: 0,meeting_key,session_key,driver_number,lap,pit_duration,date,time
0,1211,9095,14,,35.3,2023-06-02,11:33:46.930000
1,1211,9095,18,,40.8,2023-06-02,11:33:46.930000
2,1211,9095,21,,155.2,2023-06-02,11:33:46.930000
3,1211,9095,2,,277.0,2023-06-02,11:33:46.930000
4,1211,9095,16,,27.6,2023-06-02,11:33:46.930000


In [38]:
# agregacja pit stops - liczba pit stopów i średnia długość pit stopu dla każdego kierowcy w każdej sesji
pit_stops_agg = (
    pit_stops_df
    .groupby(['session_key', 'driver_number'])
    .agg(
        pit_stop_count=('pit_duration', 'count'),
        avg_pit_duration=('pit_duration', 'mean')
    )
    .reset_index()
)

In [39]:
pit_stops_agg.head()

Unnamed: 0,session_key,driver_number,pit_stop_count,avg_pit_duration
0,9095,1,4,239.7
1,9095,2,4,278.725
2,9095,4,4,381.075
3,9095,10,3,452.4
4,9095,11,4,226.825


In [40]:
# https://api.openf1.org/v1/race_control
# get all race control for all session in meetings

def get_race_control(session_keys):
    """
    Pobiera wszystkie dane race control dla określonych sesji
    """
    all_race_control = []
    
    for session_key in session_keys:
        url = f"https://api.openf1.org/v1/race_control?session_key={session_key}"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            if isinstance(data, list) and len(data) > 0:
                for control in data:
                    date, time = format_datetime_fields(control['date'])
                    
                    race_control_record = {
                        'session_key': session_key,
                        'date': date,
                        'time': time,
                        'category': control.get('category'),
                        'flag': control.get('flag'),
                        'lap_number': control.get('lap_number'),
                        'message': control.get('message')
                    }
                    all_race_control.append(race_control_record)
                
                print(f"Session {session_key}: Pobrano {len(data)} wiadomości race control")
            else:
                print(f"Session {session_key}: Brak danych race control")
        else:
            print(f"Error fetching race control for session {session_key}: {response.status_code}")
        
        sleep(0.5)
    
    return pd.DataFrame(all_race_control)

In [41]:
race_controlls_df = get_race_control(sessions_key_unique)
race_controlls_df.head()

Session 9222: Pobrano 82 wiadomości race control
Session 7763: Pobrano 36 wiadomości race control
Session 7764: Pobrano 46 wiadomości race control
Session 7765: Pobrano 17 wiadomości race control
Session 7766: Pobrano 15 wiadomości race control
Session 7767: Pobrano 5 wiadomości race control
Session 7768: Pobrano 21 wiadomości race control
Session 7953: Pobrano 73 wiadomości race control
Session 7772: Pobrano 25 wiadomości race control
Session 7773: Pobrano 20 wiadomości race control
Session 7774: Pobrano 16 wiadomości race control
Session 7775: Pobrano 42 wiadomości race control
Session 7779: Pobrano 56 wiadomości race control
Session 7780: Pobrano 37 wiadomości race control
Session 7781: Pobrano 66 wiadomości race control
Session 7782: Pobrano 45 wiadomości race control
Session 7783: Pobrano 64 wiadomości race control
Session 7787: Pobrano 103 wiadomości race control
Session 9063: Pobrano 82 wiadomości race control
Session 9064: Pobrano 61 wiadomości race control
Session 9278: Pobran

Unnamed: 0,session_key,date,time,category,flag,lap_number,message
0,9222,2023-02-23,07:00:00,Flag,GREEN,,GREEN LIGHT - PIT EXIT OPEN
1,9222,2023-02-23,07:08:43,Flag,RED,,RED FLAG
2,9222,2023-02-23,07:25:11,Other,,,SESSION WILL RESUME AT 10:30
3,9222,2023-02-23,07:25:35,Flag,CLEAR,,TRACK CLEAR
4,9222,2023-02-23,07:25:39,Drs,,,DRS ENABLED


In [None]:
# # pobierz rodzaje opon 
# # https://api.openf1.org/v1/stints?session_key={session_key}
# #     "compound": "SOFT",
# #     "driver_number": 16,
# #     "lap_end": 20,
# #     "lap_start": 1,
# #     "meeting_key": 1219,
# #     "session_key": 9165,
# #     "stint_number": 1,
# #     "tyre_age_at_start": 3

# def get_tyre_stints(session_keys):
#     all_tyres = []
    
#     for session_key in session_keys:
#         url = f"https://api.openf1.org/v1/stints?session_key={session_key}"
#         response = requests.get(url)
        
#         if response.status_code == 200:
#             data = response.json()
            
#             if isinstance(data, list) and len(data) > 0:
#                 for stint in data:
#                     tyre_record = {
#                         'session_key': session_key,
#                         'driver_number': stint.get('driver_number'),
#                         'stint_number': stint.get('stint_number'),
#                         'compound': stint.get('compound'),
#                         'lap_start': stint.get('lap_start'),
#                         'lap_end': stint.get('lap_end'),
#                         'tyre_age_at_start': stint.get('tyre_age_at_start'),
#                         'meeting_key': stint.get('meeting_key')
#                     }
#                     all_tyres.append(tyre_record)
                
#                 print(f"Session {session_key}: Pobrano {len(data)} stints")
#             else:
#                 print(f"Session {session_key}: Brak danych o oponach")
#         else:
#             print(f"Error fetching tyre stints for session {session_key}: {response.status_code}")
        
#         sleep(0.5)
    
#     return pd.DataFrame(all_tyres)

In [None]:
# stints_tyre = get_tyre_stints(sessions_key_unique)
# stints_tyre.head()

Session 9222: Pobrano 267 stints
Session 7763: Pobrano 234 stints
Session 7764: Pobrano 271 stints
Session 7765: Pobrano 80 stints
Session 7766: Pobrano 81 stints
Session 7767: Pobrano 81 stints
Session 7768: Pobrano 93 stints
Session 7953: Pobrano 70 stints
Session 7772: Pobrano 79 stints
Session 7773: Pobrano 72 stints
Session 7774: Pobrano 71 stints
Session 7775: Pobrano 80 stints
Session 7779: Pobrano 44 stints
Session 7780: Pobrano 73 stints
Session 7781: Pobrano 68 stints
Session 7782: Pobrano 79 stints
Session 7783: Pobrano 82 stints
Session 7787: Pobrano 85 stints
Session 9063: Pobrano 57 stints
Session 9064: Pobrano 101 stints
Session 9278: Pobrano 43 stints
Session 9069: Pobrano 23 stints
Session 9070: Pobrano 43 stints
Session 9071: Pobrano 70 stints
Session 9072: Pobrano 95 stints
Session 9073: Pobrano 90 stints
Session 9074: Pobrano 81 stints
Session 9078: Pobrano 40 stints
Session 9087: Pobrano 68 stints
Session 9088: Pobrano 79 stints
Session 9089: Pobrano 63 stints
Sess

Unnamed: 0,session_key,driver_number,stint_number,compound,lap_start,lap_end,tyre_age_at_start,meeting_key
0,9222,63,1,MEDIUM,1.0,6.0,0.0,1140
1,9222,24,1,MEDIUM,1.0,3.0,0.0,1140
2,9222,23,1,SOFT,1.0,2.0,0.0,1140
3,9222,55,1,MEDIUM,1.0,4.0,0.0,1140
4,9222,22,1,MEDIUM,1.0,3.0,0.0,1140


In [44]:
# pobierz dane pogodowe dla każdej sesji
# https://api.openf1.org/v1/weather
    # "air_temperature": 27.8,
    # "date": "2023-05-07T18:42:25.233000+00:00",
    # "humidity": 58,
    # "meeting_key": 1208,
    # "pressure": 1018.7,
    # "rainfall": 0,
    # "session_key": 9078,
    # "track_temperature": 52.5,
    # "wind_direction": 136,
    # "wind_speed": 2.4

def get_weather_data(session_keys):
    all_weather = []
    
    for session_key in session_keys:
        url = f"https://api.openf1.org/v1/weather?session_key={session_key}"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            
            if isinstance(data, list) and len(data) > 0:
                for weather in data:
                    date_w, time_w = format_datetime_fields(weather['date'])
                    weather_record = {
                        'session_key': session_key,
                        'meeting_key': weather.get('meeting_key'),
                        'date': date_w,
                        'time': time_w,
                        'air_temperature': weather.get('air_temperature'),
                        'track_temperature': weather.get('track_temperature'),
                        'humidity': weather.get('humidity'),
                        'pressure': weather.get('pressure'),
                        'rainfall': weather.get('rainfall'),
                        'wind_direction': weather.get('wind_direction'),
                        'wind_speed': weather.get('wind_speed')
                    }
                    all_weather.append(weather_record)
                
                print(f"Session {session_key}: Pobrano {len(data)} rekordów pogody")
            else:
                print(f"Session {session_key}: Brak danych o pogodzie")
        else:
            print(f"Error fetching weather data for session {session_key}: {response.status_code}")
        
        sleep(0.5)
    
    return pd.DataFrame(all_weather)

In [45]:
weather_session_df = get_weather_data(sessions_key_unique)
weather_session_df.head()

Session 9222: Pobrano 590 rekordów pogody
Session 7763: Pobrano 590 rekordów pogody
Session 7764: Pobrano 581 rekordów pogody
Session 7765: Pobrano 84 rekordów pogody
Session 7766: Pobrano 85 rekordów pogody
Session 7767: Pobrano 86 rekordów pogody
Session 7768: Pobrano 95 rekordów pogody
Session 7953: Pobrano 161 rekordów pogody
Session 7772: Pobrano 82 rekordów pogody
Session 7773: Pobrano 83 rekordów pogody
Session 7774: Pobrano 82 rekordów pogody
Session 7775: Pobrano 80 rekordów pogody
Session 7779: Pobrano 148 rekordów pogody
Session 7780: Pobrano 62 rekordów pogody
Session 7781: Pobrano 82 rekordów pogody
Session 7782: Pobrano 84 rekordów pogody
Session 7783: Pobrano 95 rekordów pogody
Session 7787: Pobrano 222 rekordów pogody
Session 9063: Pobrano 83 rekordów pogody
Session 9064: Pobrano 106 rekordów pogody
Session 9278: Pobrano 68 rekordów pogody
Session 9069: Pobrano 82 rekordów pogody
Session 9070: Pobrano 160 rekordów pogody
Session 9071: Pobrano 81 rekordów pogody
Session 

Unnamed: 0,session_key,meeting_key,date,time,air_temperature,track_temperature,humidity,pressure,rainfall,wind_direction,wind_speed
0,9222,1140,2023-02-23,06:54:55.756000,23.8,29.7,26.0,1012.2,0,193,4.9
1,9222,1140,2023-02-23,06:55:55.755000,23.8,29.7,26.0,1012.2,0,193,4.9
2,9222,1140,2023-02-23,06:56:55.754000,23.8,30.0,27.0,1012.2,0,193,4.3
3,9222,1140,2023-02-23,06:57:55.754000,23.8,30.1,27.0,1012.2,0,186,5.5
4,9222,1140,2023-02-23,06:58:55.753000,23.8,30.1,27.0,1012.2,0,182,4.4


In [46]:
# grupowanie danych pogodowych według sesji i obliczenie statystyk

def summarize_weather(group):
    return pd.Series({
        'air_temp_mean': group['air_temperature'].mean(),
        'air_temp_min': group['air_temperature'].min(),
        'air_temp_max': group['air_temperature'].max(),
        'track_temp_mean': group['track_temperature'].mean(),
        'humidity_mean': group['humidity'].mean(),
        'rainfall_total': group['rainfall'].sum(),
        'rainfall_max': group['rainfall'].max(),
        'wind_speed_mean': group['wind_speed'].mean(),
        'pressure_mean': group['pressure'].mean(),
        'weather_measurements': len(group)
    })


In [47]:
gruouped_weather = weather_session_df.groupby('session_key')
gruouped_weather_summary = gruouped_weather.apply(summarize_weather).reset_index()

gruouped_weather_summary.head()

  gruouped_weather_summary = gruouped_weather.apply(summarize_weather).reset_index()


Unnamed: 0,session_key,air_temp_mean,air_temp_min,air_temp_max,track_temp_mean,humidity_mean,rainfall_total,rainfall_max,wind_speed_mean,pressure_mean,weather_measurements
0,7763,26.835254,22.9,31.5,36.444915,26.076271,0.0,0.0,2.091695,1009.909661,590.0
1,7764,26.641997,24.0,29.2,36.384682,42.820998,0.0,0.0,1.154217,1013.120654,581.0
2,7765,26.989286,26.3,27.6,40.482143,12.261905,0.0,0.0,2.579762,1017.314286,84.0
3,7766,22.872941,22.1,23.6,26.972941,23.317647,0.0,0.0,1.082353,1018.512941,85.0
4,7767,28.0,27.6,28.5,40.57093,19.895349,0.0,0.0,1.777907,1017.043023,86.0


In [48]:
# z sessions tylko sesje typu Race
all_race_df = all_sessions_df[all_sessions_df['session_type'] == 'Race']
all_race_df.head()



Unnamed: 0,meeting_key,year,session_key,session_type,session_name,date_start,time_start,date_end,time_end,gmt_offset,location,country_name,country_code,circuit_short_name
7,1141,2023,7953,Race,Race,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
12,1142,2023,7779,Race,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,Jeddah,Saudi Arabia,KSA,Jeddah
17,1143,2023,7787,Race,Race,2023-04-02,05:00:00,2023-04-02,07:00:00,10:00:00,Melbourne,Australia,AUS,Melbourne
21,1207,2023,9069,Race,Sprint,2023-04-29,13:30:00,2023-04-29,14:00:00,04:00:00,Baku,Azerbaijan,AZE,Baku
22,1207,2023,9070,Race,Race,2023-04-30,11:00:00,2023-04-30,13:00:00,04:00:00,Baku,Azerbaijan,AZE,Baku


In [49]:
# z sessions tylko sesje typu Qualifying
all_qualifying_df = all_sessions_df[all_sessions_df['session_name'] == 'Qualifying']
all_qualifying_df.head()

Unnamed: 0,meeting_key,year,session_key,session_type,session_name,date_start,time_start,date_end,time_end,gmt_offset,location,country_name,country_code,circuit_short_name
6,1141,2023,7768,Qualifying,Qualifying,2023-03-04,15:00:00,2023-03-04,16:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
11,1142,2023,7775,Qualifying,Qualifying,2023-03-18,17:00:00,2023-03-18,18:00:00,03:00:00,Jeddah,Saudi Arabia,KSA,Jeddah
16,1143,2023,7783,Qualifying,Qualifying,2023-04-01,05:00:00,2023-04-01,06:00:00,11:00:00,Melbourne,Australia,AUS,Melbourne
19,1207,2023,9064,Qualifying,Qualifying,2023-04-28,13:00:00,2023-04-28,14:00:00,04:00:00,Baku,Azerbaijan,AZE,Baku
26,1208,2023,9074,Qualifying,Qualifying,2023-05-06,20:00:00,2023-05-06,21:00:00,-04:00:00,Miami,United States,USA,Miami


In [50]:
# polaczenie wyscigow i kierowcow
df_driver = drivers_df
df_gp = all_meetings_df
df_races = all_race_df

races_session_keys = df_races[['session_key']].drop_duplicates()
df_driver_race = df_driver.merge(races_session_keys, on='session_key', how='inner')

print(df_driver_race['session_key'].unique())
print(df_races['session_key'].unique())

# polaczenie driver z race
df_merged = df_driver_race.merge(df_races, on=['meeting_key', 'session_key'], how='left')
df_merged.head()

[ 7953  7779  7787  9069  9070  9078  9094  9102  9110  9117  9118  9126
  9133  9140  9141  9149  9157  9165  9173  9220  9221  9212  9213  9181
  9204  9205  9189  9197  9472  9480  9488  9496  9672  9673  9506  9507
  9515  9523  9531  9539  9549  9550  9558  9566  9574  9582  9590  9598
  9606  9616  9617  9625  9635  9636  9644  9654  9655  9662  9693  9993
  9998 10006 10014 10022 10028 10033  9987  9979  9971  9963  9955  9947
  9934  9939  9928  9920  9912]
[ 7953  7779  7787  9069  9070  9078  9094  9102  9110  9117  9118  9126
  9133  9140  9141  9149  9157  9165  9173  9220  9221  9212  9213  9181
  9204  9205  9189  9197  9472  9480  9488  9496  9672  9673  9506  9507
  9515  9523  9531  9539  9549  9550  9558  9566  9574  9582  9590  9598
  9606  9616  9617  9625  9635  9636  9644  9654  9655  9662  9693  9993
  9998 10006 10014 10022 10028 10033  9987  9979  9971  9963  9955  9947
  9934  9939  9928  9920  9912]


Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,session_name,date_start,time_start,date_end,time_end,gmt_offset,location,country_name,country_code_y,circuit_short_name
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1141,7953,...,Race,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1141,7953,...,Race,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1141,7953,...,Race,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1141,7953,...,Race,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1141,7953,...,Race,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir


In [51]:
results_qualifying = get_final_session_results(all_qualifying_df['session_key'].unique(), include_driver_details=True)
results_qualifying.head()

Session 7768: Pobrano 20 końcowych pozycji
Session 7775: Pobrano 20 końcowych pozycji
Session 7783: Pobrano 20 końcowych pozycji
Session 9064: Pobrano 20 końcowych pozycji
Session 9074: Pobrano 20 końcowych pozycji
Session 9090: Pobrano 20 końcowych pozycji
Session 9098: Pobrano 20 końcowych pozycji
Session 9106: Pobrano 20 końcowych pozycji
Session 9112: Pobrano 20 końcowych pozycji
Session 9122: Pobrano 20 końcowych pozycji
Session 9129: Pobrano 20 końcowych pozycji
Session 9135: Pobrano 20 końcowych pozycji
Session 9145: Pobrano 20 końcowych pozycji
Session 9153: Pobrano 20 końcowych pozycji
Session 9161: Pobrano 20 końcowych pozycji
Session 9169: Pobrano 20 końcowych pozycji
Session 9215: Pobrano 20 końcowych pozycji
Session 9207: Pobrano 20 końcowych pozycji
Session 9177: Pobrano 20 końcowych pozycji
Session 9304: Pobrano 20 końcowych pozycji
Session 9314: Pobrano 20 końcowych pozycji
Session 9193: Pobrano 20 końcowych pozycji
Session 9468: Pobrano 20 końcowych pozycji
Session 947

Unnamed: 0,session_key,driver_number,position,date,time,meeting_key,session_name,year,location,country_name,circuit_short_name,broadcast_name,full_name,team_name
0,7768,14,5,2023-03-04,16:09:11.002000,1141,Qualifying,2023,Sakhir,Bahrain,Sakhir,F ALONSO,Fernando ALONSO,Aston Martin
1,7768,1,1,2023-03-04,16:09:11.002000,1141,Qualifying,2023,Sakhir,Bahrain,Sakhir,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
2,7768,11,2,2023-03-04,16:09:11.002000,1141,Qualifying,2023,Sakhir,Bahrain,Sakhir,S PEREZ,Sergio PEREZ,Red Bull Racing
3,7768,44,7,2023-03-04,16:09:11.002000,1141,Qualifying,2023,Sakhir,Bahrain,Sakhir,L HAMILTON,Lewis HAMILTON,Mercedes
4,7768,16,3,2023-03-04,16:09:11.002000,1141,Qualifying,2023,Sakhir,Bahrain,Sakhir,C LECLERC,Charles LECLERC,Ferrari


In [52]:
# dodanie do df_merged wyników kwalifikacji z zachowaniem wszytskich kolumn df_merged a dodanie tylko quali_position
df_final = []

df_final = df_merged.merge(
    results_qualifying[['meeting_key', 'driver_number', 'position']],
    on=['meeting_key', 'driver_number'],
    how='left'
).rename(columns={'position': 'qualifying_position'})

df_final.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,date_start,time_start,date_end,time_end,gmt_offset,location,country_name,country_code_y,circuit_short_name,qualifying_position
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1141,7953,...,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,1
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1141,7953,...,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,16
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1141,7953,...,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,11
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1141,7953,...,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,20
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1141,7953,...,2023-03-05,15:00:00,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,2


In [53]:
# dodanie do df_final wyników wyścigu z zachowaniem wszytskich kolumn df_final
df_race_results = results_df[results_df['session_name'] == 'Race']
df_race_results.head()

Unnamed: 0,session_key,driver_number,position,date,time,meeting_key,session_name,year,location,country_name,circuit_short_name,broadcast_name,full_name,team_name
133,7953,1,1,2023-03-05,16:34:46.464000,1141,Race,2023,Sakhir,Bahrain,Sakhir,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
134,7953,11,2,2023-03-05,16:34:46.464000,1141,Race,2023,Sakhir,Bahrain,Sakhir,S PEREZ,Sergio PEREZ,Red Bull Racing
135,7953,16,19,2023-03-05,16:34:46.464000,1141,Race,2023,Sakhir,Bahrain,Sakhir,C LECLERC,Charles LECLERC,Ferrari
136,7953,55,4,2023-03-05,16:34:46.464000,1141,Race,2023,Sakhir,Bahrain,Sakhir,C SAINZ,Carlos SAINZ,Ferrari
137,7953,14,3,2023-03-05,16:34:46.464000,1141,Race,2023,Sakhir,Bahrain,Sakhir,F ALONSO,Fernando ALONSO,Aston Martin


In [54]:
# # doodanie do df_final pozycji startowych z zachowaniem wszytskich kolumn df_final a dod
print(df_final.shape)
df_starting_positions = starting_position_df[starting_position_df['session_name'] == 'Race']
df_starting_positions.head()

(1536, 23)


Unnamed: 0,session_key,driver_number,position,date,time,meeting_key,session_name,year,location,country_name,broadcast_name,full_name,team_name
133,7953,1,1,2023-03-05,14:01:06.787000,1141,Race,2023,Sakhir,Bahrain,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
134,7953,11,2,2023-03-05,14:01:06.787000,1141,Race,2023,Sakhir,Bahrain,S PEREZ,Sergio PEREZ,Red Bull Racing
135,7953,16,3,2023-03-05,14:01:06.787000,1141,Race,2023,Sakhir,Bahrain,C LECLERC,Charles LECLERC,Ferrari
136,7953,55,4,2023-03-05,14:01:06.787000,1141,Race,2023,Sakhir,Bahrain,C SAINZ,Carlos SAINZ,Ferrari
137,7953,14,5,2023-03-05,14:01:06.787000,1141,Race,2023,Sakhir,Bahrain,F ALONSO,Fernando ALONSO,Aston Martin


In [55]:
# Dodanie pozycji startowej z wyścigu i kwalifikacji do df_final
race_starting = starting_position_df[starting_position_df['session_name'] == 'Race'][['meeting_key', 'driver_number', 'position']].rename(columns={'position': 'starting_position'})
qualifying_starting = starting_position_df[starting_position_df['session_name'] == 'Qualifying'][['meeting_key', 'driver_number', 'position']].rename(columns={'position': 'qualifying_position'})
df_final = df_final.merge(race_starting, on=['meeting_key', 'driver_number'], how='left')
df_final = df_final.merge(qualifying_starting, on=['meeting_key', 'driver_number'], how='left')
df_final.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,date_end,time_end,gmt_offset,location,country_name,country_code_y,circuit_short_name,qualifying_position_x,starting_position,qualifying_position_y
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1141,7953,...,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,1,1,2
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1141,7953,...,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,16,16,18
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1141,7953,...,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,11,11,13
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1141,7953,...,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,20,20,10
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1141,7953,...,2023-03-05,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,2,2,3


In [56]:
df_final.shape

(1536, 25)

In [57]:
def remove_duplicate_columns(df, column_name):
    cols = [col for col in df.columns if col == column_name]
    if len(cols) > 1:
        first = df.columns.get_loc(column_name)
        df = df.loc[:, ~df.columns.duplicated()]
    return df

In [58]:
# dodanie do df_final wyników wyścigu z zachowaniem wszytskich kolumn df_final a dodanie tylko final_position
df_final = df_final.merge(
    df_race_results[['meeting_key', 'driver_number', 'position']],
    on=['meeting_key', 'driver_number'],
    how='left'
).rename(columns={'position': 'final_position'})

df_final = remove_duplicate_columns(df_final, 'final_position')
df_final.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,time_end,gmt_offset,location,country_name,country_code_y,circuit_short_name,qualifying_position_x,starting_position,qualifying_position_y,final_position
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1141,7953,...,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,1,1,2,1
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1141,7953,...,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,16,16,18,12
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1141,7953,...,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,11,11,13,17
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1141,7953,...,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,20,20,10,9
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1141,7953,...,17:00:00,03:00:00,Sakhir,Bahrain,BRN,Sakhir,2,2,3,2


In [59]:
# policz zwyciestwa kazdego kierowcy do odbywajacego sie meeting, 
# tak aby policz tylko zwyciestwa przed rozpatrywanym meeting
# jako parametr funkcja przyjmuje tylko ramke ze wszytskimi wynikami sesji

def count_wins_before_meeting(race_results):
    race_sorted = race_results.sort_values('meeting_key').reset_index(drop=True).copy()
    
    wins_count = {}
    wins_before_list = []
    
    for idx, row in race_sorted.iterrows():
        driver = row['driver_number']
        meeting = row['meeting_key']
        
        # aktualna liczba zwycięstw przed tym meetingiem
        wins_before = wins_count.get(driver, 0)
        wins_before_list.append(wins_before)
        
        # jeśli kierowca wygrał ten wyścig, zwiększ liczbę zwycięstw
        if row['final_position'] == 1:
            wins_count[driver] = wins_before + 1
    
    race_sorted['wins_before'] = wins_before_list
    
    return race_sorted


test = count_wins_before_meeting(df_final)

#wyswitl tylko dla driver_number 1
print(test[test['driver_number'] == 1][['meeting_key', 'driver_number', 'final_position', 'wins_before']])

      meeting_key  driver_number  final_position  wins_before
0            1141              1               1            0
36           1142              1               2            1
51           1143              1               1            1
79           1207              1               2            2
81           1207              1               2            2
...           ...            ...             ...          ...
1454         1265              1               4           38
1465         1266              1               9           38
1477         1267              1               2           38
1511         1268              1               1           38
1516         1277              1               5           39

[77 rows x 4 columns]


In [60]:
# merge z df_final
df_final = df_final.merge(
    test[['meeting_key', 'driver_number', 'wins_before']],
    on=['meeting_key', 'driver_number'],
    how='left'
)
df_final.head()


Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,gmt_offset,location,country_name,country_code_y,circuit_short_name,qualifying_position_x,starting_position,qualifying_position_y,final_position,wins_before
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1141,7953,...,03:00:00,Sakhir,Bahrain,BRN,Sakhir,1,1,2,1,0
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1141,7953,...,03:00:00,Sakhir,Bahrain,BRN,Sakhir,16,16,18,12,0
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1141,7953,...,03:00:00,Sakhir,Bahrain,BRN,Sakhir,11,11,13,17,0
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1141,7953,...,03:00:00,Sakhir,Bahrain,BRN,Sakhir,20,20,10,9,0
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1141,7953,...,03:00:00,Sakhir,Bahrain,BRN,Sakhir,2,2,3,2,0


In [61]:
# oblicz liczbe pit stopów dla każdego kierowcy w danym wyścigu i sredni czas trwania pit stopu

def calculate_pit_stops_stats(pit_stops_df):
    pit_stats = (
        pit_stops_df
        .groupby(['session_key', 'driver_number'])
        .agg(
            pit_stop_count=('pit_duration', 'count'),
            avg_pit_duration=('pit_duration', 'mean')
        )
        .reset_index()
    )
    
    return pit_stats

In [62]:
race_sessions_key_unique = df_final['session_key'].unique()
race_sessions_key_unique


array([ 7953,  7779,  7787,  9069,  9070,  9078,  9094,  9102,  9110,
        9117,  9118,  9126,  9133,  9140,  9141,  9149,  9157,  9165,
        9173,  9220,  9221,  9212,  9213,  9181,  9204,  9205,  9189,
        9197,  9472,  9480,  9488,  9496,  9672,  9673,  9506,  9507,
        9515,  9523,  9531,  9539,  9549,  9550,  9558,  9566,  9574,
        9582,  9590,  9598,  9606,  9616,  9617,  9625,  9635,  9636,
        9644,  9654,  9655,  9662,  9693,  9993,  9998, 10006, 10014,
       10022, 10028, 10033,  9987,  9979,  9971,  9963,  9955,  9947,
        9934,  9939,  9928,  9920,  9912])

In [63]:
df_race_pit_stops = pit_stops_df[pit_stops_df['session_key'].isin(race_sessions_key_unique)]


In [64]:
df_race_pit_stops_stats = calculate_pit_stops_stats(df_race_pit_stops)
df_race_pit_stops_stats.head()

Unnamed: 0,session_key,driver_number,pit_stop_count,avg_pit_duration
0,9102,1,2,22.25
1,9102,2,2,23.15
2,9102,4,3,27.5
3,9102,10,2,23.45
4,9102,11,2,21.6


In [65]:
# merge pit stop stats z df_final
df_final = df_final.merge(
    df_race_pit_stops_stats,
    on=['session_key', 'driver_number'],
    how='left'
).rename(columns={
    'pit_stop_count': 'pit_stops_count',
    'avg_pit_duration': 'avg_pit_stop_duration'
})

# sortowanie df_final po meeting_key i driver_number


In [66]:
df_final = df_final.sort_values(by=['session_key', 'driver_number']).reset_index(drop=True)
df_final.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,country_name,country_code_y,circuit_short_name,qualifying_position_x,starting_position,qualifying_position_y,final_position,wins_before,pit_stops_count,avg_pit_stop_duration
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,Saudi Arabia,KSA,Jeddah,15,15,1,2,1,,
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,Saudi Arabia,KSA,Jeddah,20,20,16,16,0,,
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,Saudi Arabia,KSA,Jeddah,19,19,7,17,0,,
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,Saudi Arabia,KSA,Jeddah,10,9,9,9,0,,
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,Saudi Arabia,KSA,Jeddah,1,1,2,1,0,,


In [67]:
# usunięcie zduplikowanych kolumn w df_final
df_final_clean = df_final.loc[:, ~df_final.columns.duplicated()]
print(df_final_clean[df_final_clean['pit_stops_count'] > 0][['meeting_key', 'session_key', 'driver_number', 'pit_stops_count', 'avg_pit_stop_duration']].head(10))

     meeting_key  session_key  driver_number  pit_stops_count  \
177         1211         9102              1              2.0   
178         1211         9102              2              2.0   
179         1211         9102              4              3.0   
180         1211         9102             10              2.0   
181         1211         9102             11              2.0   
182         1211         9102             14              2.0   
183         1211         9102             16              2.0   
184         1211         9102             18              2.0   
185         1211         9102             20              3.0   
186         1211         9102             21              2.0   

     avg_pit_stop_duration  
177              22.250000  
178              23.150000  
179              27.500000  
180              23.450000  
181              21.600000  
182              22.650000  
183              22.350000  
184              22.800000  
185              23.033

In [68]:
df_final.head()
df_copy = df_final.copy()

# zastap wszystkie wartosci NaN w kolumnach pit_stops_count i avg_pit_stop_duration zerami
df_copy['pit_stops_count'] = df_copy['pit_stops_count'].fillna(0).astype(int)
df_copy['avg_pit_stop_duration'] = df_copy['avg_pit_stop_duration'].fillna(0).astype(float)


In [69]:
starting_position_df.head()

Unnamed: 0,session_key,driver_number,position,date,time,meeting_key,session_name,year,location,country_name,broadcast_name,full_name,team_name
0,9222,1,1,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,M VERSTAPPEN,Max VERSTAPPEN,Red Bull Racing
1,9222,2,2,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,L SARGEANT,Logan SARGEANT,Williams
2,9222,4,3,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,L NORRIS,Lando NORRIS,McLaren
3,9222,10,4,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,P GASLY,Pierre GASLY,Alpine
4,9222,14,5,2023-02-23,06:54:49.318000,1140,Practice 1,2023,Sakhir,Bahrain,F ALONSO,Fernando ALONSO,Aston Martin


In [None]:
# save datasets to csv files
df_copy.to_csv('../../datasets/thesis_f1_race_data_all.csv', index=False)
df_sessions = all_sessions_df

df_copy.head()

Unnamed: 0,driver_number,broadcast_name,full_name,first_name,last_name,name_acronym,country_code_x,team_name,meeting_key,session_key,...,country_name,country_code_y,circuit_short_name,qualifying_position_x,starting_position,qualifying_position_y,final_position,wins_before,pit_stops_count,avg_pit_stop_duration
0,1,M VERSTAPPEN,Max VERSTAPPEN,Max,Verstappen,VER,NED,Red Bull Racing,1142,7779,...,Saudi Arabia,KSA,Jeddah,15,15,1,2,1,0,0.0
1,2,L SARGEANT,Logan SARGEANT,Logan,Sargeant,SAR,USA,Williams,1142,7779,...,Saudi Arabia,KSA,Jeddah,20,20,16,16,0,0,0.0
2,4,L NORRIS,Lando NORRIS,Lando,Norris,NOR,GBR,McLaren,1142,7779,...,Saudi Arabia,KSA,Jeddah,19,19,7,17,0,0,0.0
3,10,P GASLY,Pierre GASLY,Pierre,Gasly,GAS,FRA,Alpine,1142,7779,...,Saudi Arabia,KSA,Jeddah,10,9,9,9,0,0,0.0
4,11,S PEREZ,Sergio PEREZ,Sergio,Perez,PER,MEX,Red Bull Racing,1142,7779,...,Saudi Arabia,KSA,Jeddah,1,1,2,1,0,0,0.0


In [71]:
def categorize_position(pos):
    if pos == 1:
        return 'winner'
    elif pos in [2, 3]:
        return 'top3'
    elif 4 <= pos <= 10:
        return 'points'
    else:
        return 'no_points'

In [72]:
# kolumna target
df_final['position_category'] = df_final['final_position'].apply(categorize_position)

In [73]:
df_final.head()
back = df_copy.copy()

In [74]:
df_copy['position_category'] = df_copy['final_position'].apply(categorize_position)

In [75]:
features_to_drop = [ 'full_name', 'first_name', 'last_name',
                    'name_acronym', 'team_name', 'meeting_key', 'session_key', 'location', 'country_name',
                    'country_code_y', 'circuit_short_name', 'final_position', 'position_category','session_type']

#mozna usunac session tyype... bo w modelku niczego to nie zmiana bo wszytskie taka maja 
#ale bedzie potrzeba ogolem 


In [76]:
X = df_copy.drop(columns=features_to_drop)
y = df_copy['position_category']

data = X.copy()
data['position_category'] = y

data.head()

Unnamed: 0,driver_number,broadcast_name,country_code_x,year,session_name,date_start,time_start,date_end,time_end,gmt_offset,qualifying_position_x,starting_position,qualifying_position_y,wins_before,pit_stops_count,avg_pit_stop_duration,position_category
0,1,M VERSTAPPEN,NED,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,15,15,1,1,0,0.0,top3
1,2,L SARGEANT,USA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,20,20,16,0,0,0.0,no_points
2,4,L NORRIS,GBR,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,19,19,7,0,0,0.0,no_points
3,10,P GASLY,FRA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,10,9,9,0,0,0.0,points
4,11,S PEREZ,MEX,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,1,1,2,0,0,0.0,winner


In [77]:
#usuwa quali_x i y 
data['qualifying_position'] = data['qualifying_position_y']
data = data.drop(columns=['qualifying_position_x', 'qualifying_position_y'])
data.head()

Unnamed: 0,driver_number,broadcast_name,country_code_x,year,session_name,date_start,time_start,date_end,time_end,gmt_offset,starting_position,wins_before,pit_stops_count,avg_pit_stop_duration,position_category,qualifying_position
0,1,M VERSTAPPEN,NED,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,15,1,0,0.0,top3,1
1,2,L SARGEANT,USA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,20,0,0,0.0,no_points,16
2,4,L NORRIS,GBR,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,19,0,0,0.0,no_points,7
3,10,P GASLY,FRA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,9,0,0,0.0,points,9
4,11,S PEREZ,MEX,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,1,0,0,0.0,winner,2


In [78]:
#zamien kolejnosc zeby position_category byla ostatnia kolumna
cols = data.columns.tolist()
cols = [col for col in cols if col != 'position_category'] + ['position_category']
data = data[cols]
data.head()

Unnamed: 0,driver_number,broadcast_name,country_code_x,year,session_name,date_start,time_start,date_end,time_end,gmt_offset,starting_position,wins_before,pit_stops_count,avg_pit_stop_duration,qualifying_position,position_category
0,1,M VERSTAPPEN,NED,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,15,1,0,0.0,1,top3
1,2,L SARGEANT,USA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,20,0,0,0.0,16,no_points
2,4,L NORRIS,GBR,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,19,0,0,0.0,7,no_points
3,10,P GASLY,FRA,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,9,0,0,0.0,9,points
4,11,S PEREZ,MEX,2023,Race,2023-03-19,17:00:00,2023-03-19,19:00:00,03:00:00,1,0,0,0.0,2,winner


In [None]:
# do csv zapisz x oraz y do pliku f1_data.csv
data.to_csv('../../datasets/thesis_f1_data.csv', index=False)

In [80]:
# wyswitel informacje o zbiorze danych
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

#wyswietl cechy
print("Features in X:", X.columns.tolist())
# wyswietl unikalne kategorie pozycji
print("Unique position categories in y:", y.unique())


Shape of X: (2134, 16)
Shape of y: (2134,)
Features in X: ['driver_number', 'broadcast_name', 'country_code_x', 'year', 'session_name', 'date_start', 'time_start', 'date_end', 'time_end', 'gmt_offset', 'qualifying_position_x', 'starting_position', 'qualifying_position_y', 'wins_before', 'pit_stops_count', 'avg_pit_stop_duration']
Unique position categories in y: ['top3' 'no_points' 'points' 'winner']


In [81]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

In [None]:
!pip freeze > requirements.txt

In [84]:
import sklearn
import numpy as np
print(pd.__version__)
print(np.__version__)
print(requests.__version__)
print(sklearn.__version__)

2.3.2
2.3.3
2.32.5
1.7.1


In [None]:
# dla kazdego session w all_sessions_df policz