## METAR

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
import time

In [2]:
metar_data = pd.read_csv('../data/metar_msg_overview.csv')

In [3]:
metar_data

Unnamed: 0,airport_identifier,issued_at,metar,metartype
0,ENDR,2021-01-01 00:20:00,ENDR 010020Z 14007KT 03/M02 Q1006=,AUTO
1,ENZV,2021-01-01 00:20:00,ENZV 010020Z 13006KT 9999 SCT026 BKN035 03/01 ...,
2,ENTC,2021-01-01 00:20:00,ENTC 010020Z 21015KT CAVOK 02/M03 Q1013 RMK WI...,
3,ENSB,2021-01-01 00:20:00,ENSB 010020Z 24004KT 9999 -SN FEW009 BKN033 M0...,
4,ENGM,2021-01-01 00:20:00,ENGM 010020Z 36006KT 9999 4900E -SN FEW009 OVC...,
...,...,...,...,...
3275708,ENLA,2023-12-31 23:50:00,ENLA 312350Z 11021KT 9999NDV BKN036/// 07/06 Q...,AUTO
3275709,ENUS,2023-12-31 23:50:00,ENUS 312350Z 10033G43KT 9999NDV BKN022/// 00/M...,AUTO
3275710,ENOL,2023-12-31 23:50:00,ENOL 312350Z 12029G41KT 9999 DRSN NSC M03/M09 ...,
3275711,ENWV,2023-12-31 23:50:00,ENWV 312350Z 14019KT 9999NDV SCT014/// OVC037/...,AUTO


Checking for nan and null values

In [4]:
metar_data.isna().any()

airport_identifier    False
issued_at             False
metar                 False
metartype              True
dtype: bool

In [5]:
metar_data.isnull().any()

airport_identifier    False
issued_at             False
metar                 False
metartype              True
dtype: bool

Metartype columns with NaN are manual messages. Filling these rows with 'MANUAL'

In [6]:
# Replace NaN with MANUAL
metar_data['metartype'].fillna('MANUAL', inplace=True)

In [7]:
metar_data[metar_data.duplicated(subset=['airport_identifier', 'issued_at'], keep=False)]

Unnamed: 0,airport_identifier,issued_at,metar,metartype
434,ENKB,2021-01-01 04:50:00,ENKB 010450Z 11004KT 070V150 9999 BKN090/// M0...,AUTO
439,ENKB,2021-01-01 04:50:00,ENKB 010450Z 11004KT 070V150 CAVOK M01/M03 Q10...,COR
490,ENGM,2021-01-01 05:50:00,ENGM 010550Z 02006KT 9999 -SN BKN014 M02/M03 Q...,COR
491,ENGM,2021-01-01 05:50:00,ENGM 010550Z 02006KT 9999 -SN BKN014 M02/M03 Q...,MANUAL
545,ENBO,2021-01-01 06:20:00,ENBO 010620Z 09011KT 9999 BKN060 02/M05 Q1011 ...,MANUAL
...,...,...,...,...
3275040,ENBN,2023-12-31 18:20:00,ENBN 311820Z 09013KT 060V120 CAVOK M04/M10 Q1015=,COR
3275044,ENTO,2023-12-31 18:50:00,ENTO 311850Z 03013KT 1000 SN VV007 M04/M04 Q10...,MANUAL
3275055,ENTO,2023-12-31 18:50:00,ENTO 311850Z 03013KT 1000 SN VV007 M04/M04 Q10...,COR
3275616,ENAT,2023-12-31 23:20:00,ENAT 312320Z 15013KT 9999 NCD M09/M14 Q1023 RM...,AUTO


## Handling duplicated values

Listing metartypes based on priority

In [8]:
metartype_prio = ['SPECI', 'COR', 'MANUAL', 'AUTO']

In [9]:
# Sorting the metar data based on the priority list

In [10]:
metar_data_sorted= metar_data.sort_values(by=['metartype'], key=lambda x: x.map({metartype: metartype_prio.index(metartype) for metartype in metartype_prio}))

In [11]:
metar_data_sorted

Unnamed: 0,airport_identifier,issued_at,metar,metartype
1017293,ENBS,2022-01-25 16:26:00,ENBS 251626Z 27028KT 1000 SN BLSN VV005 M03/M0...,SPECI
1448143,ENRS,2022-06-24 10:34:00,ENRS 241034Z 19008KT 9999 FEW006 SCT028 BKN037...,SPECI
2930725,ENVD,2023-09-23 10:30:00,ENVD 231030Z 28014KT 6000 RA FEW003 BKN010 BKN...,SPECI
1426855,ENSS,2022-06-17 14:32:00,ENSS 171432Z 36010KT 9999 FEW007 SCT015 OVC026...,SPECI
1426856,ENBV,2022-06-17 14:33:00,ENBV 171433Z 03008KT 9999 FEW012 SCT022 BKN032...,SPECI
...,...,...,...,...
2032606,ENLA,2022-12-24 23:50:00,ENLA 242350Z 16031KT 9000NDV BKN007/// BKN130/...,AUTO
2032605,ENFB,2022-12-24 23:50:00,ENFB 242350Z 14042KT 5000NDV -SHRA BKN011/// 0...,AUTO
2032604,ENSE,2022-12-24 23:50:00,ENSE 242350Z 14043KT 9999NDV -RA OVC019/// 05/...,AUTO
2032615,ENSL,2022-12-24 23:50:00,ENSL 242350Z 16037KT 9999NDV BKN009/// BKN090/...,AUTO


Remove duplicates based 'airport_identifier' and 'issued', keeping first occurrence of metartype in the prioritized order

In [12]:
metar_data_no_duplicates = metar_data_sorted.drop_duplicates(subset=['airport_identifier', 'issued_at'])

In [13]:
metar_data_no_duplicates

Unnamed: 0,airport_identifier,issued_at,metar,metartype
1017293,ENBS,2022-01-25 16:26:00,ENBS 251626Z 27028KT 1000 SN BLSN VV005 M03/M0...,SPECI
1448143,ENRS,2022-06-24 10:34:00,ENRS 241034Z 19008KT 9999 FEW006 SCT028 BKN037...,SPECI
2930725,ENVD,2023-09-23 10:30:00,ENVD 231030Z 28014KT 6000 RA FEW003 BKN010 BKN...,SPECI
1426855,ENSS,2022-06-17 14:32:00,ENSS 171432Z 36010KT 9999 FEW007 SCT015 OVC026...,SPECI
1426856,ENBV,2022-06-17 14:33:00,ENBV 171433Z 03008KT 9999 FEW012 SCT022 BKN032...,SPECI
...,...,...,...,...
2032606,ENLA,2022-12-24 23:50:00,ENLA 242350Z 16031KT 9000NDV BKN007/// BKN130/...,AUTO
2032605,ENFB,2022-12-24 23:50:00,ENFB 242350Z 14042KT 5000NDV -SHRA BKN011/// 0...,AUTO
2032604,ENSE,2022-12-24 23:50:00,ENSE 242350Z 14043KT 9999NDV -RA OVC019/// 05/...,AUTO
2032615,ENSL,2022-12-24 23:50:00,ENSL 242350Z 16037KT 9999NDV BKN009/// BKN090/...,AUTO


In [14]:
metar_data_no_duplicates[metar_data_no_duplicates.duplicated(subset=['airport_identifier', 'issued_at'])]

Unnamed: 0,airport_identifier,issued_at,metar,metartype


In [15]:
no_duplicates_left = metar_data_no_duplicates.duplicated(subset=['airport_identifier', 'issued_at']).sum() == 0

if no_duplicates_left:
    print("No duplicates left in the DataFrame.")
else:
    print("There are still duplicates in the DataFrame.")


No duplicates left in the DataFrame.


## Extract airports where FZ has been registered in METAR

In [16]:
# Add FZ column
metar_data_no_duplicates['contain_FZ'] = metar_data_no_duplicates['metar'].str.contains('FZ').astype(bool)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metar_data_no_duplicates['contain_FZ'] = metar_data_no_duplicates['metar'].str.contains('FZ').astype(bool)


In [17]:
print(f'Number of airports: {metar_data_no_duplicates["airport_identifier"].nunique()}')
print(f'Number of airports with registered FZ in metar: {metar_data_no_duplicates[metar_data_no_duplicates["contain_FZ"]]["airport_identifier"].nunique()}')

Number of airports: 82
Number of airports with registered FZ in metar: 74


In [18]:
metar_data_with_FZ = metar_data_no_duplicates[metar_data_no_duplicates["contain_FZ"]]["airport_identifier"].unique()


In [19]:
metar_data_filtered = metar_data_no_duplicates[metar_data_no_duplicates['airport_identifier'].isin(metar_data_with_FZ)]

In [20]:
metar_data_filtered

Unnamed: 0,airport_identifier,issued_at,metar,metartype,contain_FZ
1017293,ENBS,2022-01-25 16:26:00,ENBS 251626Z 27028KT 1000 SN BLSN VV005 M03/M0...,SPECI,False
1448143,ENRS,2022-06-24 10:34:00,ENRS 241034Z 19008KT 9999 FEW006 SCT028 BKN037...,SPECI,False
2930725,ENVD,2023-09-23 10:30:00,ENVD 231030Z 28014KT 6000 RA FEW003 BKN010 BKN...,SPECI,False
1426855,ENSS,2022-06-17 14:32:00,ENSS 171432Z 36010KT 9999 FEW007 SCT015 OVC026...,SPECI,False
1426856,ENBV,2022-06-17 14:33:00,ENBV 171433Z 03008KT 9999 FEW012 SCT022 BKN032...,SPECI,False
...,...,...,...,...,...
2032606,ENLA,2022-12-24 23:50:00,ENLA 242350Z 16031KT 9000NDV BKN007/// BKN130/...,AUTO,False
2032605,ENFB,2022-12-24 23:50:00,ENFB 242350Z 14042KT 5000NDV -SHRA BKN011/// 0...,AUTO,False
2032604,ENSE,2022-12-24 23:50:00,ENSE 242350Z 14043KT 9999NDV -RA OVC019/// 05/...,AUTO,False
2032615,ENSL,2022-12-24 23:50:00,ENSL 242350Z 16037KT 9999NDV BKN009/// BKN090/...,AUTO,False


In [21]:
# Convert issued_at to datetime
metar_data_filtered['issued_at'] = pd.to_datetime(metar_data_filtered['issued_at'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metar_data_filtered['issued_at'] = pd.to_datetime(metar_data_filtered['issued_at'])


In [22]:
# Add 'time' column for rounding 'issued_at'

In [23]:
metar_data_filtered['time'] = metar_data_filtered['issued_at'].dt.round('H')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metar_data_filtered['time'] = metar_data_filtered['issued_at'].dt.round('H')


In [24]:
metar_data_filtered

Unnamed: 0,airport_identifier,issued_at,metar,metartype,contain_FZ,time
1017293,ENBS,2022-01-25 16:26:00,ENBS 251626Z 27028KT 1000 SN BLSN VV005 M03/M0...,SPECI,False,2022-01-25 16:00:00
1448143,ENRS,2022-06-24 10:34:00,ENRS 241034Z 19008KT 9999 FEW006 SCT028 BKN037...,SPECI,False,2022-06-24 11:00:00
2930725,ENVD,2023-09-23 10:30:00,ENVD 231030Z 28014KT 6000 RA FEW003 BKN010 BKN...,SPECI,False,2023-09-23 10:00:00
1426855,ENSS,2022-06-17 14:32:00,ENSS 171432Z 36010KT 9999 FEW007 SCT015 OVC026...,SPECI,False,2022-06-17 15:00:00
1426856,ENBV,2022-06-17 14:33:00,ENBV 171433Z 03008KT 9999 FEW012 SCT022 BKN032...,SPECI,False,2022-06-17 15:00:00
...,...,...,...,...,...,...
2032606,ENLA,2022-12-24 23:50:00,ENLA 242350Z 16031KT 9000NDV BKN007/// BKN130/...,AUTO,False,2022-12-25 00:00:00
2032605,ENFB,2022-12-24 23:50:00,ENFB 242350Z 14042KT 5000NDV -SHRA BKN011/// 0...,AUTO,False,2022-12-25 00:00:00
2032604,ENSE,2022-12-24 23:50:00,ENSE 242350Z 14043KT 9999NDV -RA OVC019/// 05/...,AUTO,False,2022-12-25 00:00:00
2032615,ENSL,2022-12-24 23:50:00,ENSL 242350Z 16037KT 9999NDV BKN009/// BKN090/...,AUTO,False,2022-12-25 00:00:00


In [25]:
metar_data_filtered[(metar_data_filtered['airport_identifier'] == 'ENWG') &( metar_data_filtered['contain_FZ'] == True)]

Unnamed: 0,airport_identifier,issued_at,metar,metartype,contain_FZ,time
2251986,ENWG,2023-03-06 02:50:00,ENWG 060250Z 34003KT 9999NDV -RA SCT007/// OVC...,AUTO,True,2023-03-06 03:00:00
2251918,ENWG,2023-03-06 02:20:00,ENWG 060220Z 30006KT 9999NDV -FZRA FEW005/// O...,AUTO,True,2023-03-06 02:00:00
2263050,ENWG,2023-03-09 11:50:00,ENWG 091150Z 30029KT 9999NDV -RA FEW024/// BKN...,AUTO,True,2023-03-09 12:00:00


ENWG airport is not registered in airport locations, and only has 3 METAR messages with FZ (AUTO-generated). Removing this airport_identifier.

In [26]:
metar_data_filtered = metar_data_filtered[metar_data_filtered['airport_identifier'] != 'ENWG']

In [27]:
print(f'Total unique timnestamps: {metar_data_filtered["time"].nunique()}')

Total unique timnestamps: 26281


In [28]:
# Drop duplicated timestamps after rounding
metar_data_final = metar_data_filtered.drop_duplicates(subset=['airport_identifier', 'time'])

In [29]:
metar_data_final

Unnamed: 0,airport_identifier,issued_at,metar,metartype,contain_FZ,time
1017293,ENBS,2022-01-25 16:26:00,ENBS 251626Z 27028KT 1000 SN BLSN VV005 M03/M0...,SPECI,False,2022-01-25 16:00:00
1448143,ENRS,2022-06-24 10:34:00,ENRS 241034Z 19008KT 9999 FEW006 SCT028 BKN037...,SPECI,False,2022-06-24 11:00:00
2930725,ENVD,2023-09-23 10:30:00,ENVD 231030Z 28014KT 6000 RA FEW003 BKN010 BKN...,SPECI,False,2023-09-23 10:00:00
1426855,ENSS,2022-06-17 14:32:00,ENSS 171432Z 36010KT 9999 FEW007 SCT015 OVC026...,SPECI,False,2022-06-17 15:00:00
1426856,ENBV,2022-06-17 14:33:00,ENBV 171433Z 03008KT 9999 FEW012 SCT022 BKN032...,SPECI,False,2022-06-17 15:00:00
...,...,...,...,...,...,...
2032632,ENSR,2022-12-25 00:20:00,ENSR 250020Z VRB03KT 9999 NCD M07/M09 Q1000 RM...,AUTO,False,2022-12-25 00:00:00
2032589,ENRA,2022-12-24 23:50:00,ENRA 242350Z 34002KT 9999 -FZDZ OVC013/// M08/...,AUTO,True,2022-12-25 00:00:00
2032578,ENBS,2022-12-24 23:50:00,ENBS 242350Z 35008KT 9999 OVC030/// M03/M08 Q0...,AUTO,False,2022-12-25 00:00:00
2032628,ENSD,2022-12-25 00:20:00,ENSD 250020Z 14008KT 9999 NCD M05/M08 Q1008 RM...,AUTO,False,2022-12-25 00:00:00


In [30]:
# Sort values based on 'airport_identifier' and 'time'
metar_data_final.sort_values(['airport_identifier', 'time'], inplace=True)

# Group by airport
metar_data_final.groupby(['airport_identifier'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metar_data_final.sort_values(['airport_identifier', 'time'], inplace=True)


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1502fd4869d0>

In [31]:
# Check for a specific airport
metar_data_final[(metar_data_final['airport_identifier'] == 'ENGM') & (metar_data_final['contain_FZ'] == True)]

Unnamed: 0,airport_identifier,issued_at,metar,metartype,contain_FZ,time
23884,ENGM,2021-01-10 13:50:00,ENGM 101350Z 00000KT 0300 R19R/0400N R01R/0350...,MANUAL,True,2021-01-10 14:00:00
24005,ENGM,2021-01-10 14:50:00,ENGM 101450Z 04002KT 5000 0900N R19R/1300U R01...,MANUAL,True,2021-01-10 15:00:00
24423,ENGM,2021-01-10 18:20:00,ENGM 101820Z 00000KT 1000 R19R/0700N R01R/P200...,MANUAL,True,2021-01-10 18:00:00
24551,ENGM,2021-01-10 19:20:00,ENGM 101920Z VRB02KT 0800 R19R/1700U R01R/0450...,MANUAL,True,2021-01-10 19:00:00
24673,ENGM,2021-01-10 20:20:00,ENGM 102020Z VRB01KT 0500 R19R/0600N R01R/1100...,MANUAL,True,2021-01-10 20:00:00
...,...,...,...,...,...,...
3273134,ENGM,2023-12-31 05:20:00,ENGM 310520Z 08003KT 050V110 9999 -FZDZ SCT007...,MANUAL,True,2023-12-31 05:00:00
3273197,ENGM,2023-12-31 05:50:00,ENGM 310550Z 07004KT 9999 -FZDZ FEW007 OVC010 ...,MANUAL,True,2023-12-31 06:00:00
3273418,ENGM,2023-12-31 07:20:00,ENGM 310720Z 07004KT 9999 OVC011 M04/M05 Q1012...,MANUAL,True,2023-12-31 07:00:00
3274575,ENGM,2023-12-31 15:20:00,ENGM 311520Z 07008KT 8000 -FZDZ OVC008 M05/M07...,MANUAL,True,2023-12-31 15:00:00


In [32]:
# Set airport and issued_at as index
#metar_data_final.set_index(['airport_identifier', 'time'], inplace=True, drop=True)

In [33]:
metar_data_final

Unnamed: 0,airport_identifier,issued_at,metar,metartype,contain_FZ,time
10,ENAL,2021-01-01 00:20:00,ENAL 010020Z 08005KT 9999 SCT024/// BKN037/// ...,AUTO,False,2021-01-01 00:00:00
95,ENAL,2021-01-01 01:20:00,ENAL 010120Z 08005KT 9999 FEW027/// SCT040/// ...,AUTO,False,2021-01-01 01:00:00
140,ENAL,2021-01-01 01:50:00,ENAL 010150Z 09006KT 9999 BKN025/// M01/M02 Q1...,AUTO,False,2021-01-01 02:00:00
267,ENAL,2021-01-01 03:20:00,ENAL 010320Z 09006KT 9999 OVC025/// 01/M02 Q1006=,AUTO,False,2021-01-01 03:00:00
360,ENAL,2021-01-01 04:20:00,ENAL 010420Z 17003KT 9999 OVC028/// 01/M01 Q1006=,AUTO,False,2021-01-01 04:00:00
...,...,...,...,...,...,...
3275166,ENZV,2023-12-31 19:50:00,ENZV 311950Z 10017KT CAVOK 05/M04 Q0994=,MANUAL,False,2023-12-31 20:00:00
3275285,ENZV,2023-12-31 20:50:00,ENZV 312050Z 10018KT CAVOK 05/M04 Q0994=,MANUAL,False,2023-12-31 21:00:00
3275473,ENZV,2023-12-31 22:20:00,ENZV 312220Z 10023G34KT CAVOK 05/M05 Q0994=,MANUAL,False,2023-12-31 22:00:00
3275588,ENZV,2023-12-31 22:50:00,ENZV 312250Z 10021KT CAVOK 05/M04 Q0994=,MANUAL,False,2023-12-31 23:00:00


## Save to CSV

In [34]:
output_file = 'metar_dataset_cleaned.csv'
metar_data_final.to_csv(output_file, index=False)