In [1]:
import numpy as np
import pandas as pd
import xarray as xr

In [2]:
# Load the METAR message overview table from its CSV file.
metar_csv_path = 'data/metar_msg_overview.csv'
metar_data = pd.read_csv(metar_csv_path)

In [3]:
# Parse the issued_at strings into pandas datetime values.
issued_at_parsed = pd.to_datetime(metar_data['issued_at'])
metar_data['issued_at'] = issued_at_parsed

### Round METAR issue times to the nearest whole hour, then remove duplicates

In [4]:
# Preview the loaded table (issued_at has been converted to datetime above).
metar_data

Unnamed: 0,airport_identifier,issued_at,metar,metartype
0,ENDR,2021-01-01 00:20:00,ENDR 010020Z 14007KT 03/M02 Q1006=,AUTO
1,ENZV,2021-01-01 00:20:00,ENZV 010020Z 13006KT 9999 SCT026 BKN035 03/01 ...,
2,ENTC,2021-01-01 00:20:00,ENTC 010020Z 21015KT CAVOK 02/M03 Q1013 RMK WI...,
3,ENSB,2021-01-01 00:20:00,ENSB 010020Z 24004KT 9999 -SN FEW009 BKN033 M0...,
4,ENGM,2021-01-01 00:20:00,ENGM 010020Z 36006KT 9999 4900E -SN FEW009 OVC...,
...,...,...,...,...
3275708,ENLA,2023-12-31 23:50:00,ENLA 312350Z 11021KT 9999NDV BKN036/// 07/06 Q...,AUTO
3275709,ENUS,2023-12-31 23:50:00,ENUS 312350Z 10033G43KT 9999NDV BKN022/// 00/M...,AUTO
3275710,ENOL,2023-12-31 23:50:00,ENOL 312350Z 12029G41KT 9999 DRSN NSC M03/M09 ...,
3275711,ENWV,2023-12-31 23:50:00,ENWV 312350Z 14019KT 9999NDV SCT014/// OVC037/...,AUTO


In [5]:
# Build an xarray Dataset from the METAR table; each column becomes a data
# variable along the frame's row index.
metar_data_xarray = metar_data.to_xarray()

In [6]:
# Inspect the Dataset built from the METAR table.
metar_data_xarray

In [7]:
# Round each issue time to the nearest whole hour (e.g. 00:20 -> 00:00 and
# 00:50 -> 01:00) and store the result as a new 'time' variable.
# The lowercase 'h' alias is used because the uppercase 'H' frequency alias
# is deprecated in pandas >= 2.2.
metar_data_xarray['time'] = metar_data_xarray['issued_at'].dt.round('h')

In [8]:
# Convert the Dataset (now carrying the rounded 'time' variable) back into a
# pandas DataFrame for the de-duplication steps that follow.
metar_data_pandas = metar_data_xarray.to_dataframe()

In [9]:
# Show the first 50 rows to compare the rounded 'time' column with issued_at.
metar_data_pandas.head(50)

Unnamed: 0_level_0,airport_identifier,issued_at,metar,metartype,time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ENDR,2021-01-01 00:20:00,ENDR 010020Z 14007KT 03/M02 Q1006=,AUTO,2021-01-01 00:00:00
1,ENZV,2021-01-01 00:20:00,ENZV 010020Z 13006KT 9999 SCT026 BKN035 03/01 ...,,2021-01-01 00:00:00
2,ENTC,2021-01-01 00:20:00,ENTC 010020Z 21015KT CAVOK 02/M03 Q1013 RMK WI...,,2021-01-01 00:00:00
3,ENSB,2021-01-01 00:20:00,ENSB 010020Z 24004KT 9999 -SN FEW009 BKN033 M0...,,2021-01-01 00:00:00
4,ENGM,2021-01-01 00:20:00,ENGM 010020Z 36006KT 9999 4900E -SN FEW009 OVC...,,2021-01-01 00:00:00
5,ENBO,2021-01-01 00:20:00,ENBO 010020Z 11013KT CAVOK 01/M06 Q1009=,,2021-01-01 00:00:00
6,ENVA,2021-01-01 00:20:00,ENVA 010020Z 13008KT CAVOK M06/M08 Q1007 RMK W...,,2021-01-01 00:00:00
7,ENGC,2021-01-01 00:20:00,ENGC 010020Z 02024KT 9999 FEW020 SCT030 07/03 ...,,2021-01-01 00:00:00
8,ENFL,2021-01-01 00:20:00,ENFL 010020Z 36013KT 9999NDV FEW170/// 04/M03 ...,AUTO,2021-01-01 00:00:00
9,ENML,2021-01-01 00:20:00,ENML 010020Z 02007KT 9999NDV NCD M02/M05 Q1006=,AUTO,2021-01-01 00:00:00


In [10]:
# Keep a single report per airport and rounded hour (the first occurrence in
# the current row order is the one retained), then renumber the rows from 0.
hourly_keys = ['airport_identifier', 'time']
metar_data_pandas = metar_data_pandas.drop_duplicates(subset=hourly_keys)
metar_data_pandas = metar_data_pandas.reset_index(drop=True)

In [11]:
# Order the rows by airport and then by rounded hour, and use that pair as the
# row (Multi)Index. The result is reassigned rather than mutated with
# inplace=True so that the cell reads as a single transformation. No groupby
# is performed here: a bare groupby whose result is not stored has no effect
# on the frame.
metar_data_pandas = (
    metar_data_pandas
    .sort_values(['airport_identifier', 'time'])
    .set_index(['airport_identifier', 'time'])
)

In [12]:
# Replace NaN with MANUAL
# Label rows with a missing metartype as 'MANUAL'.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selected with df[col] is a chained
# in-place operation that may act on a temporary object and does not update
# the frame when pandas Copy-on-Write is active.
metar_data_pandas['metartype'] = metar_data_pandas['metartype'].fillna('MANUAL')

In [13]:
# Inspect the hourly table, now indexed by (airport_identifier, time).
metar_data_pandas

Unnamed: 0_level_0,Unnamed: 1_level_0,issued_at,metar,metartype
airport_identifier,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENAL,2021-01-01 00:00:00,2021-01-01 00:20:00,ENAL 010020Z 08005KT 9999 SCT024/// BKN037/// ...,AUTO
ENAL,2021-01-01 01:00:00,2021-01-01 00:50:00,ENAL 010050Z 09006KT 9999 FEW025/// 00/M02 Q1006=,AUTO
ENAL,2021-01-01 02:00:00,2021-01-01 01:50:00,ENAL 010150Z 09006KT 9999 BKN025/// M01/M02 Q1...,AUTO
ENAL,2021-01-01 03:00:00,2021-01-01 02:50:00,ENAL 010250Z 10005KT 9999 OVC029/// 00/M02 Q1006=,AUTO
ENAL,2021-01-01 04:00:00,2021-01-01 03:50:00,ENAL 010350Z VRB03KT 9999 OVC024/// 02/M01 Q1006=,AUTO
...,...,...,...,...
ENZV,2023-12-31 20:00:00,2023-12-31 19:50:00,ENZV 311950Z 10017KT CAVOK 05/M04 Q0994=,MANUAL
ENZV,2023-12-31 21:00:00,2023-12-31 20:50:00,ENZV 312050Z 10018KT CAVOK 05/M04 Q0994=,MANUAL
ENZV,2023-12-31 22:00:00,2023-12-31 21:50:00,ENZV 312150Z 10022KT CAVOK 05/M05 Q0994=,MANUAL
ENZV,2023-12-31 23:00:00,2023-12-31 22:50:00,ENZV 312250Z 10021KT CAVOK 05/M04 Q0994=,MANUAL


### Add flag for messages containing FZ


In [14]:
# Flag reports whose raw METAR text contains the substring 'FZ'.
# na=False makes a missing METAR string count as "does not contain FZ";
# without it str.contains yields NaN for such rows and astype(bool) would turn
# that NaN into True. regex=False matches 'FZ' as a literal substring.
metar_data_pandas['contain_FZ'] = (
    metar_data_pandas['metar']
    .str.contains('FZ', regex=False, na=False)
    .astype(bool)
)

In [15]:
# Inspect the table with the new contain_FZ flag column.
metar_data_pandas

Unnamed: 0_level_0,Unnamed: 1_level_0,issued_at,metar,metartype,contain_FZ
airport_identifier,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENAL,2021-01-01 00:00:00,2021-01-01 00:20:00,ENAL 010020Z 08005KT 9999 SCT024/// BKN037/// ...,AUTO,False
ENAL,2021-01-01 01:00:00,2021-01-01 00:50:00,ENAL 010050Z 09006KT 9999 FEW025/// 00/M02 Q1006=,AUTO,False
ENAL,2021-01-01 02:00:00,2021-01-01 01:50:00,ENAL 010150Z 09006KT 9999 BKN025/// M01/M02 Q1...,AUTO,False
ENAL,2021-01-01 03:00:00,2021-01-01 02:50:00,ENAL 010250Z 10005KT 9999 OVC029/// 00/M02 Q1006=,AUTO,False
ENAL,2021-01-01 04:00:00,2021-01-01 03:50:00,ENAL 010350Z VRB03KT 9999 OVC024/// 02/M01 Q1006=,AUTO,False
...,...,...,...,...,...
ENZV,2023-12-31 20:00:00,2023-12-31 19:50:00,ENZV 311950Z 10017KT CAVOK 05/M04 Q0994=,MANUAL,False
ENZV,2023-12-31 21:00:00,2023-12-31 20:50:00,ENZV 312050Z 10018KT CAVOK 05/M04 Q0994=,MANUAL,False
ENZV,2023-12-31 22:00:00,2023-12-31 21:50:00,ENZV 312150Z 10022KT CAVOK 05/M05 Q0994=,MANUAL,False
ENZV,2023-12-31 23:00:00,2023-12-31 22:50:00,ENZV 312250Z 10021KT CAVOK 05/M04 Q0994=,MANUAL,False


In [16]:
# Convert the (airport_identifier, time)-indexed frame to an xarray Dataset.
# NOTE(review): with a MultiIndex, to_xarray is expected to create one
# dimension per index level, so airport/hour pairs that have no report would
# appear as missing values in the resulting grid — confirm this is intended.
metar_data_xr = metar_data_pandas.to_xarray()

In [18]:
# Write the Dataset to a NetCDF file; the relative path resolves against the
# notebook's current working directory.
output_file = 'metar_dataset.nc'
metar_data_xr.to_netcdf(output_file)

In [19]:
# Read back the file that was just written. The output_file variable is reused
# (instead of repeating the path literal) so the write and read locations
# cannot drift apart.
metar_data_reopened = xr.open_dataset(output_file)

In [20]:
# Display the Dataset read back from disk.
metar_data_reopened