In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [38]:
# Load the raw ATIS dataset; the file uses ';' as its field delimiter.
df = pd.read_csv(
    '../atis_dataset.csv',
    sep=';',
)

In [39]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,atis_id,airport_icao,visibility,wind_speed,wind_gust,wind_direction,rvr,runway_designator_number,runway_designator_side,runway_ils_category,headwind,crosswind,ceiling,weather_phenomenon,ILS,RNAV,RNP,VISUAL
0,1,KIAD,10.0,6,0,180,>6000,19,L,CAT III,5.9,1.0,-1.0,,1,0,0,1
1,1,KIAD,10.0,6,0,180,>6000,19,C,CAT III,5.9,1.0,-1.0,,1,0,0,1
2,1,KIAD,10.0,6,0,180,>6000,19,R,CAT III,5.9,1.0,-1.0,,1,0,0,1
3,2,KLAX,10.0,4,0,40,>6000,6,R,No ILS,3.8,1.4,-1.0,,0,1,1,0
4,3,KSFO,10.0,8,0,260,>6000,28,L,CAT III,7.5,2.7,-1.0,,1,0,0,0


In [40]:
# Inspect one complete record by position (row 22); in the raw data this row
# has no runway_designator_side value.
df.iloc[22, :]

atis_id                         15
airport_icao                  KATL
visibility                    10.0
wind_speed                       3
wind_gust                        0
wind_direction                 240
rvr                         >6000 
runway_designator_number        28
runway_designator_side         NaN
runway_ils_category         No ILS
headwind                       2.3
crosswind                      1.9
ceiling                       -1.0
weather_phenomenon             NaN
ILS                              0
RNAV                             0
RNP                              0
VISUAL                           1
Name: 22, dtype: object

In [41]:
# Summarise column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6613 entries, 0 to 6612
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   atis_id                   6613 non-null   int64  
 1   airport_icao              6613 non-null   object 
 2   visibility                6613 non-null   float64
 3   wind_speed                6613 non-null   int64  
 4   wind_gust                 6613 non-null   int64  
 5   wind_direction            6613 non-null   int64  
 6   rvr                       6613 non-null   object 
 7   runway_designator_number  6613 non-null   int64  
 8   runway_designator_side    6041 non-null   object 
 9   runway_ils_category       6613 non-null   object 
 10  headwind                  6613 non-null   float64
 11  crosswind                 6613 non-null   float64
 12  ceiling                   6609 non-null   float64
 13  weather_phenomenon        583 non-null    object 
 14  ILS     

In [42]:
# Count missing values per column prior to any cleaning.
df.isna().sum()

atis_id                        0
airport_icao                   0
visibility                     0
wind_speed                     0
wind_gust                      0
wind_direction                 0
rvr                            0
runway_designator_number       0
runway_designator_side       572
runway_ils_category            0
headwind                       0
crosswind                      0
ceiling                        4
weather_phenomenon          6030
ILS                            0
RNAV                           0
RNP                            0
VISUAL                         0
dtype: int64

In [43]:
# Drop rows whose rvr value is the literal 'FFF'.
# The comparison runs on whitespace-trimmed values because raw entries can carry
# padding (e.g. '>6000 '), so a padded 'FFF ' would otherwise slip through.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not trigger SettingWithCopyWarning.
df = df[df['rvr'].str.strip() != 'FFF'].copy()

In [44]:
# Replace missing weather_phenomenon entries with the literal label "None".
# NOTE(review): recent pandas versions parse the text 'None' as a missing value
# when reading CSV with default settings — confirm that consumers of the
# exported file account for this (e.g. keep_default_na=False).
df['weather_phenomenon'] = df['weather_phenomenon'].fillna(value='None')

In [45]:
# Runways without a side letter get the placeholder 'X' (i.e. not L, R, or C).
df['runway_designator_side'] = df['runway_designator_side'].fillna(value='X')

In [46]:
# Trim leading and trailing whitespace from the raw rvr strings
# (e.g. '>6000 ' becomes '>6000') before they are parsed further.
df['rvr'] = df['rvr'].str.strip()

In [47]:
# Split each rvr string into a numeric value and a textual marker:
#   * rvr_tendency: everything left after removing the digits, trimmed
#     (e.g. '>'); it is an empty string when the value is digits only.
#     Any non-digit character is kept, including a decimal point.
#   * rvr: the first run of digits, converted to float.
# The column is cast to str once and the vectorised .str accessor is used
# instead of a row-wise apply.
rvr_text = df['rvr'].astype(str)
df['rvr_tendency'] = rvr_text.str.replace(r'\d+', '', regex=True).str.strip()
df['rvr'] = rvr_text.str.extract(r'(\d+)', expand=False).astype(float)

In [48]:
# Relocate 'rvr_tendency' so that it sits directly to the right of 'rvr'.

# Detach the column; pop() removes it from the frame and hands back its values.
rvr_tendency = df.pop('rvr_tendency')

# Positional index of the 'rvr' column among the remaining columns.
rvr_index = list(df.columns).index('rvr')

# Re-attach the detached column one slot after 'rvr'.
df.insert(loc=rvr_index + 1, column='rvr_tendency', value=rvr_tendency)


In [50]:
# Preview the first five rows after preprocessing.
df.head(n=5)

Unnamed: 0,atis_id,airport_icao,visibility,wind_speed,wind_gust,wind_direction,rvr,rvr_tendency,runway_designator_number,runway_designator_side,runway_ils_category,headwind,crosswind,ceiling,weather_phenomenon,ILS,RNAV,RNP,VISUAL
0,1,KIAD,10.0,6,0,180,6000.0,>,19,L,CAT III,5.9,1.0,-1.0,,1,0,0,1
1,1,KIAD,10.0,6,0,180,6000.0,>,19,C,CAT III,5.9,1.0,-1.0,,1,0,0,1
2,1,KIAD,10.0,6,0,180,6000.0,>,19,R,CAT III,5.9,1.0,-1.0,,1,0,0,1
3,2,KLAX,10.0,4,0,40,6000.0,>,6,R,No ILS,3.8,1.4,-1.0,,0,1,1,0
4,3,KSFO,10.0,8,0,260,6000.0,>,28,L,CAT III,7.5,2.7,-1.0,,1,0,0,0


In [51]:
# Re-inspect the record at position 22 to check the filled-in and parsed
# columns (runway_designator_side, weather_phenomenon, rvr, rvr_tendency).
df.iloc[22, :]

atis_id                         15
airport_icao                  KATL
visibility                    10.0
wind_speed                       3
wind_gust                        0
wind_direction                 240
rvr                         6000.0
rvr_tendency                     >
runway_designator_number        28
runway_designator_side           X
runway_ils_category         No ILS
headwind                       2.3
crosswind                      1.9
ceiling                       -1.0
weather_phenomenon            None
ILS                              0
RNAV                             0
RNP                              0
VISUAL                           1
Name: 22, dtype: object

In [52]:
# Count missing values per column once preprocessing is done.
df.isna().sum()

atis_id                     0
airport_icao                0
visibility                  0
wind_speed                  0
wind_gust                   0
wind_direction              0
rvr                         0
rvr_tendency                0
runway_designator_number    0
runway_designator_side      0
runway_ils_category         0
headwind                    0
crosswind                   0
ceiling                     4
weather_phenomenon          0
ILS                         0
RNAV                        0
RNP                         0
VISUAL                      0
dtype: int64

In [53]:
# Show every row that still has a missing value in at least one column.
has_missing = df.isna().any(axis='columns')
df.loc[has_missing]

Unnamed: 0,atis_id,airport_icao,visibility,wind_speed,wind_gust,wind_direction,rvr,rvr_tendency,runway_designator_number,runway_designator_side,runway_ils_category,headwind,crosswind,ceiling,weather_phenomenon,ILS,RNAV,RNP,VISUAL
4390,2526,KJFK,1.0,6,0,160,6000.0,,22,L,CAT III,3.0,5.2,,BR,1,0,0,0
4391,2526,KJFK,1.0,6,0,160,6000.0,>,22,R,CAT III,3.0,5.2,,BR,1,0,0,0
5897,3357,KJFK,1.0,12,0,130,3000.0,▲,22,L,CAT III,0.0,12.0,,- DZ BR,1,0,0,0
5898,3357,KJFK,1.0,12,0,130,2000.0,▼,22,R,CAT III,0.0,12.0,,- DZ BR,1,0,0,0


In [54]:
# Summarise column dtypes and non-null counts after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6591 entries, 0 to 6612
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   atis_id                   6591 non-null   int64  
 1   airport_icao              6591 non-null   object 
 2   visibility                6591 non-null   float64
 3   wind_speed                6591 non-null   int64  
 4   wind_gust                 6591 non-null   int64  
 5   wind_direction            6591 non-null   int64  
 6   rvr                       6591 non-null   float64
 7   rvr_tendency              6591 non-null   object 
 8   runway_designator_number  6591 non-null   int64  
 9   runway_designator_side    6591 non-null   object 
 10  runway_ils_category       6591 non-null   object 
 11  headwind                  6591 non-null   float64
 12  crosswind                 6591 non-null   float64
 13  ceiling                   6587 non-null   float64
 14  weather_pheno

In [57]:
# Summary statistics for the numeric columns.
# NOTE(review): wind_direction and ceiling both report a minimum of -1.0,
# presumably a sentinel for "not reported" — confirm, since such values skew
# the mean and std shown here.
df.describe()

Unnamed: 0,atis_id,visibility,wind_speed,wind_gust,wind_direction,rvr,runway_designator_number,headwind,crosswind,ceiling,ILS,RNAV,RNP,VISUAL
count,6591.0,6591.0,6591.0,6591.0,6591.0,6591.0,6591.0,6591.0,6591.0,6587.0,6591.0,6591.0,6591.0,6591.0
mean,1897.942042,9.4769,9.66788,7.619785,206.41587,5959.748141,20.107116,6.505492,5.239266,5597.77486,0.742528,0.109695,0.077075,0.65635
std,1082.086293,1.819284,5.626002,12.048984,100.944406,367.597124,9.669064,5.903299,4.505573,8682.266004,0.437275,0.312533,0.26673,0.474962
min,1.0,0.25,0.0,0.0,-1.0,0.0,1.0,-10.4,0.0,-1.0,0.0,0.0,0.0,0.0
25%,951.5,10.0,5.0,0.0,150.0,6000.0,10.0,2.5,1.9,-1.0,0.0,0.0,0.0,0.0
50%,1904.0,10.0,9.0,0.0,220.0,6000.0,24.0,6.0,4.0,-1.0,1.0,0.0,0.0,1.0
75%,2823.5,10.0,13.0,20.0,290.0,6000.0,28.0,10.0,7.7,7500.0,1.0,0.0,0.0,1.0
max,3754.0,10.0,29.0,43.0,360.0,6000.0,31.0,29.0,25.6,30000.0,1.0,1.0,1.0,1.0


In [58]:
# Write the cleaned frame to disk as a ';'-delimited CSV, omitting the index.
df.to_csv(
    'atis_dataset_preprocessed.csv',
    sep=';',
    index=False,
)