In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [66]:
# The dataset file uses ';' as its field delimiter rather than the default ','.
raw_csv_path = '../atis_dataset.csv'
df = pd.read_csv(raw_csv_path, delimiter=';')

In [69]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,airport_icao,visibility,wind_speed,wind_direction,rvr,runway_designator_number,runway_designator_side,runway_ils_category,headwind,crosswind,ceiling,weather_phenomenon,ILS,RNAV,RNP,VISUAL
0,KIAD,10.0,6,180,>6000,19,L,No ILS,5.9,1.0,-1.0,,0,0,0,1
1,KSFO,10.0,8,260,>6000,28,L,CAT III,7.5,2.7,-1.0,,1,0,0,0
2,KORD,10.0,5,310,>6000,28,R,CAT III,4.3,2.5,5000.0,,1,0,0,0
3,KSFO,10.0,5,260,>6000,28,L,CAT III,4.7,1.7,-1.0,,1,0,0,0
4,KSFO,10.0,5,260,>6000,28,L,CAT III,4.7,1.7,-1.0,,1,0,0,0


In [70]:
# Spot-check one record (positional row 22) with every column listed vertically.
spot_check_position = 22
df.iloc[spot_check_position, :]

airport_icao                   KORD
visibility                     10.0
wind_speed                        8
wind_direction                  300
rvr                          >6000 
runway_designator_number         27
runway_designator_side            R
runway_ils_category         CAT III
headwind                        6.9
crosswind                       4.0
ceiling                        -1.0
weather_phenomenon              NaN
ILS                               1
RNAV                              0
RNP                               0
VISUAL                            0
Name: 22, dtype: object

In [71]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3789 entries, 0 to 3788
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   airport_icao              3789 non-null   object 
 1   visibility                3789 non-null   float64
 2   wind_speed                3789 non-null   int64  
 3   wind_direction            3789 non-null   int64  
 4   rvr                       3789 non-null   object 
 5   runway_designator_number  3789 non-null   int64  
 6   runway_designator_side    3749 non-null   object 
 7   runway_ils_category       3789 non-null   object 
 8   headwind                  3789 non-null   float64
 9   crosswind                 3789 non-null   float64
 10  ceiling                   3785 non-null   float64
 11  weather_phenomenon        470 non-null    object 
 12  ILS                       3789 non-null   int64  
 13  RNAV                      3789 non-null   int64  
 14  RNP     

In [72]:
# Count missing values per column before any cleaning is applied.
df.isna().sum()

airport_icao                   0
visibility                     0
wind_speed                     0
wind_direction                 0
rvr                            0
runway_designator_number       0
runway_designator_side        40
runway_ils_category            0
headwind                       0
crosswind                      0
ceiling                        4
weather_phenomenon          3319
ILS                            0
RNAV                           0
RNP                            0
VISUAL                         0
dtype: int64

In [73]:
# Delete rows whose rvr value is the literal 'FFF'.
# The comparison is done on whitespace-stripped text because values in this
# column can carry padding (row 22 displays as '>6000 '), which an exact
# match against 'FFF' would miss. Missing or non-string rvr values compare
# as "not FFF" and are kept, as before.
rvr_is_fff = df['rvr'].str.strip() == 'FFF'
# .copy() makes the result an independent frame, so the column assignments
# in later cells do not write into a filtered slice (SettingWithCopyWarning).
df = df[~rvr_is_fff].copy()

In [74]:
# Replace missing weather_phenomenon entries with the literal string "None".
# df comes from a boolean row filter, and writing a column straight into such
# a frame can raise SettingWithCopyWarning; .assign() returns a new frame
# instead (the column keeps its position because it already exists).
# NOTE(review): recent pandas releases list 'None' among read_csv's default NA
# strings, so this placeholder may be read back as NaN from the saved CSV
# unless the reader passes keep_default_na=False -- confirm for the pandas
# version in use.
df = df.assign(weather_phenomenon=df['weather_phenomenon'].fillna('None'))

In [75]:
# Fill missing runway_designator_side values with 'X' (X means no L, R, or C).
# df comes from a boolean row filter, and writing a column straight into such
# a frame can raise SettingWithCopyWarning; .assign() returns a new frame
# instead (the column keeps its position because it already exists).
df = df.assign(runway_designator_side=df['runway_designator_side'].fillna('X'))

In [76]:
# Preview the first five rows again now that the cleaning steps have run.
df.head(n=5)

Unnamed: 0,airport_icao,visibility,wind_speed,wind_direction,rvr,runway_designator_number,runway_designator_side,runway_ils_category,headwind,crosswind,ceiling,weather_phenomenon,ILS,RNAV,RNP,VISUAL
0,KIAD,10.0,6,180,>6000,19,L,No ILS,5.9,1.0,-1.0,,0,0,0,1
1,KSFO,10.0,8,260,>6000,28,L,CAT III,7.5,2.7,-1.0,,1,0,0,0
2,KORD,10.0,5,310,>6000,28,R,CAT III,4.3,2.5,5000.0,,1,0,0,0
3,KSFO,10.0,5,260,>6000,28,L,CAT III,4.7,1.7,-1.0,,1,0,0,0
4,KSFO,10.0,5,260,>6000,28,L,CAT III,4.7,1.7,-1.0,,1,0,0,0


In [77]:
# Re-inspect the record at positional row 22 after cleaning.
spot_check_position = 22
df.iloc[spot_check_position, :]

airport_icao                   KORD
visibility                     10.0
wind_speed                        8
wind_direction                  300
rvr                          >6000 
runway_designator_number         27
runway_designator_side            R
runway_ils_category         CAT III
headwind                        6.9
crosswind                       4.0
ceiling                        -1.0
weather_phenomenon             None
ILS                               1
RNAV                              0
RNP                               0
VISUAL                            0
Name: 22, dtype: object

In [78]:
# Count missing values per column once the cleaning steps have run.
df.isna().sum()

airport_icao                0
visibility                  0
wind_speed                  0
wind_direction              0
rvr                         0
runway_designator_number    0
runway_designator_side      0
runway_ils_category         0
headwind                    0
crosswind                   0
ceiling                     4
weather_phenomenon          0
ILS                         0
RNAV                        0
RNP                         0
VISUAL                      0
dtype: int64

In [79]:
# Show every row that still has a missing value in at least one column.
has_missing_value = df.isna().any(axis='columns')
df.loc[has_missing_value]

Unnamed: 0,airport_icao,visibility,wind_speed,wind_direction,rvr,runway_designator_number,runway_designator_side,runway_ils_category,headwind,crosswind,ceiling,weather_phenomenon,ILS,RNAV,RNP,VISUAL
2503,KJFK,1.0,6,160,6000,22,L,CAT III,3.0,5.2,,BR,1,0,0,0
2504,KJFK,1.0,6,160,>6000,22,R,CAT III,3.0,5.2,,BR,1,0,0,0
3396,KJFK,1.0,12,130,3000▲,22,L,CAT III,0.0,12.0,,- DZ BR,1,0,0,0
3397,KJFK,1.0,12,130,2000▼,22,R,CAT III,0.0,12.0,,- DZ BR,1,0,0,0


In [80]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
quartiles = [0.25, 0.5, 0.75]
df.describe(percentiles=quartiles)

Unnamed: 0,visibility,wind_speed,wind_direction,runway_designator_number,headwind,crosswind,ceiling,ILS,RNAV,RNP,VISUAL
count,3776.0,3776.0,3776.0,3776.0,3776.0,3776.0,3772.0,3776.0,3776.0,3776.0,3776.0
mean,9.231263,10.588983,204.052436,21.245233,7.038639,5.865095,5631.824761,0.673994,0.0,0.0,0.326006
std,2.232963,5.891336,100.360592,9.484556,6.333738,4.75816,8453.511628,0.468812,0.0,0.0,0.468812
min,0.25,0.0,-1.0,1.0,-10.3,0.0,-1.0,0.0,0.0,0.0,0.0
25%,10.0,6.0,150.0,19.0,2.5,2.3,-1.0,0.0,0.0,0.0,0.0
50%,10.0,10.0,210.0,27.0,6.5,5.0,800.0,1.0,0.0,0.0,0.0
75%,10.0,15.0,290.0,28.0,10.725,8.5,7500.0,1.0,0.0,0.0,1.0
max,10.0,29.0,360.0,31.0,29.0,25.6,30000.0,1.0,0.0,0.0,1.0


In [81]:
# Write the cleaned frame to a semicolon-delimited CSV, leaving out the index.
preprocessed_csv_path = 'atis_dataset_preprocessed.csv'
df.to_csv(preprocessed_csv_path, sep=';', index=False)