In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [6]:
def clean_data(path, tvt, csv):
    """Clean one raw flights CSV and write the cleaned copy beside it.

    Reads ``path + csv``, removes cancelled and duplicate records, fills or
    derives missing delay values, normalises and one-hot encodes the
    manufacturer name, derives ``aircraft_age``, dummy-encodes ``type`` and
    writes the result to ``f'{path}finalcleaned_{csv}'`` without the index.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to the file name for both the input and
        the output, so a directory must carry its trailing separator.
    tvt : str
        Not used by the function; retained so existing calls keep working.
    csv : str
        File name of the raw CSV to clean.

    Returns
    -------
    None
        The cleaned frame is written to disk and a one-line null check is
        printed.
    """
    df = pd.read_csv(path + csv)

    # Keep flights that were not cancelled and are not flagged as duplicates,
    # then discard the columns that are no longer needed.
    keep = (df['cancelled'] == 0) & (df['dup'] == 'N')
    df = df[keep].drop(['cancelled', 'dup', 'icao24'], axis=1)

    # A missing delay is treated as "no delay" (original author's note: a
    # null dep_delay means the aircraft departed on time).
    delay_cols = ['dep_delay', 'weather_delay', 'late_aircraft_delay']
    df[delay_cols] = df[delay_cols].fillna(0.0)

    # Rows without an arrival time are dropped. Resetting the index here
    # gives a fresh frame with a clean RangeIndex for the concat below.
    df = df.dropna(subset=['arr_time']).reset_index(drop=True)

    # Missing arrival delays are derived as arr_time - crs_arr_time.
    # NOTE(review): if these columns hold hhmm clock values, the difference
    # is not a number of minutes across an hour boundary - confirm the units.
    df['arr_delay'] = df['arr_delay'].fillna(df['arr_time'] - df['crs_arr_time'])

    # Collapse different spellings of the same manufacturer into one label.
    manufacturer_aliases = {
        'Embraer S A': 'Embraer',
        'Embraer-empresa Brasileira De': 'Embraer',
        'Mcdonnell Douglas Corporation': 'Mcdonnell Douglas',
        'Mcdonnell Douglas Aircraft Co': 'Mcdonnell Douglas',
        'Airbus Industrie': 'Airbus',
        'Boeing Of Canada/dehav Div': 'Boeing',
        'Saab-scania': 'Saab',
        'Gulfstream Aerospace Corp': 'Gulfstream Aerospace',
        'Bombardier Inc': 'Bombardier'
    }
    # Assign the result back rather than calling replace(inplace=True) on the
    # column selection: the in-place form acts on a temporary object and is a
    # no-op once pandas copy-on-write is enabled.
    df['manufacturername'] = df['manufacturername'].replace(manufacturer_aliases)

    # One-hot encode the manufacturer into float 'man_<name>' columns.
    # get_dummies leaves rows with a missing manufacturer as all zeros and
    # creates no 'man_nan' column, so nothing has to be dropped afterwards
    # (dropping a non-existent 'man_nan' used to raise KeyError for files
    # without missing manufacturers).
    # NOTE(review): the columns depend on the manufacturers present in this
    # particular file, so different files can end up with different columns.
    man_dummies = pd.get_dummies(df['manufacturername'], prefix='man', dtype=float)
    df = pd.concat([df.drop('manufacturername', axis=1), man_dummies], axis=1)

    # aircraft_age = flight year - build year, both taken from the first
    # four characters of the respective string column. Missing ages are
    # filled with the mean age truncated to an integer.
    flight_year = df['fl_date'].str[:4].astype(float)
    built_year = df['built'].str[:4].astype(float)
    age = flight_year - built_year
    df['aircraft_age'] = age.fillna(int(age.mean())).astype(int)
    df = df.drop(['built'], axis=1)

    # Dummy-encode 'type', dropping the first level as the baseline.
    df = pd.get_dummies(df, columns=["type"], drop_first=True)

    print(f'\nIs there any null values now: {df.isnull().any().any()}')
    df.to_csv(f'{path}finalcleaned_{csv}', index=False)

In [7]:
# Folder prefix for the raw CSVs; it is concatenated directly with the file
# name, so the trailing slash is part of the value.
path = 'Training/'

# Labels passed to clean_data as its `tvt` argument.
train, test = 'train_', 'test_'

# Raw flight CSV file names processed by the cleaning loops.
files = [
    'finalraw_flights_Dec_Jan.csv',
    'finalraw_flights_Dec.csv',
    'finalraw_flights_Jan_1w.csv',
    'finalraw_flights_Jan_2w.csv',
    'finalraw_flights_Jan.csv',
]
# Individual names kept available alongside the list.
file1, file2, file3, file4, file5 = files

In [8]:
# Run the cleaning step on every raw file under the current `path`.
for raw_csv in files:
    clean_data(path, train, raw_csv)




Is there any null values now: False





Is there any null values now: False





Is there any null values now: False





Is there any null values now: False





Is there any null values now: False


In [9]:
# Point the shared `path` at the validation folder and clean the same files.
# NOTE: this rebinds the notebook-global `path`; any cell that reads `path`
# and is run after this one will see 'Validation/'.
path = 'Validation/'
for raw_csv in files:
    clean_data(path, test, raw_csv)




Is there any null values now: False





Is there any null values now: False





Is there any null values now: False





Is there any null values now: False





Is there any null values now: False
