In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [29]:
# Relative path of the input CSV; its file name is reused by the export cell
# (which splits this string on '/') to name the cleaned output file.
path = 'Training_imp_Features/imp_flights_Jan_1w.csv'

In [30]:
# Read the CSV at `path` into a DataFrame using pandas' default parsing options
df = pd.read_csv(path)

In [31]:
# Preview the first rows of the loaded frame to check column names and value formats
df.head()

Unnamed: 0,fl_date,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,crs_arr_time,arr_time,arr_delay,cancelled,dup,distance,weather_delay,late_aircraft_delay,icao24,manufacturername,built
0,2019-01-07,13495,13303,1631,1623.0,-8.0,1924,1908.0,-16.0,0.0,N,675.0,,,a56323,Embraer S A,2015-01-01
1,2018-01-02,10397,14683,925,924.0,-1.0,1105,1058.0,-7.0,0.0,N,874.0,,,a717b6,Boeing,2011-01-01
2,2018-01-08,13487,15016,1523,1520.0,-3.0,1707,1702.0,-5.0,0.0,N,448.0,,,a3bd81,Bombardier Inc,2002-01-01
3,2019-01-01,12889,11292,1945,2015.0,30.0,2230,2259.0,29.0,0.0,N,628.0,0.0,13.0,a53b69,Boeing,2003-01-01
4,2019-01-06,12982,12892,1420,1505.0,45.0,2140,2224.0,44.0,0.0,N,2615.0,0.0,44.0,a1a18f,Airbus,2017-01-01


In [32]:
# Boolean row mask: True where the flight was not cancelled AND 'dup' is 'N'
filters = df['cancelled'].eq(0) & df['dup'].eq('N')

In [33]:
# Keep only the rows selected by `filters` (no cancelled flights, no duplicate records),
# then discard the columns that are no longer needed: 'cancelled', 'dup', 'icao24'
df = df.loc[filters].drop(columns=['cancelled', 'dup', 'icao24'])

In [34]:
# Replace missing values in the delay columns with 0.
# Note (from the original author's check): a null 'dep_delay' means the aircraft departed on time.
for delay_col in ('dep_delay', 'weather_delay', 'late_aircraft_delay'):
    df[delay_col] = df[delay_col].fillna(0.0)

In [35]:
# Remove, in place, every row whose 'arr_time' is missing
df.dropna(axis=0, how='any', subset=['arr_time'], inplace=True)

In [36]:
# Fill null values in 'arr_delay' with the gap between actual and scheduled arrival.
# 'arr_time' and 'crs_arr_time' are clock times written as HHMM numbers (1058 means 10:58)
# while 'arr_delay' is expressed in minutes, so subtracting the raw values is wrong whenever
# the two times fall in different hours (1058 - 1105 = -47, but the real gap is -7 minutes).
def _hhmm_to_minutes(hhmm):
    """Convert HHMM-encoded clock times (scalar or Series) to minutes after midnight."""
    return (hhmm // 100) * 60 + hhmm % 100


arr_delay_estimate = _hhmm_to_minutes(df['arr_time']) - _hhmm_to_minutes(df['crs_arr_time'])
# The columns carry no date, so an arrival that crosses midnight shows up as a gap of
# roughly +/-24 hours. Wrap the estimate into [-720, 720) minutes, i.e. assume the true
# delay is within 12 hours of schedule (scheduled 23:50, arrived 00:10 -> +20 minutes).
arr_delay_estimate = (arr_delay_estimate + 720) % 1440 - 720
df['arr_delay'] = df['arr_delay'].fillna(arr_delay_estimate)

In [38]:
# Combine different spellings of the same manufacturer into a single brand name
manufactures = {
    'Embraer S A': 'Embraer',
    'Embraer-empresa Brasileira De': 'Embraer',
    'Mcdonnell Douglas Corporation': 'Mcdonnell Douglas',
    'Mcdonnell Douglas Aircraft Co': 'Mcdonnell Douglas',
    'Airbus Industrie': 'Airbus',
    'Boeing Of Canada/dehav Div': 'Boeing',
    'Saab-scania': 'Saab',
    'Gulfstream Aerospace Corp': 'Gulfstream Aerospace',
    'Bombardier Inc': 'Bombardier'
}

# Assign the result back to the column instead of calling replace(..., inplace=True) on
# df['manufacturername']: the chained in-place call operates on a temporary Series, which
# triggers a FutureWarning on recent pandas and leaves `df` unchanged under copy-on-write.
df['manufacturername'] = df['manufacturername'].replace(manufactures)

In [41]:
# One-hot encode 'manufacturername' into dense 'man_<name>' indicator columns.
# scikit-learn renamed the `sparse` argument to `sparse_output` (1.2, old name removed in 1.4)
# and replaced `get_feature_names` with `get_feature_names_out` (1.0, old name removed in 1.2),
# so both spellings are supported to keep the notebook runnable across versions.
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # older scikit-learn: `sparse_output` is not a known argument
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')

# The encoder expects a 2-D input, hence the reshape to a single column.
encoded = enc.fit_transform(df['manufacturername'].to_numpy().reshape(-1, 1))

if hasattr(enc, 'get_feature_names_out'):
    feature_names = enc.get_feature_names_out(['man'])
else:
    feature_names = enc.get_feature_names(['man'])

# `ohe_df` gets a fresh 0..n-1 index; rows are in the same order as `df`.
ohe_df = pd.DataFrame(encoded, columns=feature_names)
ohe_df.head()

Unnamed: 0,man_Aerospatiale/alenia,man_Air Tractor Inc,man_Airbus,man_American Champion Aircraft,man_Avions De Transport Regional,man_Bell Helicopter Textron Canada,man_Benham John,man_Boeing,man_Bombardier,man_Canadair,...,man_Mcdonnell Douglas,man_Mooney Aircraft Corp.,man_Pilatus,man_Piper,man_Quest Aircraft Company Llc,man_Robinson Helicopter Co,man_Saab,man_Socata,man_Tremble Jason M,man_nan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Attach the indicator columns to the flight rows. Both frames are renumbered 0..n-1
# (discarding the old index) so that the column-wise concat pairs rows by position.
df = pd.concat([df.reset_index(drop=True), ohe_df.reset_index(drop=True)], axis=1)
# The raw text column is replaced by its indicators.
df = df.drop(columns=['manufacturername'])
# 'man_nan' (the indicator for a missing manufacturer) only exists when the input had
# missing values, so drop it only if present instead of failing with a KeyError.
df = df.drop(columns=['man_nan'], errors='ignore')

In [43]:
# Derive 'aircraft_age' as (year of flight) - (year built), both taken from the first
# four characters of the date strings. Missing ages are replaced by the mean age
# truncated to an integer, and the column is stored as int.
flight_year = df['fl_date'].str.slice(stop=4).astype(float)
built_year = df['built'].str.slice(stop=4).astype(float)
raw_age = flight_year - built_year
mean_age = int(raw_age.mean())
df['aircraft_age'] = raw_age.fillna(mean_age).astype(int)

In [44]:
# 'built' has been folded into 'aircraft_age', so remove the column
df = df.drop(columns=['built'])

In [45]:
# Show the header followed by a blank line and the full list of columns left after cleaning
print('Columns after data cleaning\n', df.columns, sep='\n')

Columns after data cleaning

Index(['fl_date', 'origin_airport_id', 'dest_airport_id', 'crs_dep_time',
       'dep_time', 'dep_delay', 'crs_arr_time', 'arr_time', 'arr_delay',
       'distance', 'weather_delay', 'late_aircraft_delay',
       'man_Aerospatiale/alenia', 'man_Air Tractor Inc', 'man_Airbus',
       'man_American Champion Aircraft', 'man_Avions De Transport Regional',
       'man_Bell Helicopter Textron Canada', 'man_Benham John', 'man_Boeing',
       'man_Bombardier', 'man_Canadair', 'man_Cessna',
       'man_Cirrus Design Corp', 'man_Dassault', 'man_De Havilland Canada',
       'man_Diamond Aircraft Ind Gmbh', 'man_Eads/alenia Atr',
       'man_Ela Aviacion', 'man_Embraer', 'man_Grumman',
       'man_Gulfstream Aerospace', 'man_Mcdonnell Douglas',
       'man_Mooney Aircraft Corp.', 'man_Pilatus', 'man_Piper',
       'man_Quest Aircraft Company Llc', 'man_Robinson Helicopter Co',
       'man_Saab', 'man_Socata', 'man_Tremble Jason M', 'aircraft_age'],
      dtype='object'

In [46]:
# Final sanity check: report whether any cell in the cleaned frame is still null
has_nulls = df.isna().to_numpy().any()
print(f'\nIs there any null values now: {has_nulls}')


Is there any null values now: False


In [47]:
# Write the cleaned frame to the working directory as 'cleaned_<input file name>',
# without the index column
file_name = path.rsplit('/', 1)[-1]
df.to_csv(f'cleaned_{file_name}', index=False)