# Preprocessing of the dataset

In [1]:
# prepare
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Load & join the data

In [3]:
df = pd.read_csv('/content/drive/MyDrive/pdsp/testset_kaggle.csv')
df

Unnamed: 0,Orig,con1_arr,con1,con2_arr,con2,Dest,mkt_airline1,mkt_flight1,op_airline1,op_flight1,...,elaptime,detour,arrDay,stops,dmcc,nLegs,cluster,OnD,traffic_direction,id
0,SXF,CAI,CAI,,,MED,MS,732,MS,732,...,535,1.02676,3,1,,2,111,SXF-MED,inbound,1
1,SXF,CAI,CAI,,,MED,MS,732,MS,732,...,535,1.02676,5,1,,2,111,SXF-MED,inbound,2
2,SXF,CAI,CAI,,,MED,MS,732,MS,732,...,535,1.02676,6,1,,2,111,SXF-MED,inbound,3
3,SXF,CAI,CAI,,,MED,MS,732,MS,732,...,775,1.02676,6,1,,2,111,SXF-MED,inbound,4
4,SXF,CAI,CAI,,,MED,MS,732,MS,732,...,535,1.02676,7,1,,2,111,SXF-MED,inbound,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241520,YDF,YYZ,YYZ,,,FRA,AC,1703,AC,1703,...,1290,1.77380,3,1,C,2,1601,YDF-FRA,outbound,241521
241521,YUL,FRA,FRA,,,QDU,LH,6795,AC,874,...,686,1.06290,1,1,DY,2,1601,YUL-QDU,outbound,241522
241522,SFO,JFK,JFK,,,FRA,B6,1516,B6,1516,...,1541,1.13030,4,1,Y,2,1601,SFO-FRA,outbound,241523
241523,SFO,ORD,ORD,HEL,HEL,MUC,AA,2691,AA,2691,...,1430,1.23530,7,2,C,3,1601,SFO-MUC,outbound,241524


## 2. Clean the data

In [4]:
df.columns

Index(['Orig', 'con1_arr', 'con1', 'con2_arr', 'con2', 'Dest', 'mkt_airline1',
       'mkt_flight1', 'op_airline1', 'op_flight1', 'depDay1', 'd_time_1',
       'a_time_1', 'depTime_U1', 'arrTime_U1', 'acrt1', 'distSeg1',
       'mkt_airline2', 'mkt_flight2', 'op_airline2', 'op_flight2', 'depDay2',
       'd_time_2', 'a_time_2', 'depTime_U2', 'arrTime_U2', 'acrt2', 'distSeg2',
       'mkt_airline3', 'mkt_flight3', 'op_airline3', 'op_flight3', 'depDay3',
       'd_time_3', 'a_time_3', 'depTime_U3', 'arrTime_U3', 'acrt3', 'distSeg3',
       'dist', 'segn', 'week', 'depDay', 'deptime', 'deptime_U', 'arrtime',
       'arrtime_U', 'elaptime', 'detour', 'arrDay', 'stops', 'dmcc', 'nLegs',
       'cluster', 'OnD', 'traffic_direction', 'id'],
      dtype='object')

### STEP 2: DROP

[what to include]<br/>
1. Input
- user: 'Orig', 'Dest', (or 'cluster') 'depDay', 'arrDay',
- inside of a model: distSeg1', 'distSeg2', 'distSeg3', 'segn', 'elaptime'<br/>
  'detour': (distseg1+distseg2+distseg3)/dist)<br/>
  'market_share': Itinerary passenger share (based on OnD estimation total)
- little details that could be included or not: 'con1', 'con2', 'depDay2', 'depDay3',

2. output
- Routes overview: Provides statistics and map visualization about the itinerary contained in the data source (or a subset): 'real_dist'<br/>
- Market model: Estimations of traffic (passengers) for itinerary and market cluster, applying Transformer Graph Models/ML models: TOT_pax or paxe, cluster<br/>
  For any network itinerary entered in the app, the estimated traffic is provided.<br/>
  Based on the market model estimation, a ranking of all itinerary in the dataset per estimated traffic, with recommendations for creation or cancellation of routes.<br/>
  Per market cluster, the accuracy of the model is provided.
  - 'TOT_pax': Total passengers per OnD
  - 'paxe': Number (absolute) passenger
  - 'cluster': Market cluster
- little details: 'deptime', 'arrtime', 'stops'

[what to drop]<br/>
- 'con1_arr', 'con2_arr': connecting airport (mostly = arroval airport)
- 'mkt_airline1', 'mkt_flight1', 'op_airline1', 'op_flight1', 'acrt1', -- aircrafe type, etc.
- 'mkt_airline2', 'mkt_flight2', 'op_airline2', 'op_flight2', 'acrt2',
- 'mkt_airline3', 'mkt_flight3', 'op_airline3', 'op_flight3', 'acrt3',
- 'dmcc': compartment (economic, business)
- 'dist': will calculate the real distance by distSeg1 + ... / also 'detour' exists
- 'week': minor detail / also, can use depDay1
- 'nLegs', 'depDay1',: trivial -- 'segn', 'depDay', 'deptime'
- 'OnD': info provided by Orig and Dest column / also recommended by lecturers
- 'deptime_U', 'arrtime_U', 'depTime_U1', 'arrTime_U1', 'depTime_U2', 'arrTime_U2', 'depTime_U3', 'arrTime_U3': too much detail
- 'd_time_1', 'a_time_1', 'd_time_2', 'a_time_2', 'd_time_3', 'a_time_3': too much detail


In [5]:
# later for calculation: 'arrTime_U1', 'depTime_U2', 'arrTime_U2', 'depTime_U3',
df = df.drop(columns=['con1_arr', 'con2_arr', 'depDay1',
                      'mkt_airline1', 'mkt_flight1', 'op_airline1', 'op_flight1', 'acrt1',
                      'mkt_airline2', 'mkt_flight2', 'op_airline2', 'op_flight2', 'acrt2',
                      'mkt_airline3', 'mkt_flight3', 'op_airline3', 'op_flight3', 'acrt3',
                      'dmcc', 'dist', 'week', 'nLegs', 'OnD',
                      'd_time_1', 'a_time_1', 'd_time_2', 'a_time_2', 'd_time_3', 'a_time_3',
                      'deptime_U', 'arrtime_U', 'depTime_U1', 'arrTime_U3', 'traffic_direction']) # 'traffic_direction' wasn't presented

### STEP 3: DATA TYPE

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241525 entries, 0 to 241524
Data columns (total 23 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Orig        241525 non-null  object 
 1   con1        239625 non-null  object 
 2   con2        59099 non-null   object 
 3   Dest        241525 non-null  object 
 4   arrTime_U1  241525 non-null  object 
 5   distSeg1    241525 non-null  float64
 6   depDay2     181944 non-null  float64
 7   depTime_U2  239625 non-null  object 
 8   arrTime_U2  239625 non-null  object 
 9   distSeg2    241525 non-null  float64
 10  depDay3     1400 non-null    float64
 11  depTime_U3  59099 non-null   object 
 12  distSeg3    80018 non-null   float64
 13  segn        241525 non-null  int64  
 14  depDay      241525 non-null  int64  
 15  deptime     241525 non-null  object 
 16  arrtime     241525 non-null  object 
 17  elaptime    241525 non-null  int64  
 18  detour      241525 non-null  float64
 19  ar

In [7]:
# Data Type:
# convert the column to a datetime data type.
df['deptime'] = pd.to_datetime(df['deptime'], format='%H:%M:%S').dt.time
df['arrtime'] = pd.to_datetime(df['arrtime'], format='%H:%M:%S').dt.time

In [8]:
## skip becuase depTime_U3 only has time info

# # calculate the total waiting time for connection
# df_org['depTime_U2'] = pd.to_datetime(df_org['depTime_U2'])
# df_org['arrTime_U1'] = pd.to_datetime(df_org['arrTime_U1'])
# df_org['depTime_U3'] = pd.to_datetime(df_org['depTime_U3'])
# df_org['arrTime_U2'] = pd.to_datetime(df_org['arrTime_U2'])

# # time for connection
# df['con_time'] = (df_org['depTime_U2']- df_org['arrTime_U1']) + (df_org['depTime_U3'] - df_org['arrTime_U2'])

# # calculte the time in minutes
# def calculate_in_time(dateobj):
#   total = 0
#   if not pd.isnull(dateobj):
#     total += dateobj.days * 24
#     total += dateobj.seconds // 60
#   return total

# df['con_time'] = df['con_time'].apply(lambda x: calculate_in_time(x))
# df['con_time']
df = df.drop(columns=['arrTime_U1', 'depTime_U2', 'arrTime_U2', 'depTime_U3'])

In [9]:
# check the content
df.head()

Unnamed: 0,Orig,con1,con2,Dest,distSeg1,depDay2,distSeg2,depDay3,distSeg3,segn,depDay,deptime,arrtime,elaptime,detour,arrDay,stops,cluster,id
0,SXF,CAI,,MED,2872.51,2.0,1026.61,,,2,2,15:45:00,01:40:00,535,1.02676,3,1,111,1
1,SXF,CAI,,MED,2872.51,4.0,1026.61,,,2,4,15:45:00,01:40:00,535,1.02676,5,1,111,2
2,SXF,CAI,,MED,2872.51,5.0,1026.61,,,2,5,15:45:00,01:40:00,535,1.02676,6,1,111,3
3,SXF,CAI,,MED,2872.51,6.0,1026.61,,,2,5,15:45:00,05:40:00,775,1.02676,6,1,111,4
4,SXF,CAI,,MED,2872.51,6.0,1026.61,,,2,6,15:45:00,01:40:00,535,1.02676,7,1,111,5


### STEP 4: MISSING VALUES
1. depDay2, depDay3: Leave the null values as they are
- Tree-based algorithms like decision trees and random forests can work with missing data. You can indicate that a day of the week is missing by leaving it as NaN, and the algorithm will make decisions based on the available data.


2. Week related: depDay, depDay2, depDay3, arrDay
- not change yet. depend on model

In [10]:
# see missing values
df.isna().sum()

Orig             0
con1          1900
con2        182426
Dest             0
distSeg1         0
depDay2      59581
distSeg2         0
depDay3     240125
distSeg3    161507
segn             0
depDay           0
deptime          0
arrtime          0
elaptime         0
detour           0
arrDay           0
stops            0
cluster          0
id               0
dtype: int64

### STEP 5: column edition

In [11]:
# is_direct_flight
df['is_direct_flight'] = df['con1'].isnull().astype(int)

In [None]:
# cross_day flight
df['cross_day'] = (df['depDay'] != df['arrDay']).astype(int)

In [12]:
# see dataset again
df.head()

Unnamed: 0,Orig,con1,con2,Dest,distSeg1,depDay2,distSeg2,depDay3,distSeg3,segn,depDay,deptime,arrtime,elaptime,detour,arrDay,stops,cluster,id,is_direct_flight
0,SXF,CAI,,MED,2872.51,2.0,1026.61,,,2,2,15:45:00,01:40:00,535,1.02676,3,1,111,1,0
1,SXF,CAI,,MED,2872.51,4.0,1026.61,,,2,4,15:45:00,01:40:00,535,1.02676,5,1,111,2,0
2,SXF,CAI,,MED,2872.51,5.0,1026.61,,,2,5,15:45:00,01:40:00,535,1.02676,6,1,111,3,0
3,SXF,CAI,,MED,2872.51,6.0,1026.61,,,2,5,15:45:00,05:40:00,775,1.02676,6,1,111,4,0
4,SXF,CAI,,MED,2872.51,6.0,1026.61,,,2,6,15:45:00,01:40:00,535,1.02676,7,1,111,5,0


In [13]:
# performing one-hot encoding (stops)
df = pd.get_dummies(df, columns=['stops'], prefix='stops')

In [14]:
# performing one-hot encoding (segn)
df = pd.get_dummies(df, columns=['segn'], prefix='segn')

In [15]:
# Preparation: fill in nas with 0
df['distSeg1'] = df['distSeg1'].fillna(0)
df['distSeg2'] = df['distSeg2'].fillna(0)
df['distSeg3'] = df['distSeg3'].fillna(0)

In [16]:
# Calculate the actual distance for each trip
df['real_dist'] = df['distSeg1'] + df['distSeg2'] + df['distSeg3']
df = df.drop(columns=['distSeg1', 'distSeg2', 'distSeg3'])
df['real_dist']

0          3899.12
1          3899.12
2          3899.12
3          3899.12
4          3899.12
            ...   
241520     8143.00
241521     6034.81
241522    10336.84
241523    11651.70
241524    11512.87
Name: real_dist, Length: 241525, dtype: float64

In [17]:
# see missing value
df.isna().sum()

Orig                     0
con1                  1900
con2                182426
Dest                     0
depDay2              59581
depDay3             240125
depDay                   0
deptime                  0
arrtime                  0
elaptime                 0
detour                   0
arrDay                   0
cluster                  0
id                       0
is_direct_flight         0
stops_0                  0
stops_1                  0
stops_2                  0
segn_1                   0
segn_2                   0
segn_3                   0
real_dist                0
dtype: int64

In [19]:
# Label encoding of Orig, con1, con2, Dest
# Random Forests = scale invariant. do not assume higher weightage for large values & figure out how best to split up the numeric values to meet your objective
# https://datascience.stackexchange.com/questions/92647/encode-the-days-of-week-as-numeric-variable
# https://medium.com/aiskunks/categorical-data-encoding-techniques-d6296697a40f
# https://towardsdatascience.com/handling-categorical-data-the-right-way-9d1279956fc6
# -- we use rf, hence, big number doesn't matter.

# 1. mapping airports to countries
# Create a dictionary to map unique combinations of "OnD" to category numbers
airport = pd.read_csv('/content/drive/MyDrive/pdsp/airport-codes_csv.csv')
airport = airport[~pd.isnull(airport['iata_code'])][['iso_country', 'iata_code']].reset_index(drop=True)
air_cou_mapping = {d['iata_code']: d['iso_country'] for d in airport.to_dict(orient='records')}
df[['Orig', 'con1', 'con2', 'Dest']] = df[['Orig', 'con1', 'con2', 'Dest']].replace(air_cou_mapping)
df

Unnamed: 0,Orig,con1,con2,Dest,depDay2,depDay3,depDay,deptime,arrtime,elaptime,...,cluster,id,is_direct_flight,stops_0,stops_1,stops_2,segn_1,segn_2,segn_3,real_dist
0,DE,EG,,SA,2.0,,2,15:45:00,01:40:00,535,...,111,1,0,0,1,0,0,1,0,3899.12
1,DE,EG,,SA,4.0,,4,15:45:00,01:40:00,535,...,111,2,0,0,1,0,0,1,0,3899.12
2,DE,EG,,SA,5.0,,5,15:45:00,01:40:00,535,...,111,3,0,0,1,0,0,1,0,3899.12
3,DE,EG,,SA,6.0,,5,15:45:00,05:40:00,775,...,111,4,0,0,1,0,0,1,0,3899.12
4,DE,EG,,SA,6.0,,6,15:45:00,01:40:00,535,...,111,5,0,0,1,0,0,1,0,3899.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241520,CA,CA,,DE,2.0,,2,06:00:00,08:00:00,1290,...,1601,241521,0,0,1,0,0,1,0,8143.00
241521,CA,DE,,QDU,1.0,,7,19:05:00,12:31:00,686,...,1601,241522,0,0,1,0,0,1,0,6034.81
241522,US,US,,DE,3.0,,2,23:59:00,10:40:00,1541,...,1601,241523,0,0,1,0,0,1,0,10336.84
241523,US,US,FI,DE,6.0,7.0,6,09:00:00,17:50:00,1430,...,1601,241524,0,0,0,1,0,0,1,11651.70


In [20]:
# 2. countries to label
countries_lst = list(air_cou_mapping.values())
countries_lst = [x for i, x in enumerate(countries_lst) if x not in countries_lst[:i]]
# print(countries_lst)
countries_to_num = {v:i for i,v in enumerate(countries_lst)}
# print(countries_to_num)
df[['Orig', 'con1', 'con2', 'Dest']] = df[['Orig', 'con1', 'con2', 'Dest']].replace(countries_to_num)
df.head()

Unnamed: 0,Orig,con1,con2,Dest,depDay2,depDay3,depDay,deptime,arrtime,elaptime,...,cluster,id,is_direct_flight,stops_0,stops_1,stops_2,segn_1,segn_2,segn_3,real_dist
0,26,64,103,181,2.0,,2,15:45:00,01:40:00,535,...,111,1,0,0,1,0,0,1,0,3899.12
1,26,64,103,181,4.0,,4,15:45:00,01:40:00,535,...,111,2,0,0,1,0,0,1,0,3899.12
2,26,64,103,181,5.0,,5,15:45:00,01:40:00,535,...,111,3,0,0,1,0,0,1,0,3899.12
3,26,64,103,181,6.0,,5,15:45:00,05:40:00,775,...,111,4,0,0,1,0,0,1,0,3899.12
4,26,64,103,181,6.0,,6,15:45:00,01:40:00,535,...,111,5,0,0,1,0,0,1,0,3899.12


In [21]:
# check for rows which are not converted
indexes = []
for idx, row in df.iterrows():
  try:
    if type(row['Orig']) != int or type(row['con1']) != int or type(row['con2']) != int or type(row['Dest']) != int:
      indexes.append(idx)
  except TypeError:
    continue

df.loc[indexes]

Unnamed: 0,Orig,con1,con2,Dest,depDay2,depDay3,depDay,deptime,arrtime,elaptime,...,cluster,id,is_direct_flight,stops_0,stops_1,stops_2,segn_1,segn_2,segn_3,real_dist
2354,26,6,103,ZVJ,6.0,,5,22:35:00,12:10:00,695,...,111,2355,0,0,1,0,0,1,0,4696.18
2366,26,6,103,ZVJ,7.0,,6,12:00:00,04:15:00,855,...,111,2367,0,0,1,0,0,1,0,4974.90
2367,26,6,103,ZVJ,7.0,,6,15:15:00,04:15:00,660,...,111,2368,0,0,1,0,0,1,0,4974.90
22567,138,145,103,LZS,1.0,,1,14:50:00,18:44:00,294,...,806,22568,0,0,1,0,0,1,0,1447.48
22568,138,145,103,LZS,7.0,,7,14:50:00,19:44:00,354,...,806,22569,0,0,1,0,0,1,0,1447.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241380,41,26,103,QDU,4.0,,3,16:20:00,14:36:00,796,...,1601,241381,0,0,1,0,0,1,0,8243.14
241382,41,26,103,QDU,4.0,,3,13:25:00,10:46:00,741,...,1601,241383,0,0,1,0,0,1,0,8243.14
241479,1,26,103,QDU,7.0,,6,16:55:00,12:13:00,798,...,1601,241480,0,0,1,0,0,1,0,7942.52
241481,1,26,103,QDU,2.0,,1,16:55:00,10:46:00,711,...,1601,241482,0,0,1,0,0,1,0,7942.52


In [22]:
print(df.loc[indexes].Orig.unique())
print(df.loc[indexes].con1.unique())
print(df.loc[indexes].con2.unique())
print(df.loc[indexes].Dest.unique())

[26 138 47 'ZFV' 'XDS' 1 41 'QRH' 130]
[  6 145   1  41  26  60   8]
[103  70  65  60  98]
['ZVJ' 'LZS' 26 'ZAQ' 'QDU' 'XNB' 44 'RZG' 'SAJ']


In [25]:
countries_to_num['BD']

131

In [26]:
# manually mapping
extra_mapping = {'ZFV': 1, 'QRH': 70, 'ZVJ': 6, 'LZS': 145, 'ZAQ': 26, 'QDU': 26, 'XNB': 6, 'RZG': 8, 'SAJ': 131}
cols = ['Orig', 'con1', 'con2', 'Dest']
for col in cols:
  df[col] = df[col].apply(lambda x: extra_mapping[x] if x in extra_mapping else x)

# check whether they're mapped correctly
df.loc[indexes]

Unnamed: 0,Orig,con1,con2,Dest,depDay2,depDay3,depDay,deptime,arrtime,elaptime,...,cluster,id,is_direct_flight,stops_0,stops_1,stops_2,segn_1,segn_2,segn_3,real_dist
2354,26,6,103,6,6.0,,5,22:35:00,12:10:00,695,...,111,2355,0,0,1,0,0,1,0,4696.18
2366,26,6,103,6,7.0,,6,12:00:00,04:15:00,855,...,111,2367,0,0,1,0,0,1,0,4974.90
2367,26,6,103,6,7.0,,6,15:15:00,04:15:00,660,...,111,2368,0,0,1,0,0,1,0,4974.90
22567,138,145,103,145,1.0,,1,14:50:00,18:44:00,294,...,806,22568,0,0,1,0,0,1,0,1447.48
22568,138,145,103,145,7.0,,7,14:50:00,19:44:00,354,...,806,22569,0,0,1,0,0,1,0,1447.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241380,41,26,103,26,4.0,,3,16:20:00,14:36:00,796,...,1601,241381,0,0,1,0,0,1,0,8243.14
241382,41,26,103,26,4.0,,3,13:25:00,10:46:00,741,...,1601,241383,0,0,1,0,0,1,0,8243.14
241479,1,26,103,26,7.0,,6,16:55:00,12:13:00,798,...,1601,241480,0,0,1,0,0,1,0,7942.52
241481,1,26,103,26,2.0,,1,16:55:00,10:46:00,711,...,1601,241482,0,0,1,0,0,1,0,7942.52


## 3. Filter data

In [27]:
# preprocess time further
# Departure time
# Extract dep_hour
df['dep_hour'] = [time_obj.hour for time_obj in df['deptime']]
df['dep_min'] = [time_obj.minute for time_obj in df['deptime']]

# drop
df.drop(["deptime"], axis=1, inplace=True)

# Arrival time
# Extract arr_hour
df['arr_hour'] = [time_obj.hour for time_obj in df['arrtime']]
df['arr_min'] = [time_obj.minute for time_obj in df['arrtime']]

# drop
df.drop(["arrtime"], axis=1, inplace=True)

In [28]:
# show df fully

# Configure pandas to display all columns
pd.set_option('display.max_columns', None)

# Print the entire DataFrame
print(df)

       Orig  con1  con2  Dest  depDay2  depDay3  depDay  elaptime   detour  \
0        26    64   103   181      2.0      NaN       2       535  1.02676   
1        26    64   103   181      4.0      NaN       4       535  1.02676   
2        26    64   103   181      5.0      NaN       5       535  1.02676   
3        26    64   103   181      6.0      NaN       5       775  1.02676   
4        26    64   103   181      6.0      NaN       6       535  1.02676   
...     ...   ...   ...   ...      ...      ...     ...       ...      ...   
241520   41    41   103    26      2.0      NaN       2      1290  1.77380   
241521   41    26   103    26      1.0      NaN       7       686  1.06290   
241522    1     1   103    26      3.0      NaN       2      1541  1.13030   
241523    1     1    63    26      6.0      7.0       6      1430  1.23530   
241524    1    45   103    26      4.0      NaN       4      1000  1.48380   

        arrDay  cluster      id  is_direct_flight  stops_0  sto

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241525 entries, 0 to 241524
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Orig              241525 non-null  object 
 1   con1              241525 non-null  int64  
 2   con2              241525 non-null  int64  
 3   Dest              241525 non-null  int64  
 4   depDay2           181944 non-null  float64
 5   depDay3           1400 non-null    float64
 6   depDay            241525 non-null  int64  
 7   elaptime          241525 non-null  int64  
 8   detour            241525 non-null  float64
 9   arrDay            241525 non-null  int64  
 10  cluster           241525 non-null  int64  
 11  id                241525 non-null  int64  
 12  is_direct_flight  241525 non-null  int64  
 13  stops_0           241525 non-null  uint8  
 14  stops_1           241525 non-null  uint8  
 15  stops_2           241525 non-null  uint8  
 16  segn_1            24

In [30]:
df.shape

(241525, 24)

## 4. Engineer features

### STEP 6: feature selection

In [31]:
df.columns

Index(['Orig', 'con1', 'con2', 'Dest', 'depDay2', 'depDay3', 'depDay',
       'elaptime', 'detour', 'arrDay', 'cluster', 'id', 'is_direct_flight',
       'stops_0', 'stops_1', 'stops_2', 'segn_1', 'segn_2', 'segn_3',
       'real_dist', 'dep_hour', 'dep_min', 'arr_hour', 'arr_min'],
      dtype='object')

In [33]:
# 0-1 boolean
df['stops_0'] = df['stops_0'].replace({True: 1, False: 0})
df['stops_1'] = df['stops_1'].replace({True: 1, False: 0})
df['stops_2'] = df['stops_2'].replace({True: 1, False: 0})
df['segn_1'] = df['segn_1'].replace({True: 1, False: 0})
df['segn_2'] = df['segn_2'].replace({True: 1, False: 0})
df['segn_3'] = df['segn_3'].replace({True: 1, False: 0})

In [34]:
# modified version
df.to_csv('/content/drive/MyDrive/pdsp/testset_kaggle_prep.csv')