# Exploring Air Traffic Data of Germany

## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from geopy.distance import geodesic 

from IPython.display import display

pd.options.display.max_columns = None
pd.options.display.max_rows = 1000

## Import data

In [2]:
path = '../data/'

In [3]:
df_raw = pd.read_csv(path + 'flight_data_de.csv')

## Explore data: some descriptive stats

### General

In [4]:
df = df_raw.iloc[:, 1:]

In [5]:
df.head(20)

Unnamed: 0,fr_country,fr_airport,to_country,to_airport,month,flight_d,seat_d,passenger_d
0,DE,EDDB,BE,EBBR,2019-11-01,47.0,8166.0,7306.0
1,DE,EDDB,BG,LBSF,2019-11-01,26.0,4914.0,4406.0
2,DE,EDDB,CH,LSGG,2019-11-01,50.0,7896.0,6586.0
3,DE,EDDB,CH,LSZM,2019-11-01,45.0,7372.0,6341.0
4,DE,EDDB,DE,EDDK,2019-11-01,8.0,64.0,14.0
5,DE,EDDB,DK,EKCH,2019-11-01,62.0,10473.0,8753.0
6,DE,EDDB,EL,LGAV,2019-11-01,28.0,5122.0,4159.0
7,DE,EDDB,EL,LGTS,2019-11-01,27.0,4968.0,3946.0
8,DE,EDDB,ES,GCTS,2019-11-01,23.0,4308.0,3921.0
9,DE,EDDB,ES,LEBL,2019-11-01,96.0,17222.0,15668.0


In [6]:
df.describe()

Unnamed: 0,flight_d,seat_d,passenger_d
count,161011.0,161011.0,161011.0
mean,84.296725,12885.603592,9569.505164
std,100.45024,15255.707276,11410.283864
min,0.0,0.0,0.0
25%,22.0,4346.0,3214.0
50%,52.0,8105.0,5923.0
75%,112.0,15104.5,11045.0
max,824.0,148928.0,116241.0


In [7]:
cols = ['util_rate', 'num_flights', 'seats', 'passengers', 'flight']

### What to analyze

<b>KPIs:</b>

- [ ] number of flights (sum, mean) // number of passengers
- [x] utilization (passengers /seats)

<b>Along the following dimensions:</b>
- [ ] per connection
- [ ] per airport
- [ ] outgoing (from DE)
- [ ] incoming (to DE)




### Analysis

#### Calculate utilization rate `util_rate` (passengers / seats)

In [8]:
df = df.assign(util_rate = df['passengers'] / df['seats'])

KeyError: 'passengers'

In [None]:
df['util_rate'].describe()

In [None]:
df['util_rate'].hist();

In [None]:
df.head()

In [None]:
df[cols].isna().sum()

replace NaN values in `util_rate`

In [None]:
df['util_rate'] = df['util_rate'].fillna(0)

#### Calculate `category`: international or national flight

In [None]:
df = df.assign(domestic=(df['fr_country'] == df['to_country']).astype(int)) 

#### Plot distributions

In [None]:
cols

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(9,16))


i = 0
for ax in axes.flatten():
    sns.distplot(df[cols[i]], ax=ax);
    i+=1

# fig.show()

In [None]:
cols = ['date', 'flight', 'util_rate', 'num_flights', 'seats', 'passengers']

df[cols]

#### Aggregate by month: `df_month`

In [None]:
df_month = df[cols].groupby('date').agg({
    'util_rate': 'mean',
    'num_flights': 'sum',
    'seats': 'sum',
    'passengers': 'sum',
})

In [None]:
df_month

In [None]:
df_month = df_month.reset_index()
df_month['date'] = pd.to_datetime(df_month['date'], format='%Y-%m-%d')

In [None]:
df_month.info()

#### Deep-Dive: 2017-2018

In [None]:
df_month['date'][0].year

In [None]:
# df_grouped[df_grouped['date'].between(pd.to_datetime('2018-01-01', format='%Y-%m-%d'),pd.to_datetime('2017-01-01', format='%Y-%m-%d'))]

df_month[df_month['date'].dt.year.between(2017, 2018)]



In [None]:
fig, axes = plt.subplots(4, 1, figsize=(12,14))

start_year = 2018
end_year = 2018

sns.lineplot(data=df_month[df_month['date'].dt.year.between(start_year, end_year)], x='date', y='num_flights', ax=axes[0]);
sns.lineplot(data=df_month[df_month['date'].dt.year.between(start_year, end_year)], x='date', y='seats', ax=axes[1]);
sns.lineplot(data=df_month[df_month['date'].dt.year.between(start_year, end_year)], x='date', y='passengers', ax=axes[2]);
sns.lineplot(data=df_month[df_month['date'].dt.year.between(start_year, end_year)], x='date', y='util_rate', ax=axes[3]);



# df[['util_rate']].plot(ax = axes[0,1])

# i = 0
# for ax in axes.flatten():
#     sns.lineplot(df_grouped, 
#                  # x='date',
#                  y=cols[i],
#                  ax=ax);
#     i+=1

#### Descriptive Statistics by Flight Connection

In [None]:
df.head(3)

In [None]:
cols

In [None]:
df_flight = df[cols].groupby('flight').agg({
    'util_rate': 'mean',
    'num_flights': 'sum',
    'seats': 'sum',
    'passengers': 'sum',
})

In [None]:
df_flight.sort_values(by='num_flights', ascending=False)

In [None]:
df.to_airport.unique()

#### Get geocode information

In [None]:
df_geo = pd.read_csv(path + 'world_airports.csv')

In [None]:
df_geo.head(500)

In [None]:
icao1 = 'AYGA'
df_geo.loc[df_geo['icao'] == icao1, 'latitude'].iloc[0]

In [None]:
df_geo.loc[df_geo['icao'] == icao1, 'latitude'].iloc[0]

In [None]:
def get_distance(icao1, icao2):

    try:
        coord1 = df_geo.loc[df_geo['icao'] == icao1, 'latitude'].iloc[0], df_geo.loc[df_geo['icao'] == icao1, 'longitude'].iloc[0]
        coord2 = df_geo.loc[df_geo['icao'] == icao2, 'latitude'].iloc[0], df_geo.loc[df_geo['icao'] == icao2, 'longitude'].iloc[0]
        dist = geodesic(coord1, coord2).km

    except:
        dist = 0
        print(icao1 + icao2)

    return dist


In [None]:
get_distance('AYGA', 'AYWK')

In [None]:
df_test = df.head(20).copy()

In [None]:
df_test

In [None]:
# df_test['distance'] = df_test.apply(lambda x: get_distance(x['fr_airport'], x['to_airport']), axis=1)

In [None]:
# df['distance'] = df.apply(lambda x: get_distance(x['fr_airport'], x['to_airport']), axis=1)

In [None]:
missing_airports = ['LSZM', 'LYPR', 'DTNZ', 'FAJS', 'HECA', 'GMAD']

In [None]:
df_geo[df_geo['icao'].isin(missing_airports)]['airport_id'].count()

There are no entries in `df_geo` for the airports listed in `missing_airports`

Check how many flights are affected by the missing airports:

In [None]:
len(df[df['to_airport'].isin(missing_airports)])

In [None]:
len(df)

In [None]:
len(df[df['to_airport'].isin(missing_airports)]) / len(df)

Around 1% of flights are affected by the missing airports, so we will drop those rows for now...

<b>TODO: There are more airports with missing lat-long data than only those listed above...</b>

In [None]:
df = df[~df['to_airport'].isin(missing_airports)]

In [None]:
df_geo[df_geo['icao'].eq('EDDK')]

Get unique list of connections to pass into `get_distance` function

In [None]:
df.head(3)

Create a dataframe with unique flight connections to calculate distances, otherwise it takes too long...

In [None]:
df_unique_conn = df.drop_duplicates(subset='flight').copy()

In [None]:
df_unique_conn.shape

In [None]:
df_unique_conn['distance'] = df_unique_conn.apply(lambda x: get_distance(x['fr_airport'], x['to_airport']), axis=1)

In [None]:
df_unique_conn.head()

Merge distances back to df

In [None]:
df_with_dist = df.merge(df_unique_conn[['flight', 'distance']], how='left', on='flight')

In [None]:
df_with_dist.head(50)

In [None]:
df_with_dist.describe()

Note that there are flights with distance = 0. In these cases, the `to_airport` could not be found in `df_geo` and therefore the distance could not be calculated.

Drop rows with distance = 0

In [None]:
df_with_dist = df_with_dist[df_with_dist['distance'] > 0]

In [None]:
df_with_dist.to_csv(path + 'flight_data_de_with_distances.csv')

#### testing random stuff...

In [None]:
df_test = df.copy()

In [None]:
df_test['lat'] = 45
df_test['long'] = 8

In [None]:
df_test.head()

In [None]:
for index, row in df_test[['lat', 'long']].head(10).iterrows():
    coordinate = row['lat'], row['long']
    print(coordinate)
    print(type(coordinate))
    print()

#### Create df with distances

In [9]:
df_dist = pd.read_csv(path + 'flight_data_de_with_distances.csv')

In [11]:
df_dist = df_dist.iloc[:, 1:]

In [12]:
df_dist.head()

Unnamed: 0,flight,fr_country,fr_airport,to_country,to_airport,date,num_flights,seats,passengers,util_rate,domestic,distance
0,DE_EDDB_BE_EBBR,DE,EDDB,BE,EBBR,2019-11-01,93.0,16309.0,14512.0,0.889815,0,646.455762
1,DE_EDDB_BG_LBSF,DE,EDDB,BG,LBSF,2019-11-01,52.0,9828.0,8940.0,0.909646,0,1306.35829
2,DE_EDDB_CH_LSGG,DE,EDDB,CH,LSGG,2019-11-01,104.0,15845.0,13302.0,0.839508,0,869.238264
3,DE_EDDB_DE_EDDK,DE,EDDB,DE,EDDK,2019-11-01,19.0,413.0,280.0,0.677966,1,472.64616
4,DE_EDDB_DK_EKCH,DE,EDDB,DK,EKCH,2019-11-01,126.0,20975.0,17203.0,0.820167,0,364.838246


In [13]:
df_dist_2 = df_dist.drop_duplicates(subset='flight')

In [14]:
df_dist_2.head()

Unnamed: 0,flight,fr_country,fr_airport,to_country,to_airport,date,num_flights,seats,passengers,util_rate,domestic,distance
0,DE_EDDB_BE_EBBR,DE,EDDB,BE,EBBR,2019-11-01,93.0,16309.0,14512.0,0.889815,0,646.455762
1,DE_EDDB_BG_LBSF,DE,EDDB,BG,LBSF,2019-11-01,52.0,9828.0,8940.0,0.909646,0,1306.35829
2,DE_EDDB_CH_LSGG,DE,EDDB,CH,LSGG,2019-11-01,104.0,15845.0,13302.0,0.839508,0,869.238264
3,DE_EDDB_DE_EDDK,DE,EDDB,DE,EDDK,2019-11-01,19.0,413.0,280.0,0.677966,1,472.64616
4,DE_EDDB_DK_EKCH,DE,EDDB,DK,EKCH,2019-11-01,126.0,20975.0,17203.0,0.820167,0,364.838246


In [15]:
df_dist_3 = df_dist_2[['flight', 'distance']].copy()

In [16]:
df_dist_3.to_csv(path + 'flight_data_de_with_distances_v2.csv')