# Exercise 2

Top 10 arrival airports in the world in 2013 (using the bookings file)

## Load libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# File locations used throughout the notebook (relative to the notebook's folder).
# Raw bookings file, read in chunks with "^" as the field delimiter.
path_bookings = "../Data/bookings.csv.bz2"
# De-duplicated, whitespace-stripped copy of the bookings written by the cleaning section.
path_cleaned_bookings = "../Data/cleaned_bookings.csv.bz2"
# Passenger totals per arrival airport, sorted in descending order.
path_grouped_bookings = "../Data/grouped_bookings.csv"

## Load raw data in chunks

As the file is too big (529 MB) to fit in the memory of my computer, I read it in chunks.

In [3]:
# With `chunksize` set, read_csv returns an iterator that yields DataFrames of
# up to 500000 rows each instead of loading the whole file at once.
bookings_iter = pd.read_csv(
    path_bookings, sep="^", chunksize=500000, compression='bz2', engine='c'
)

## Delete duplicates (only needed the first time)

In [4]:
# Build the de-duplicated bookings incrementally: duplicates are dropped after
# every chunk so the accumulated frame never holds more than the unique rows
# seen so far plus one chunk.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
cleaned_bookings = None

for bookings_chunk in bookings_iter:
    unique_chunk = bookings_chunk.drop_duplicates()
    if cleaned_bookings is None:
        # First chunk: nothing to merge with yet.
        cleaned_bookings = unique_chunk
    else:
        cleaned_bookings = (
            pd.concat([cleaned_bookings, unique_chunk])
            .drop_duplicates()
        )

if cleaned_bookings is None:
    # The reader yielded no chunks; keep the name bound to an empty frame.
    cleaned_bookings = pd.DataFrame()

In [5]:
# Number of rows left after dropping exact duplicates.
len(cleaned_bookings)

1016377

In [6]:
def _strip_text_values(col):
    """Return `col` with surrounding whitespace removed from its string values.

    Only object-dtype columns are touched. Values that are not `str`
    (numbers, NaN, ...) are returned unchanged; `col.str.strip()` would
    replace them with NaN instead.
    """
    if col.dtype != "object":
        return col
    return col.map(lambda value: value.strip() if isinstance(value, str) else value)


# Strip padding from text values, let pandas pick nullable dtypes, and drop the
# rows that only differed by whitespace.
cleaned_bookings = (
    cleaned_bookings
    .apply(_strip_text_values)
    .convert_dtypes()
    .drop_duplicates()
)

In [7]:
# Row count after stripping whitespace and de-duplicating again.
len(cleaned_bookings)

1016377

In [8]:
# Persist the cleaned bookings with whitespace-free column names, using the
# same "^" delimiter as the raw file and no index column.
cleaned_bookings.rename(str.strip, axis="columns").to_csv(
    path_cleaned_bookings,
    index=False,
    sep="^",
)

## Cleaned data at once

**Action plan**
1. Load cleaned data in one big chunk
2. Fill NaNs with 0
3. Convert columns to more suitable dtypes
4. Group by *airport*
5. Sum number of passengers
6. Sort by number of passengers
7. Get top 10 airports

### Step 1: Load cleaned data

In [9]:
# Read only the two columns needed for the ranking from the cleaned file;
# `pax` is parsed as nullable Int64.
bookings = pd.read_csv(
    path_cleaned_bookings,
    sep="^",
    usecols=['arr_port', 'pax'],
    dtype={'pax': 'Int64'},
    compression='bz2',
    engine='c',
)

### Steps 2-6

In [10]:
# Total passengers per arrival airport, largest first.
# Uses `bookings`, the frame loaded from the cleaned file in Step 1, so this
# section does not depend on the de-duplication cells having been run in the
# current kernel session.
passengers_per_airport = (
    bookings
    .fillna({'pax': 0})          # missing passenger counts contribute nothing
    .convert_dtypes()
    .groupby("arr_port")
    .agg(count=("pax", "sum"))   # "sum" alias instead of the np.sum callable
    .sort_values('count', ascending=False)
    .reset_index()
)

In [11]:
# Save the per-airport totals ("^"-delimited, without the index column).
passengers_per_airport.to_csv(
    path_grouped_bookings,
    index=False,
    sep="^",
)

### Step 7: Get top 10 airports

In [12]:
# Ten busiest arrival airports, with presentation-friendly column names.
top10_styler = (
    passengers_per_airport
    .rename(columns={'arr_port': 'Airport', 'count': 'Total passengers'})
    .head(10)
    .style
)

# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in favour
# of Styler.hide(axis="index"); fall back to the old method on older versions.
if hasattr(top10_styler, "hide"):
    top10_styler = top10_styler.hide(axis="index")
else:
    top10_styler = top10_styler.hide_index()

top10_styler

Airport,Total passengers
LHR,9040
MCO,7223
LAX,7191
LAS,7079
JFK,6788
CDG,6513
BKK,6006
SFO,5929
MIA,5896
DXB,5647


## Cleaned data in chunks

**Action plan**
1. Load cleaned data in chunks 
2. Loop over all chunks and save the result
    - Fill NaNs with 0
    - Convert columns to more suitable dtypes
    - Group by *airport*
    - Sum number of passengers
3. Manipulate the output of the previous step
    - Group by *airport*
    - Get top 10
4. Save the output

### Step 1: Load cleaned data

In [13]:
# Chunked reader over the cleaned file: yields DataFrames of up to 50000 rows
# containing only the airport code and the passenger count.
bookings_iter2 = pd.read_csv(
    path_cleaned_bookings,
    sep="^",
    usecols=["arr_port", "pax"],
    dtype={"arr_port": "string", "pax": 'Int64'},
    chunksize=50000,
    compression='bz2',
    engine='c',
)

### Step 2: groupby and sum within chunks

In [14]:
# Aggregate every chunk on its own and keep only the small per-airport
# subtotals; they are combined with a single pd.concat afterwards.
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.)
chunk_totals = [
    bookings_chunk
    .fillna({"pax": 0})          # only `pax` is filled, as in Steps 2-6 above
    .convert_dtypes()
    .groupby("arr_port")
    .agg(count=("pax", "sum"))
    .reset_index()
    for bookings_chunk in bookings_iter2
]

# The same airport can appear once per chunk here; the next step sums the
# subtotals. pd.concat raises on an empty list, so handle "no chunks" explicitly.
passengers_per_airport2 = (
    pd.concat(chunk_totals)
    if chunk_totals
    else pd.DataFrame(columns=["arr_port", "count"])
)

### Step 3

In [15]:
# Combine the per-chunk subtotals into one total per airport and keep the ten
# largest. The "sum" alias replaces the np.sum callable, which emits a
# FutureWarning inside agg() on pandas 2.1+.
top10 = (
    passengers_per_airport2
    .groupby('arr_port')
    .agg(count=("count", "sum"))
    .nlargest(10, 'count')
    .reset_index()
    .rename(columns={'arr_port': 'Airport', 'count': 'Total passengers'})
)
top10

Unnamed: 0,Airport,Total passengers
0,LHR,9040
1,MCO,7223
2,LAX,7191
3,LAS,7079
4,JFK,6788
5,CDG,6513
6,BKK,6006
7,SFO,5929
8,MIA,5896
9,DXB,5647


In [16]:
# Write the final ranking next to the notebook, without the index column.
top10.to_csv(
    'top10_airports_per_passengers.csv',
    index=False,
)