In [35]:
import os
import numpy as np
import pandas as pd

In [36]:
def read_zip(zip_path):
    """Load every CSV member of a zip archive into a single DataFrame.

    Members whose name ends with ``.csv`` are parsed with ``pd.read_csv``
    (as UTF-8) in archive order and stacked row-wise; the result gets a
    fresh 0..n-1 index.

    Parameters
    ----------
    zip_path : str or path-like
        Location of the zip archive.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSV members.

    Raises
    ------
    ValueError
        If the archive contains no member ending in ``.csv``.
    """
    import zipfile

    frames = []

    with zipfile.ZipFile(zip_path) as zf:
        csv_names = [name for name in zf.namelist() if name.endswith('.csv')]

        # Fail with a message naming the archive instead of letting
        # pd.concat raise its generic "No objects to concatenate".
        if not csv_names:
            raise ValueError(f"no .csv files found in zip archive: {zip_path!r}")

        for name in csv_names:
            # Stream the member straight into pandas; this avoids holding the
            # raw bytes and a fully decoded str copy of each file in memory.
            with zf.open(name) as member:
                frames.append(pd.read_csv(member, encoding='utf-8'))

    return pd.concat(frames, axis=0, ignore_index=True)
# ===================================================================== 

In [37]:
# Zip archive holding the input CSV files.
zip_path = r'data/airline_cn_20240313_20140324.zip'
data = read_zip(zip_path)

# Keep the callsign plus a combined "ORIGIN-DESTINATION" IATA string.
# Rows missing any of the three source columns are dropped first, then the
# two airport columns are removed once od_iata has been built from them.
data = (
    data[['callsign', 'destination_airport_iata', 'origin_airport_iata']]
    .dropna(ignore_index=True)
    .assign(
        od_iata=lambda df: df['origin_airport_iata'].str.cat(
            df['destination_airport_iata'], sep='-'
        )
    )
    .drop(columns=['origin_airport_iata', 'destination_airport_iata'])
)
print(data.shape)


# Diagnostic checks on the callsign format (only meant for CN airlines):
# first three characters alphabetic, the remainder numeric, and a total
# length of at least five characters.
callsign_col = data['callsign']
icao_is_three_letters = callsign_col.str[:3].str.isalpha()
icao_is_numbers       = callsign_col.str[3:].str.isnumeric()
icao_is_len_five      = callsign_col.str.len().ge(5)
print(
    'The first three letters of callsign number are alpha letters:',
    icao_is_three_letters.value_counts().to_dict(),
    '\n',
    'The charateristics of callsign number from fourth onward are digtial number:',
    icao_is_numbers.value_counts().to_dict(),
    '\n',
    'The length of callsign number is no less than five:',
    icao_is_len_five.value_counts().to_dict(),
    '\n',
)

# The filter below is disabled, so the shape printed next is the unfiltered one.
# data = data[icao_is_three_letters & icao_is_numbers & icao_is_len_five]
print(data.shape)

(390327, 2)
The first three letters of callsign number are alpha letters: {True: 390311, False: 16} 
 The charateristics of callsign number from fourth onward are digtial number: {True: 390185, False: 142} 
 The length of callsign number is no less than five: {True: 390327} 

(390327, 2)


In [38]:
# Save one row per callsign: each remaining column is reduced to its most
# frequent value within the group (mode() may return several values on a tie;
# the first one it returns is kept).
od_by_callsign = data.groupby('callsign').agg(
    lambda col: pd.Series.mode(col)[0]
)
od_by_callsign.to_csv('data/airline_cn_od.csv')