In [98]:
import os
import time

import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt

In [99]:
def check_code_info(data):
     
    print('Data shape:',  data.shape, '\n',
          'Country: \n',
          '  Number of non-nan records:', data['country_code'].dropna().shape, '\n',
          '  Number of unique records:',  data['country_code'].dropna().drop_duplicates().shape, '\n',
          'IATA number: \n',
          '  Number of non-nan records:', data['iata'].dropna().shape, '\n',
          '  Number of unique records:',  data['iata'].dropna().drop_duplicates().shape, '\n',
          'ICAO number: \n',
          '  Number of non-nan records:', data['icao'].dropna().shape, '\n',
          '  Number of unique records:',  data['icao'].dropna().drop_duplicates().shape, '\n'
         )

def conver_to_geo_point(data):
    gdf = gpd.GeoDataFrame(data, 
                           geometry = gpd.points_from_xy(data['lon'], data['lat']),
                           crs = 'EPSG:4326'
                          )
    return gdf

# 1. Read data

In [100]:
# Dataset 1
path = 'https://raw.githubusercontent.com/ip2location/ip2location-iata-icao/master/iata-icao.csv'
airport_1 = pd.read_csv(path) \
              .rename(columns={'latitude':'lat', 'longitude':'lon'}) \
              .assign(iata = lambda x : x['iata'].replace(['', '0', 'NA', pd.NA, None], np.nan))

print('Data shape:', airport_1.shape)


# Dataset 2
path = 'https://raw.githubusercontent.com/mwgg/Airports/master/airports.json'
airport_2 = pd.read_json(path, orient='index') \
              .rename(columns={'country':'country_code'}) \
              .assign(iata = lambda x : x['iata'].replace(['', '0', 'NA', pd.NA, None], np.nan))
print('Data shape:', airport_2.shape)

Data shape: (8936, 7)
Data shape: (28898, 10)


In [101]:
airport_cn_1 = airport_1.dropna(subset='iata')
airport_cn_1 = airport_1[airport_1['country_code'].isin(['CN', 'MO', 'HK'])]

check_code_info(airport_cn_1)


airport_cn_2 = airport_2.dropna(subset='iata')
airport_cn_2 = airport_cn_2[airport_cn_2['country_code'].isin(['CN', 'MO', 'HK'])]

check_code_info(airport_cn_2)

Data shape: (266, 7) 
 Country: 
   Number of non-nan records: (266,) 
   Number of unique records: (3,) 
 IATA number: 
   Number of non-nan records: (266,) 
   Number of unique records: (266,) 
 ICAO number: 
   Number of non-nan records: (246,) 
   Number of unique records: (246,) 

Data shape: (185, 10) 
 Country: 
   Number of non-nan records: (185,) 
   Number of unique records: (3,) 
 IATA number: 
   Number of non-nan records: (185,) 
   Number of unique records: (185,) 
 ICAO number: 
   Number of non-nan records: (185,) 
   Number of unique records: (185,) 



In [102]:
# difference 
diff_iata_1 = list(set(airport_cn_1['iata']).difference(set(airport_cn_2['iata'])))
diff_iata_2 = list(set(airport_cn_2['iata']).difference(set(airport_cn_1['iata'])))

airport_cn_1.set_index('iata').reindex(diff_iata_1)

Unnamed: 0_level_0,country_code,region_name,icao,airport,lat,lon
iata,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CQW,CN,Chongqing Shi,ZUWL,Chongqing Xiannyushan Airport,29.4658,107.692
BAR,CN,Hainan,,Qionghai Bo'ao Airport,19.1382,110.455
FYJ,CN,Heilongjiang,ZYFY,Fuyuan Dongji Airport,48.1995,134.366
PZI,CN,Sichuan,ZUZH,Panzhihua Bao'anying Airport,26.5400,101.799
HBQ,CN,Qinghai Sheng,ZLHB,Haibei Qilian Airport,38.0120,100.644
...,...,...,...,...,...,...
HSC,CN,Guangdong,,Shaoguan Guitou Airport,24.9786,113.421
NZL,CN,Nei Mongol,,Zhalantun Chengjisihan Airport,47.8658,122.768
RHT,CN,Nei Mongol,,Alxa Right Banner Badanjilin Airport,39.2250,101.546
WDS,CN,Hubei,ZHSY,Shiyan Wudangshan Airport,32.5917,110.908


In [103]:
airport_cn_2.set_index('iata').reindex(diff_iata_2)

Unnamed: 0_level_0,icao,name,city,state,country_code,elevation,lat,lon,tz
iata,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WHU,ZSWU,Wuhu Air Base,Wuhu,Anhui,CN,0,31.3906,118.408997,Asia/Shanghai
NAY,ZBNY,Beijing Nanyuan Airport,Beijing,Beijing,CN,0,39.782799,116.388,Asia/Shanghai
PKX,ZBAD,Beijing Daxing International Airport,Beijing,Beijing,CN,98,39.509167,116.410556,Asia/Shanghai
SHP,ZBSH,Shanhaiguan Airport,Qinhuangdao,Hebei,CN,30,39.968102,119.731003,Asia/Shanghai


# Manual Addition

In [104]:
new_airport = [
    {'country_code' : 'CN', 
     'region_name'  : 'Beijing', 
     'iata'         : 'PKX', 
     'icao'         : 'ZBAD', 
     'airport'      : 'Beijing Daxing International Airport',
     'lat'          : 39.509167, 
     'lon'          : 116.410556  }
]


airport_cn = pd.concat([airport_cn_1, pd.DataFrame(new_airport)], 
                          ignore_index=True) \
                 .sort_values(['country_code', 'region_name', 'iata', 'icao'])


check_code_info(airport_cn)

Data shape: (267, 7) 
 Country: 
   Number of non-nan records: (267,) 
   Number of unique records: (3,) 
 IATA number: 
   Number of non-nan records: (267,) 
   Number of unique records: (267,) 
 ICAO number: 
   Number of non-nan records: (247,) 
   Number of unique records: (247,) 



# Save data

In [105]:
airport_cn = pd.concat([airport_cn_1, 
                        airport_cn_2.rename(columns={'name'  : 'airport',
                                                     'state' : 'region_name'})], 
                       ignore_index=True) \
               .drop_duplicates(subset='iata', keep='first') \
               .sort_values(['country_code', 'region_name', 'iata']) \
                [['country_code', 'region_name', 'iata', 'icao', 'airport', 'lat', 'lon']] 

airport_cn.to_csv('processed_data/cn_airport.csv', index=False, encoding='utf-8')

In [106]:
gpd.GeoDataFrame(airport_cn,
                 geometry = gpd.points_from_xy(airport_cn['lon'], airport_cn['lat']),
                 crs = 'EPSG:4326') \
    .to_file('processed_data/cn_airport_location', encoding='utf-8')

  gpd.GeoDataFrame(airport_cn,
