In [1]:
# import important libraries for EDA
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load data into dataframe
df = pd.read_csv("airport-codes_csv.csv")

In [14]:
# Check data types of columns and shape of data frame
print(" Data types:\n",df.dtypes, "\n \nShape of Dataframe: ",df.shape)

 Data types:
 ident            object
type             object
name             object
elevation_ft    float64
continent        object
iso_country      object
iso_region       object
municipality     object
gps_code         object
iata_code        object
local_code       object
coordinates      object
dtype: object 
 
Shape of Dataframe:  (55075, 12)


In [15]:
# Display top 5 rows
df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [3]:
# Dropping irrelevant columns
df.drop(columns=['local_code', 'gps_code'], inplace=True)

In [17]:
# Check for duplicate rows 
# Also verify that ident column has no null values and contains only unique values, as this column may be used as primary key. 
duplicate_rows_df = df[df.duplicated()]
#df = df.drop_duplicates()
x = df.ident.isnull().sum()
y = df.ident.nunique()
x, y
print(" number of duplicate rows: ", duplicate_rows_df.size, "\n number of unique ident: ", y, "\n number of nulls in ident: ", x)

 number of duplicate rows:  0 
 number of unique ident:  55075 
 number of nulls in ident:  0


In [4]:
#renaming column municipality to city for ease of use 
df.rename(columns={'municipality':'city'}, inplace=True)

In [6]:
# Dropping the missing or null values.
# Since our focus will be on US immigration records with port of entry, we will drop rows where iata_code is Nan
df = df.loc[df.iata_code.notnull()]
df.shape

(9189, 10)

In [24]:
#Expand concatenated columns
# iso_region and coordinates are concatenated and will be expanded

# iso_country has 31 Nan values. however iso_region has the country specified AS 'NA' which is country code for Namibia. Hence we will expand iso_region to get columns
# 'country' and 'region' and then drop iso_country column
df.loc[df.iso_country.isnull()]

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,city,iata_code,coordinates
21422,FYAA,small_airport,Ai-Ais Airport,2000.0,AF,,NA-KU,Ai-Ais,AIW,"17.5966, -27.995"
21426,FYAR,medium_airport,Arandis Airport,1905.0,AF,,NA-ER,Arandis,ADI,"14.979999542236328, -22.462200164794922"
21433,FYGB,small_airport,Gobabis Airport,4731.0,AF,,NA-OH,Gobabis,GOG,"18.973100662231445, -22.5044002532959"
21435,FYGF,medium_airport,Grootfontein Airport,4636.0,AF,,NA-OD,Grootfontein,GFY,"18.122699737548828, -19.60219955444336"
21440,FYHI,small_airport,Halali Airport,3639.0,AF,,NA-OT,Halali,HAL,"16.4585, -19.0285"
21444,FYKB,small_airport,Karasburg Airport,3275.0,AF,,NA-KA,Karasburg,KAS,"18.7385, -28.0297"
21448,FYKM,small_airport,Katima Mulilo Airport,3144.0,AF,,NA-CA,Mpacha,MPA,"24.176701, -17.634399"
21449,FYKT,medium_airport,Keetmanshoop Airport,3506.0,AF,,NA-KA,Keetmanshoop,KMP,"18.111400604248047, -26.5398006439209"
21450,FYLS,small_airport,Lianshulu Airport,3143.0,AF,,NA-CA,Lianshulu Lodge,LHU,"23.393299102800004, -18.116699218799997"
21451,FYLZ,medium_airport,Luderitz Airport,457.0,AF,,NA-KA,Luderitz,LUD,"15.242899894714355, -26.687400817871094"


In [7]:
# Adding country, region columns to DataFrame
df[["country","region"]] = df.iso_region.str.split('-', expand=True, n=1)

# Adding latitude, longitude columns to DataFrame
df[["latitude","longitude"]] = df.coordinates.str.split(',', expand=True, n=1)

df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,city,iata_code,coordinates,country,region,latitude,longitude
223,03N,small_airport,Utirik Airport,4.0,OC,MH,MH-UTI,Utirik Island,UTK,"169.852005, 11.222",MH,UTI,169.852005,11.222
440,07FA,small_airport,Ocean Reef Club Airport,8.0,,US,US-FL,Key Largo,OCA,"-80.274803161621, 25.325399398804",US,FL,-80.274803161621,25.325399398804
594,0AK,small_airport,Pilot Station Airport,305.0,,US,US-AK,Pilot Station,PQS,"-162.899994, 61.934601",US,AK,-162.899994,61.934601
673,0CO2,small_airport,Crested Butte Airpark,8980.0,,US,US-CO,Crested Butte,CSE,"-106.928341, 38.851918",US,CO,-106.928341,38.851918
1088,0TE7,small_airport,LBJ Ranch Airport,1515.0,,US,US-TX,Johnson City,JCY,"-98.62249755859999, 30.251800537100003",US,TX,-98.6224975586,30.251800537100003


In [8]:
# Drop column iso_country, iso_region, coordinates
df.drop(columns=['iso_country', 'iso_region','coordinates'], inplace=True)