In [1]:
import os
os.chdir("../..")
import pandas as pd
from scripts.PdfParse import *

In [2]:
# Folder holding the Solomon Islands tourism PDFs, resolved against the
# current working directory. The trailing slash is kept because file paths
# are built from this value by plain string concatenation.
solomon_folder = os.getcwd() + "/data/tourism/solomon/"

# Keep only entries whose name *ends* in ".pdf"; a substring test would also
# pick up names such as "report.pdf.bak".
# NOTE(review): os.listdir returns entries in arbitrary order and a later cell
# picks a file by position -- confirm the order is stable where this runs.
solomon_pdfs = [solomon_folder + file
                for file in os.listdir(solomon_folder)
                if file.endswith(".pdf")]

## Official Statistics
### Parsing

In [3]:
# Select the 2020 PDFs by file name only, so that a "2020" appearing somewhere
# in the directory part of the path cannot make every file match.
solomon_2020s = [pdf for pdf in solomon_pdfs
                 if "2020" in os.path.basename(pdf)]

In [4]:
# Read the table out of the second 2020 PDF. The arguments ask load_pdf
# (from scripts.PdfParse) for "Table 3" with table_page=8.
raw_table = load_pdf(
    filepath=solomon_2020s[1],
    search_string="Table 3",
    table_page=8,
)

# Keep the first 13 rows, then discard columns that are entirely empty and
# rows that have fewer than three non-missing cells.
first_rows = raw_table.iloc[:13]
without_empty_cols = first_rows.dropna(axis=1, how="all")
df = without_empty_cols.dropna(axis=0, thresh=3)
df.head(5)

Unnamed: 0,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NUMBER,NaN.6,NaN.7,NaN.8,NaN.9,NaN.10
0,January,1602,1235,1600,1383,1590,"1,415 1,259",1655,1514,2081,1750,1857
1,February,1422,1209,1658,1591,1544,"1,523 1,465",1707,1635,1855,1891,1471
2,March,1249,1766,1861,1677,2184,"1,816 1,675",1675,1732,2360,2378,752
3,April,1499,1820,1736,1839,2021,"1,514 1,750",1799,2013,2250,2106,2
4,May,1393,1137,1760,1968,1857,"1,462 1,681",1896,1851,2003,2434,23


In [5]:
# The table covers consecutive years ending in LAST_YEAR. One extracted column
# holds two years fused into a single cell (e.g. "1,415 1,259"), so the number
# of years is: all columns, minus the month column, plus one for the fused one.
LAST_YEAR = 2020
yr_range = len(df.columns) - 1 + 1
yr_lst = list(range(LAST_YEAR - yr_range + 1, LAST_YEAR + 1))

# Locate the fused column: it is the first one whose header is a string longer
# than four characters and is not "Month". Fail loudly when no header
# qualifies instead of silently falling through to the last column.
col_idx = next(
    (position for position, name in enumerate(df.columns)
     if isinstance(name, str) and len(name) > 4 and name != "Month"),
    None,
)
if col_idx is None:
    raise ValueError(
        "No fused year column found: expected a string column header longer "
        "than 4 characters other than 'Month'."
    )
colname = df.columns[col_idx]

In [6]:
## Expand the fused column while keeping the year sequence, by building
## separate before / split / after dataframes.
df_precol, df_postcol = df.iloc[:, :col_idx], df.iloc[:, col_idx + 1:]

# Left of the fused column: the month label followed by the earliest years.
precol_lst = ["Month"] + yr_lst[:col_idx - 1]
df_precol.columns = precol_lst

# The fused column carries exactly the next two years. Both slice bounds are
# counted from the front of yr_lst; a negative end bound derived from col_idx
# is only correct when the fused column sits in the middle of the table.
splitted = df.iloc[:, col_idx].str.split(" ", expand=True)
splitted.columns = yr_lst[col_idx - 1: col_idx + 1]

# Right of the fused column: whatever years remain. Slicing from the front
# also behaves when there are no trailing columns, whereas yr_lst[-0:] would
# return the whole list.
df_postcol.columns = yr_lst[col_idx + 1:]

In [7]:
# Put the three pieces back side by side in chronological column order, then
# run the result through remove_separator (from scripts.PdfParse; presumably
# it strips the thousands separators -- confirm against its definition).
temp_df = remove_separator(
    pd.concat([df_precol, splitted, df_postcol], axis=1)
)

# Every column after "Month" holds counts: cast them all to int in one call.
year_columns = temp_df.columns[1:]
temp_df = temp_df.astype({year: int for year in year_columns})

temp_df

Unnamed: 0,Month,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,January,1602,1235,1600,1383,1590,1415,1259,1655,1514,2081,1750,1857
1,February,1422,1209,1658,1591,1544,1523,1465,1707,1635,1855,1891,1471
2,March,1249,1766,1861,1677,2184,1816,1675,1675,1732,2360,2378,752
3,April,1499,1820,1736,1839,2021,1514,1750,1799,2013,2250,2106,2
4,May,1393,1137,1760,1968,1857,1462,1681,1896,1851,2003,2434,23
5,June,1724,1797,2009,3122,2317,1949,2131,1984,2561,2768,2523,24
6,July,1603,2264,2185,2865,2574,2114,2077,2319,2295,2862,2537,52
7,August,1520,1845,1900,2133,2022,1639,1426,1918,2589,2313,2484,39
8,September,1659,2153,2159,2309,2291,1730,2211,1428,2397,2663,2695,103
9,October,1709,1813,1931,1929,2057,1798,1748,2257,2500,2366,2761,60


In [8]:
# Flip the table so that each row is a year and each column a month label.
transposed = temp_df.T

# After transposing, the first row holds the month labels: promote it to the
# header and drop it from the body.
month_labels = transposed.iloc[0].to_list()
transposed.columns = month_labels
temp_df_tr = (
    transposed
    .drop(index="Month")
    .reset_index()
    .rename(columns={"index": "Year"})
)

# Persist the year-by-month table next to the source PDFs.
csv_path = solomon_folder + "visitor_by_month_2009-20.csv"
temp_df_tr.to_csv(csv_path, encoding="utf-8")

In [9]:
# Take the last two year columns without the final row (presumably a
# non-monthly row such as a total -- confirm), then unstack them into one long
# frame: level_0 is the year, level_1 the original row label.
last_two_years = temp_df.iloc[:-1, -2:]
stacked = last_two_years.unstack().reset_index()
df_19_20 = stacked.rename(columns={0: "Total"})
df_19_20

Unnamed: 0,level_0,level_1,Total
0,2019,0,1750
1,2019,1,1891
2,2019,2,2378
3,2019,3,2106
4,2019,4,2434
5,2019,5,2523
6,2019,6,2537
7,2019,7,2484
8,2019,8,2695
9,2019,9,2761


In [10]:
# check_quality comes from scripts.PdfParse; its return value is printed as-is.
quality_report = check_quality(temp_df_tr, ["Year"], "Total")
print(quality_report)

[6]


## Aviation Statistics

In [11]:
# Load the aviation workbook (seats and flights per country, date and
# aircraft type, judging by the preview) from the data folder under the
# current working directory.
aviation_path = f"{os.getcwd()}/data/tourism/aviation_seats_flights_pic.xlsx"
aviation = pd.read_excel(aviation_path)
aviation.head(5)

Unnamed: 0,Country,ISO,Region,Date,Aircraft_type,Seats_arrivals_domestic,Seats_arrivals_interregional,Seats_arrivals_intraregional,Seats_arrivals_intl,Seats_arrivals_total,Available_seat_kilometers,Number_of_flights_domestic,Number_of_flights_interregional,Number_of_flights_intraregional,Number_of_flights_intl,Number_of_flights_total
0,Fiji,FJ,East Asia & Pacific,2019-01-01,passenger,839,273,3480,3753,4592,14304160.0,8,1,10,11,19
1,Fiji,FJ,East Asia & Pacific,2019-01-02,passenger,974,313,3471,3784,4758,14956100.0,8,1,10,11,19
2,Fiji,FJ,East Asia & Pacific,2019-01-03,passenger,1190,443,3675,4118,5308,15921430.0,10,2,12,14,24
3,Fiji,FJ,East Asia & Pacific,2019-01-04,passenger,831,586,3159,3745,4576,14573340.0,7,2,12,14,21
4,Fiji,FJ,East Asia & Pacific,2019-01-05,passenger,744,273,4752,5025,5769,17734490.0,7,1,12,13,20


In [12]:
# Passenger traffic rows with ISO code "SB", limited to the flight and seat
# columns needed for the monthly comparison.
avi_columns = ["Date", "Number_of_flights_intl", "Number_of_flights_total",
               "Seats_arrivals_intl", "Seats_arrivals_total"]
is_sb_passenger = (aviation.ISO == "SB") & (aviation.Aircraft_type == "passenger")

# Convert the merge key to datetime *before* merging so that it is guaranteed
# to match the dtype of the calendar built below.
sb_avi = (aviation.loc[is_sb_passenger, avi_columns]
          .reset_index(drop=True)
          .assign(Date=lambda frame: pd.to_datetime(frame["Date"])))

# Full daily calendar for 2019-2020; the left merge keeps every day, leaving
# NaN where the aviation data has no row for that date.
dates = pd.DataFrame(pd.date_range(start="2019-01-01",
                                   end="2020-12-31"), columns=["Date"])

sb_avi = dates.merge(sb_avi, how="left", on="Date")
sb_avi = sb_avi.assign(Month=sb_avi["Date"].dt.month,
                       Year=sb_avi["Date"].dt.year)

# Monthly totals. "Date" is dropped first because datetime columns cannot be
# summed (pandas >= 2.0 raises TypeError instead of silently skipping them).
# sum() skips NaN, so days without data contribute 0 to their month.
sb_avi_19_20 = (sb_avi
                .drop(columns="Date")
                .groupby(by=["Year", "Month"])
                .sum()
                .reset_index())
sb_avi_19_20.head(5)

Unnamed: 0,Year,Month,Number_of_flights_intl,Number_of_flights_total,Seats_arrivals_intl,Seats_arrivals_total
0,2019,1,69.0,69.0,10036.0,10036.0
1,2019,2,66.0,66.0,9471.0,9471.0
2,2019,3,69.0,71.0,9693.0,9965.0
3,2019,4,66.0,71.0,9532.0,10212.0
4,2019,5,69.0,73.0,9859.0,10403.0


In [14]:
# Place the official monthly totals next to the monthly aviation aggregates.
# pd.concat(axis=1) aligns on the row index, so both frames are expected to
# list the same months in the same order.
side_by_side = pd.concat([df_19_20, sb_avi_19_20], axis=1)

# The unstack bookkeeping columns are no longer needed once the frames are joined.
temp_corr = remove_separator(
    side_by_side.drop(columns=["level_0", "level_1"])
)
temp_corr["Total"] = temp_corr["Total"].astype(float)

In [15]:
from scipy.stats import pearsonr

# Pearson correlation of the official monthly visitor totals against the
# aviation seat and flight counts; the p-values are not used.
corr_seats, _ = pearsonr(temp_corr["Seats_arrivals_total"], temp_corr["Total"])
corr_seat_flight, _ = pearsonr(temp_corr["Number_of_flights_total"], temp_corr["Total"])

# The data in this notebook is filtered on ISO code "SB", so the message names
# SB. sep="\n" keeps the second line flush left, and the explicit space before
# the value keeps a negative coefficient from being glued to "is".
print(
    f"Pearson Correlation between FlightRadar's Seats Arrival and SB's census data is {corr_seats:.4f}.",
    f"Pearson Correlation between FlightRadar's # of Flights Arrival and SB's census data is {corr_seat_flight:.4f}.",
    sep="\n",
)

Pearson Correlation between FlightRadar's Seats Arrival and VU's census data is 0.9521.
 Pearson Correlation between FlightRadar's # of Flights Arrival and VU's census data is 0.9478.
