In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
# Remember the working directory seen the first time this cell runs, so the
# cell is safe to re-run: a bare os.chdir("../") climbs one more level on every
# execution, which shifts every path later built from os.getcwd().
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Work from the parent of that directory (the same target "../" had on the
# first run).
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

## Wrangling

In [2]:
# Folder that holds the raw Palau workbook, resolved relative to the parent of
# the current working directory.
folder_path = os.path.dirname(os.getcwd()) + "/data/tourism/palau/"

# Choose the workbook explicitly instead of taking os.listdir(...)[0]:
# os.listdir returns entries in arbitrary order, and this notebook later writes
# palau_monthly_visitor.csv into this same folder, so "the first entry" is not
# guaranteed to be the Excel file on a re-run.
workbook_suffixes = (".xls", ".xlsx", ".xlsm", ".xlsb", ".ods")
workbooks = sorted(
    entry for entry in os.listdir(folder_path)
    # "~$..." entries are lock files Excel creates while a workbook is open.
    if entry.lower().endswith(workbook_suffixes) and not entry.startswith("~$")
)
if not workbooks:
    raise FileNotFoundError(f"No Excel workbook found in {folder_path}")

# With several workbooks present, the alphabetically first one is used.
file_path = folder_path + workbooks[0]

In [3]:
# Read the "TabMth" sheet, discard its final row and renumber the rows from 0.
raw_sheet = pd.read_excel(file_path, sheet_name="TabMth")
raw_sheet = raw_sheet.iloc[:-1].reset_index(drop=True)

# Promote the first remaining row to column labels, then transpose the rest so
# the former column labels become rows; reset_index turns them into a column.
raw_sheet.columns = raw_sheet.iloc[0]
by_month = raw_sheet.iloc[1:].T.reset_index()

# After the transpose the first row carries the labels for the new columns.
# The "Country Group" column is renamed to "Date", the label row is dropped
# and the remaining rows are renumbered from 0.
by_month.columns = by_month.iloc[0]
palau = (
    by_month
    .rename(columns={"Country Group": "Date"})
    .iloc[1:]
    .reset_index(drop=True)
)

In [4]:
# Fix the dtypes: "Date" becomes datetime, every other column becomes float.
palau["Date"] = pd.to_datetime(palau["Date"])
for value_col in palau.columns.drop("Date"):
    palau[value_col] = palau[value_col].astype(float)

# Save the reshaped table in the same folder as the source workbook, then
# preview the first five rows.
csv_path = folder_path + "palau_monthly_visitor.csv"
palau.to_csv(csv_path, encoding="utf-8")
palau.head(5)

Unnamed: 0,Date,JAPAN,SOUTH KOREA,TAIWAN,CHINA,USA/CANADA,EUROPE,OTHERS,Total
0,2007-06-01,856.0,1291.0,3245.0,86.0,669.0,99.0,463.0,6709.0
1,2007-07-01,2119.0,1366.0,3269.0,33.0,653.0,144.0,437.0,8021.0
2,2007-08-01,3476.0,1354.0,3046.0,46.0,580.0,256.0,438.0,9196.0
3,2007-09-01,3022.0,910.0,2497.0,61.0,559.0,145.0,401.0,7595.0
4,2007-10-01,1807.0,1082.0,2298.0,49.0,774.0,390.0,395.0,6795.0


## Bokeh Visualization

In [5]:
from bokeh.palettes import Category20
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Legend


# Render the chart into a standalone HTML file.
output_file('palau_month_by_country.html')

# "$name" is the hovered glyph's name and "@$name" reads the data column with
# that same name, so one tooltip definition serves every line.
hover_fields = [("Country", "$name"),
                ("Passenger per month", "@$name")]

p = figure(title="Number of Passengers Per Month",
           width=1000, height=600,
           x_axis_type="datetime",
           x_axis_label="Date",
           y_axis_label="Number of Passengers Per Month",
           tooltips=hover_fields)
p.add_layout(Legend(), 'right')

source = ColumnDataSource(palau)
# Every column except the date axis and the overall total is plotted.
countries = [label for label in palau.columns if label not in ("Date", "Total")]

# Each glyph is named after its column so the "@$name" tooltip can find the
# matching data; zip stops at the shorter of the palette and the country list.
for color, country in zip(Category20[12], countries):
    p.line(x='Date', y=country, source=source, name=country,
           color=color, legend_label=country)

p.legend.location = "top_left"
p.legend.click_policy = "mute"
p.legend.label_text_font_size = '9pt'

show(p)

## Correlation

In [6]:
# Tourism data folder, resolved relative to the parent of the current working
# directory.
folder = os.path.dirname(os.getcwd()) + "/data/tourism/"
# Aviation seats/flights workbook stored directly inside that folder.
flights = folder + "aviation_seats_flights_pic.xlsx"

In [7]:
# Load the aviation workbook and keep only Palau rows for passenger aircraft,
# restricted to the date plus the seat and flight counts.
df = pd.read_excel(flights)
palau_passenger_rows = (df.Country == "Palau") & (df.Aircraft_type == "passenger")
df = df[palau_passenger_rows].reset_index(drop=True)[[
    "Date",
    "Seats_arrivals_intl",
    "Seats_arrivals_total",
    "Number_of_flights_intl",
    "Number_of_flights_total",
]]

df["Date"] = pd.to_datetime(df["Date"])

# Left-join onto a full daily calendar so that dates without a matching flight
# row are still present (their counts are NaN after the merge).
range_df = pd.DataFrame(
    {"Date": pd.date_range(start='2019-01-01', end='2022-10-16')})
df = range_df.merge(df, on="Date", how="left")

# Calendar parts used as grouping keys for the monthly aggregation.
df["Month"] = df["Date"].dt.month
df["Year"] = df["Date"].dt.year

In [8]:
# Roll the daily flight table up to one row per (Year, Month).
# numeric_only=True keeps the datetime "Date" column out of the sum: pandas 2.x
# raises a TypeError when asked to add datetimes, while older releases dropped
# the column silently, so the monthly totals are the same either way.
# The trailing [:-1] removes the last group; the daily calendar the table was
# merged onto stops mid-month, on 2022-10-16.
palau_ma = df.groupby(by=["Year", "Month"]).sum(numeric_only=True)[:-1]

# Rows of the visitor table dated 2019-01-01 or later, renumbered from 0.
palau_19_22 = palau[palau.Date >= "2019-01-01"].reset_index(drop=True)

In [9]:
# Display the visitor table restricted to 2019-01-01 onward for a visual check.
palau_19_22

Unnamed: 0,Date,JAPAN,SOUTH KOREA,TAIWAN,CHINA,USA/CANADA,EUROPE,OTHERS,Total
0,2019-01-01,1953.0,1169.0,919.0,2072.0,626.0,310.0,280.0,7329.0
1,2019-02-01,2055.0,1035.0,1092.0,4059.0,702.0,438.0,371.0,9752.0
2,2019-03-01,2434.0,1090.0,1190.0,2549.0,826.0,483.0,461.0,9033.0
3,2019-04-01,1756.0,808.0,1099.0,3182.0,777.0,368.0,415.0,8405.0
4,2019-05-01,1288.0,1095.0,1393.0,2489.0,607.0,185.0,421.0,7478.0
5,2019-06-01,650.0,655.0,1399.0,2321.0,622.0,119.0,305.0,6071.0
6,2019-07-01,1090.0,773.0,1697.0,2319.0,724.0,163.0,288.0,7054.0
7,2019-08-01,2117.0,1058.0,1664.0,2580.0,535.0,183.0,381.0,8518.0
8,2019-09-01,1467.0,657.0,1253.0,2017.0,643.0,165.0,512.0,6714.0
9,2019-10-01,918.0,698.0,1300.0,1904.0,716.0,276.0,362.0,6174.0


In [14]:
from scipy.stats import pearsonr

# Pearson r between each monthly flight aggregate and the monthly visitor total
# (the p-value returned alongside it is discarded).
# NOTE(review): the two series are paired by row position, not by date —
# confirm that palau_ma and palau_19_22 cover the same months in the same order.
corr_seats, _ = pearsonr(palau_ma["Seats_arrivals_total"], palau_19_22["Total"])
corr_seat_flight, _ = pearsonr(palau_ma["Number_of_flights_total"], palau_19_22["Total"])

# sep="\n" puts each sentence on its own line without the stray spaces that the
# default separator left on either side of the newline. The space before the
# number is literal, so a negative coefficient is not glued to the word "is".
print(f"Pearson Correlation between FlightRadar's Seats Arrival and Palau's census data is {corr_seats:.4f}.",
      f"Pearson Correlation between FlightRadar's # Of Flights Arrival and Palau's census data is {corr_seat_flight:.4f}.",
      sep="\n")

Pearson Correlation between FlightRadar's Seats Arrival and Palau's census data is 0.9710. 
 Pearson Correlation between FlightRadar's # Of Flights Arrival and Palau's census data is 0.9699.


In [10]:
# Keep the two flight totals and discard the (Year, Month) index so the rows
# are numbered from 0.
flight_totals = ["Seats_arrivals_total", "Number_of_flights_total"]
palau_ma_cor = palau_ma[flight_totals].reset_index(drop=True)

# Put the visitor total next to them; concat along axis=1 matches rows by
# index label.
viz_df = pd.concat([palau_ma_cor, palau_19_22["Total"]], axis=1)

In [12]:
# Pairwise Pearson correlation between every pair of columns in viz_df,
# displayed as the cell's output.
matrix = viz_df.corr(method='pearson')
matrix

Unnamed: 0,Seats_arrivals_total,Number_of_flights_total,Total
Seats_arrivals_total,1.0,0.997722,0.970974
Number_of_flights_total,0.997722,1.0,0.96986
Total,0.970974,0.96986,1.0
