# Exploratory Data Analysis 

In [1]:
import os

# Paths below are built from os.getcwd() and the `scripts` package is imported
# right after this, so the working directory is moved up two levels first
# (presumably from the notebook's folder to the project root — confirm).
# A bare os.chdir("../../") climbs two MORE levels every time this cell is
# re-run, so only move when `scripts/python` is not already reachable from the
# current working directory.
if not os.path.isdir(os.path.join("scripts", "python")):
    os.chdir("../../")
import pandas as pd
import numpy as np
import seaborn as sns

from scripts.python.PdfParse import *
from scripts.python.ts_utils import *
from scripts.python.utils import *

## Official Data

In [2]:
# Base directory of the Palau tourism data. Kept as a plain string with a
# trailing slash because file names are appended to it by concatenation.
folder_path = os.getcwd() + "/data/tourism/palau/"

# Read the monthly visitor table, discard the "Unnamed: 0" column (presumably
# the index saved with the CSV — confirm against the export step) and convert
# "Date" to datetimes in one chain.
palau = (
    pd.read_csv(folder_path + "intermediate/palau_monthly_visitor.csv")
    .drop(columns="Unnamed: 0")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)
palau.head(5)

Unnamed: 0,Date,JAPAN,SOUTH KOREA,TAIWAN,CHINA,USA/CANADA,EUROPE,OTHERS,Total
0,2007-06-01,856.0,1291.0,3245.0,86.0,669.0,99.0,463.0,6709.0
1,2007-07-01,2119.0,1366.0,3269.0,33.0,653.0,144.0,437.0,8021.0
2,2007-08-01,3476.0,1354.0,3046.0,46.0,580.0,256.0,438.0,9196.0
3,2007-09-01,3022.0,910.0,2497.0,61.0,559.0,145.0,401.0,7595.0
4,2007-10-01,1807.0,1082.0,2298.0,49.0,774.0,390.0,395.0,6795.0


In [6]:
from bokeh.palettes import Category20
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Legend

# Route Bokeh output to the notebook.
output_notebook()

# Every column except the date axis and the grand total becomes one line.
countries = [col for col in palau.columns if col not in ("Date", "Total")]
source = ColumnDataSource(palau)

p = figure(
    title="Number of Passengers Per Month",
    width=780,
    height=500,
    x_axis_type="datetime",
    x_axis_label="Date",
    y_axis_label="Number of Passengers Per Month",
    tooltips=[
        ("Country", "$name"),
        ("Passenger per month", "@$name"),
    ],
)

# Add a Legend in the 'right' layout slot before any glyph is created.
p.add_layout(Legend(), 'right')

# Each line's renderer name is its own column name, which is what lets the
# "@$name" tooltip above point at the series being hovered.
# zip() stops at the shorter input, so series beyond the 12 palette colours
# would not be drawn.
for color, country in zip(Category20[12], countries):
    p.line(
        x='Date',
        y=country,
        source=source,
        name=country,
        legend_label=country,
        color=color,
    )

# Legend behaviour and appearance.
p.legend.location = "top_left"
p.legend.click_policy = "mute"
p.legend.label_text_font_size = '9pt'

show(p)

In [4]:
# Run get_adf_df on the "Total" column of the monthly visitor table. The
# displayed result lists a test statistic, p-value, lags used, number of
# observations and the 1%/5%/10% critical values.
# NOTE(review): presumably an Augmented Dickey-Fuller stationarity test —
# confirm against the helper's definition (it arrives via a star import).
# If so, the statistic shown (-1.23) sits above every listed critical value,
# i.e. the unit-root null is not rejected for "Total".
get_adf_df(palau, ["Total"])

Unnamed: 0,Test Statistic,p-value,# Lags Used,Number of Observations Used,Critical Value (1%),Critical Value (5%),Critical Value (10%)
Total,-1.230261,0.660444,13.0,170.0,-3.469413,-2.878696,-2.575917


## Aviation Data

In [5]:
# Load the merged table (flight/seat counts alongside the visitor columns) and
# discard the "Unnamed: 0" column (presumably the index saved with the CSV —
# confirm against the export step). "Date" is left as read from the file.
palau_merged = (
    pd.read_csv(folder_path + "intermediate/palau_merged.csv")
    .drop(columns="Unnamed: 0")
)
palau_merged.head(5)

Unnamed: 0,Year,Month,Seats_arrivals_intl,Seats_arrivals_total,Number_of_flights_intl,Number_of_flights_total,Date,JAPAN,SOUTH KOREA,TAIWAN,CHINA,USA/CANADA,EUROPE,OTHERS,Total
0,2019,1,13048.0,13048.0,75.0,75.0,2019-01-01,1953.0,1169.0,919.0,2072.0,626.0,310.0,280.0,7329.0
1,2019,2,13281.0,13281.0,74.0,74.0,2019-02-01,2055.0,1035.0,1092.0,4059.0,702.0,438.0,371.0,9752.0
2,2019,3,12870.0,12870.0,73.0,73.0,2019-03-01,2434.0,1090.0,1190.0,2549.0,826.0,483.0,461.0,9033.0
3,2019,4,10806.0,10806.0,64.0,64.0,2019-04-01,1756.0,808.0,1099.0,3182.0,777.0,368.0,415.0,8405.0
4,2019,5,11472.0,11472.0,67.0,67.0,2019-05-01,1288.0,1095.0,1393.0,2489.0,607.0,185.0,421.0,7478.0
