In [114]:
import chardet
import numpy as np
import pandas as pd
import datetime
import codecs

import progressbar
from time import sleep

import matplotlib.pyplot as plt

In [115]:
filename = './data/reservations2018.csv'

In [116]:
# attempt to load the data using various codecs
types_of_encoding = ["utf8", "cp1252"]
for encoding_type in types_of_encoding:
    with codecs.open(filename, encoding = encoding_type, errors ='replace') as csvfile:
        rez = pd.read_csv(csvfile)

  interactivity=interactivity, compiler=compiler, result=result)


In [103]:
# function to format the imported data, drop all categories of reservations except campsites
def nps_site_format(df):
    # get only reservations from the National Park Service, OrdID of 128
    df = df[df['OrgID'] == 128]
    
    # get only site type entities
    df = df[df['EntityType'] == 'Site']
    
    # convert date columns to datetime, y-m-d
    df['EndDate'] = pd.to_datetime(df['EndDate'], format= '%Y-%m-%d')
    df['StartDate'] = pd.to_datetime(df['StartDate'], format= '%Y-%m-%d')
    df['OrderDate'] = pd.to_datetime(df['OrderDate'], format= '%Y-%m-%d')
    
    # calculate the stay length of the reservation
    df['StayLen'] = df['EndDate'] - df['StartDate']
    df['StayLen'] = df['StayLen'].dt.days
    
    # calculate the days in advance the resveration was made
    # note: some reservations were recorded after the start date, those values are imputed as zero
    df['BookingHorizon'] = df['StartDate'] - df['OrderDate']
    df['BookingHorizon'] = df['BookingHorizon'].dt.days.clip(lower=0)
    
    # drop duplicate order numbers
    df = df.drop_duplicates('OrderNumber')
    
    return df

In [122]:
# format imported data
rez = nps_site_format(rez)

In [134]:
rez.columns

Index(['HistoricalReservationID', 'OrderNumber', 'Agency', 'OrgID',
       'CodeHierarchy', 'RegionCode', 'RegionDescription', 'ParentLocationID',
       'ParentLocation', 'LegacyFacilityID', 'Park', 'SiteType', 'UseType',
       'ProductID', 'EntityType', 'EntityID', 'FacilityID', 'FacilityZIP',
       'FacilityState', 'FacilityLongitude', 'FacilityLatitude', 'CustomerZIP',
       'CustomerState', 'CustomerCountry', 'Tax', 'UseFee', 'TranFee',
       'AttrFee', 'TotalBeforeTax', 'TotalPaid', 'StartDate', 'EndDate',
       'OrderDate', 'NumberOfPeople', 'Tent', 'Popup', 'Trailer',
       'RVMotorhome', 'Boat', 'HorseTrailer', 'Car', 'FifthWheel', 'Van',
       'CanoeKayak', 'BoatTrailer', 'Motorcycle', 'Truck', 'Bus', 'Bicycle',
       'Snowmobile', 'OffRoadlAllTerrainVehicle', 'PowerBoat', 'PickupCamper',
       'LargeTentOver9x12', 'SmallTent', 'Marinaboat', 'LatLongPoint',
       'StayLen', 'BookingHorizon'],
      dtype='object')

In [136]:
rez['RegionCode'].unique()

array(['SHEN-4840', 'BISO-5130', 'YOSE-8800', 'SEKI-8557', 'JOTR-8330',
       'GREE-3240', 'GRSM-5460', 'GRCA-8210', 'GOGA-8140', 'CHIS-8120',
       'BUFF-7150', 'ROMO-1520', 'CHIC-7510', 'CATO-3221', 'BLRI-5140',
       'CALO-5210', 'LAVO-8400', 'PORE-8530', 'PRWI-3701', 'BAND-7120',
       'THRO-1540', 'CANY-1346', 'CURE-1379', 'ARCH-1348', 'NOCA-9470',
       'LARO-9260', 'ANTI', 'GETT-4400', 'SEKI-8550', 'CHOH', 'BICY-5120',
       'CRMO-9280', 'LACL', 'ORPI-8660', 'WHIS-8750', 'ROCR', 'BLCA-1377',
       'MACA-5530', 'ASIS-4190', 'OZAR-6640', 'GRSA-1470', 'CHCU-7400',
       'ZION-1590', 'CAHA-5190', 'BIBE-7130', 'OLYM-9500', 'CHIR-8620',
       'GUIS-5320', 'CATO-3200', 'ACAD-1700', 'CARE-1350', 'CEBR-1360',
       'NAMA', 'OBRI', 'PINN-8450', 'DEVA-8130', 'SLBE-6620', 'COLM-1378',
       'DINO-1400', 'FOHU-3331', 'MORA-9450', 'GLAC-1430', 'GATE-1170',
       'BRCA-1330', 'GRBA-8420', 'KEFJ-9845', 'SAFR-8520', 'FOWA-3550',
       'CONG-5240', 'SAMO-8540', 'NACE', 'CUVA-6160', '

## Testing Fields
---

In [138]:
datelist = pd.date_range(rez['StartDate'].min(), rez['EndDate'].max()).tolist()
print(datelist[214].date())
jan_1 = datelist[214]
jan_1_filter = (rez['StartDate'] <= jan_1) & (rez['EndDate'] >= jan_1)

2018-01-01


In [141]:
widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]

In [143]:
date_park_list = []
bar = progressbar.ProgressBar(maxval=test_len, widgets=widgets)

bar.start()
i = 0
for index, row in rez[jan_1_filter].groupby(['Park', 'RegionDescription']).nunique().iterrows():
    stay_len = rez[jan_1_filter].groupby(['Park', 'RegionDescription']).mean().loc[index[0], 'StayLen'][0]
    book_hor = rez[jan_1_filter].groupby(['Park', 'RegionDescription']).mean().loc[index[0], 'BookingHorizon'][0]
    num_people = rez[jan_1_filter].groupby(['Park', 'RegionDescription']).sum().loc[index[0], 'NumberOfPeople'][0]
    
    date_park_list.append([jan_1, index[0], index[1], row['OrderNumber'], num_people, stay_len, book_hor])
    bar.update(i+1)
    sleep(0.0001)
    i += 1
bar.finish()



In [144]:
pd.DataFrame(date_park_list, columns = ['Date', 'Site', 'Park', 'Reservations', 'NumberOfPeople', 'AvgStayLen', 'AvgBookingHorizon'])

Unnamed: 0,Date,Site,Park,Reservations,NumberOfPeople,AvgStayLen,AvgBookingHorizon
0,2018-01-01,Adirondack Shelters,Catoctin Mountain Park,2,7.0,1.5,27.5
1,2018-01-01,Anacapa Island (CA),Channel Islands National Park,3,8.0,1.0,14.333333
2,2018-01-01,BLACK ROCK EQUESTRIAN CAMPGROUND,Joshua Tree National Park,6,29.0,2.333333,20.166667
3,2018-01-01,Black Rock (CA),Joshua Tree National Park,119,452.0,1.966387,40.941176
4,2018-01-01,Bonita Canyon Campground,Chiricahua National Monument,29,70.0,2.172414,16.758621
5,2018-01-01,Burns Lake Campground,Big Cypress National Preserve,11,50.0,3.0,29.909091
6,2018-01-01,CHISOS BASIN GROUP CAMPGROUND (TX),Big Bend National Park,4,37.0,2.75,35.75
7,2018-01-01,"CONGAREE NATIONAL PARK CAMPING, SC",CONGAREE NATIONAL PARK,5,8.0,3.8,0.8
8,2018-01-01,Chaco Culture National Historical Park - Gallo...,Chaco Culture National Historic Park,3,7.0,1.0,1.666667
9,2018-01-01,Chisos Basin (TX) Big Bend,Big Bend National Park,10,28.0,1.6,27.4


## Site Aggregation for Modeling
---

In [75]:
def nps_site_aggregator(df):
    
    # establish list for each site's daily stats
    site_list = []
    
    # create list of days to run through
    datelist = pd.date_range(df['StartDate'].min(), df['EndDate'].max()).tolist()
    
    # create progress bar object
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]
    
    bar = progressbar.ProgressBar(maxval=len(datelist), widgets=widgets)
    
    bar.start()
    count = 0
    
    # cycle through all days
    for date in datelist:
        
        #create a mask to find only reservations that inlcude the specified date
        date_mask = (df['StartDate'] <= date) & (df['EndDate'] >= date)
        
        # cycle through all resvervations, as grouped by the site and the park
        for index, row in df[date_mask].groupby(['Park', 'RegionDescription']).nunique().iterrows():
            
            # get the average stay length and booking horizon for the reservations for the date
            stay_len = df[date_mask].groupby(['Park', 'RegionDescription']).mean().loc[index[0], 'StayLen'][0]
            book_hor = df[date_mask].groupby(['Park', 'RegionDescription']).mean().loc[index[0], 'BookingHorizon'][0]
            
            # get the total number of people per the grouped reservations
            num_people = df[date_mask].groupby(['Park', 'RegionDescription']).sum().loc[index[0], 'NumberOfPeople'][0]
            
            # add daily reservation information to the list
            site_list.append([date.date(), index[0], index[1], row['OrderNumber'], num_people, stay_len, book_hor])
            
        
        count += 1
        bar.update(count)
    
    bar.finish()
    return pd.DataFrame(site_list, columns = ['Date', 'Site', 'Park', 'Reservations', 'AvgStayLen', 'AvgBookingHorizon'])

In [76]:
# run site aggregator
# note: this may take a while depending on the size and parameters of the dataframe
rez_18 = nps_site_aggregator(rez)



In [137]:
rez_18[rez_18['Park'].str.contains('Acadia')]

Unnamed: 0,Date,Site,Park,Reservations,AvgStayLen,AvgBookingHorizon
3,2017-07-14,Schoodic Woods Campground,Acadia National Park,1,2.000000,-97.000000
4,2017-07-15,Schoodic Woods Campground,Acadia National Park,1,2.000000,-97.000000
5,2017-07-16,Schoodic Woods Campground,Acadia National Park,1,2.000000,-97.000000
20,2017-09-29,Blackwoods (ME),Acadia National Park,3,5.666667,-3.000000
26,2017-09-29,Schoodic Woods Campground,Acadia National Park,1,5.000000,-3.000000
28,2017-09-30,Blackwoods (ME),Acadia National Park,5,5.800000,-3.800000
35,2017-09-30,Schoodic Woods Campground,Acadia National Park,4,4.000000,-3.000000
41,2017-10-01,Blackwoods (ME),Acadia National Park,35,2.457143,-1.485714
62,2017-10-01,Schoodic Woods Campground,Acadia National Park,11,2.636364,-1.818182
70,2017-10-02,Blackwoods (ME),Acadia National Park,67,2.104478,-1.149254


In [87]:
rez_18.to_csv("./data/Aggregated-Data/nps_agg_2018.csv")