In [70]:
import chardet
import numpy as np
import pandas as pd
import datetime
import codecs

import progressbar
from time import sleep

import matplotlib.pyplot as plt

In [71]:
filename = './data/reservations2018.csv'

In [72]:
# attempt to load the data using various codecs
types_of_encoding = ["utf8", "cp1252"]
for encoding_type in types_of_encoding:
    with codecs.open(filename, encoding = encoding_type, errors ='replace') as csvfile:
        rez = pd.read_csv(csvfile)

  interactivity=interactivity, compiler=compiler, result=result)


In [73]:
# function to format the imported data, drop all categories of reservations except campsites
def nps_site_format(df):
    # get only reservations from the National Park Service, OrdID of 128
    df = df[df['OrgID'] == 128]
    
    # get only site type entities
    df = df[df['EntityType'] == 'Site']
    
    # convert date columns to datetime, y-m-d
    df['EndDate'] = pd.to_datetime(df['EndDate'], format= '%Y-%m-%d')
    df['StartDate'] = pd.to_datetime(df['StartDate'], format= '%Y-%m-%d')
    df['OrderDate'] = pd.to_datetime(df['OrderDate'], format= '%Y-%m-%d')
    
    # calculate the stay length of the reservation
    df['StayLen'] = df['EndDate'] - df['StartDate']
    df['StayLen'] = df['StayLen'].dt.days
    
    # calculate the days in advance the resveration was made
    # note: some reservations were recorded after the start date, those values are imputed as zero
    df['BookingHorizon'] = df['StartDate'] - df['OrderDate']
    df['BookingHorizon'] = df['BookingHorizon'].dt.days.clip(lower=0)
    
    # calculate the daily rate for each reservation
    df['DailyRate'] = df['TotalBeforeTax'] / df['StayLen']
    
    # drop categorical columns
    df = df.drop(axis = 1, columns = ['Tent', 'Popup', 'Trailer', 
                              'RVMotorhome', 'Boat', 'HorseTrailer', 'Car', 'FifthWheel', 
                              'Van', 'CanoeKayak', 'BoatTrailer', 'Motorcycle', 'Truck', 
                              'Bus', 'Bicycle', 'Snowmobile', 'OffRoadlAllTerrainVehicle', 
                              'PowerBoat', 'PickupCamper', 'LargeTentOver9x12', 'SmallTent', 'Marinaboat'])
    
    # drop nulls in important columns
    df = df.dropna(axis = 0, subset = ['EntityID', 'FacilityID', 'FacilityZIP', 'StartDate', 'EndDate'])
    
    # drop duplicate order numbers
    df = df.drop_duplicates(['OrderNumber'])
    
    return df

In [74]:
# format imported data
rez = nps_site_format(rez)

In [75]:
rez.describe()

Unnamed: 0,HistoricalReservationID,OrgID,ParentLocationID,LegacyFacilityID,ProductID,EntityID,FacilityID,FacilityLongitude,FacilityLatitude,Tax,UseFee,TranFee,AttrFee,TotalBeforeTax,NumberOfPeople,StayLen,BookingHorizon,DailyRate
count,641165.0,641165.0,641165.0,641165.0,641165.0,641165.0,641165.0,641165.0,641165.0,641165.0,0.0,0.0,0.0,641165.0,641165.0,641165.0,641165.0,641009.0
mean,3024485000.0,128.0,74314.770905,81675.169102,270119.166332,33347.649962,236896.745657,-105.241944,38.094356,0.0,,,,42.675707,4.201978,2.223382,72.770579,inf
std,116989200.0,0.0,22.480798,24456.837204,95711.484301,43092.357043,10367.658134,17.000323,4.159204,0.0,,,,50.247784,8.230441,1.695946,69.43187,
min,2319620000.0,128.0,74265.0,70851.0,139983.0,1.0,232432.0,-154.177,25.7612,0.0,,,,0.0,0.0,0.0,0.0,0.0
25%,2924886000.0,128.0,74324.0,70941.0,203285.0,2072.0,232461.0,-119.5625,36.049722,0.0,,,,18.0,2.0,1.0,10.0,13.0
50%,3016472000.0,128.0,74325.0,70971.0,207039.0,4697.0,232490.0,-112.120469,37.573056,0.0,,,,30.0,3.0,2.0,49.0,20.0
75%,3122936000.0,128.0,74327.0,74046.0,317918.0,91082.0,234059.0,-87.274167,38.776944,0.0,,,,52.0,5.0,3.0,134.0,26.0
max,3234612000.0,128.0,74330.0,161392.0,484727.0,105163.0,273848.0,-68.068361,60.309333,0.0,,,,4410.0,600.0,55.0,659.0,inf


In [54]:
rez.isnull().sum()

HistoricalReservationID       0
OrderNumber                   0
Agency                        0
OrgID                         0
CodeHierarchy                 0
RegionCode                    0
RegionDescription             0
ParentLocationID              0
ParentLocation                0
LegacyFacilityID              0
Park                          0
SiteType                      0
UseType                       0
ProductID                     0
EntityType                    0
EntityID                      0
FacilityID                    0
FacilityZIP                   0
FacilityState                 0
FacilityLongitude             0
FacilityLatitude              0
CustomerZIP                1661
CustomerState              3143
CustomerCountry               0
Tax                           0
UseFee                        0
TranFee                       0
AttrFee                       0
TotalBeforeTax                0
TotalPaid                     0
StartDate                     0
EndDate 

In [55]:
for dtype in ['float','int','object']:
    selected_dtype = rez.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for float columns: 1.30 MB
Average memory usage for int columns: 1.30 MB
Average memory usage for object columns: 10.39 MB


In [56]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [57]:
rez_int = rez.select_dtypes(include=['int'])
converted_int = rez_int.apply(pd.to_numeric,downcast='unsigned')
print(mem_usage(rez_int))
print(mem_usage(converted_int))
compare_ints = pd.concat([rez_int.dtypes, converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

9.07 MB
4.21 MB


Unnamed: 0,before,after
uint8,,2.0
uint32,,4.0
int64,6.0,


In [58]:
rez_float = rez.select_dtypes(include=['float'])
converted_float = rez_float.apply(pd.to_numeric,downcast='float')
print(mem_usage(rez_float))
print(mem_usage(converted_float))
compare_floats = pd.concat([rez_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

18.15 MB
9.72 MB


Unnamed: 0,before,after
float32,,13.0
float64,13.0,


In [59]:
optimized_rez = rez.copy()
optimized_rez[converted_int.columns] = converted_int
optimized_rez[converted_float.columns] = converted_float
print(mem_usage(rez))
print(mem_usage(optimized_rez))

194.71 MB
181.43 MB


In [79]:
rez.to_csv("./Data/Cleaned-Data/nps_res_2018.csv")
optimized_rez.to_csv("./Data/Cleaned-Data/nps_optimized_2018.csv")

In [82]:
optimized_rez.head()

Unnamed: 0,HistoricalReservationID,OrderNumber,Agency,OrgID,CodeHierarchy,RegionCode,RegionDescription,ParentLocationID,ParentLocation,LegacyFacilityID,...,AttrFee,TotalBeforeTax,TotalPaid,StartDate,EndDate,OrderDate,NumberOfPeople,StayLen,BookingHorizon,DailyRate
4,2434322626,2-36683841,NPS,128,|1|70904|74324|74282|70971|,GRCA-8210,Grand Canyon National Park,74324,Intermountain Region,70971,...,0.0,18.0,18.0,2016-10-16,2016-10-17,2016-10-01,4,1.0,15.0,18.0
6,2434322789,2-36683844,NPS,128,|1|70904|74324|74282|70971|,GRCA-8210,Grand Canyon National Park,74324,Intermountain Region,70971,...,0.0,18.0,18.0,2016-10-04,2016-10-05,2016-10-01,2,1.0,3.0,18.0
11,2434323612,2-36683171,NPS,128,|1|70904|74327|74277|70978|,DEVA-8130,Death Valley National Park,74327,Pacific West Region,70978,...,0.0,44.0,44.0,2016-10-16,2016-10-18,2016-10-01,2,2.0,15.0,22.0
13,2434324034,2-36683173,NPS,128,|1|70904|74327|74286|70952|,JOTR-8330,Joshua Tree National Park,74327,Pacific West Region,70952,...,0.0,10.0,10.0,2017-03-10,2017-03-12,2016-10-01,5,2.0,160.0,5.0
14,2434324274,2-36683174,NPS,128,|1|70904|74324|74282|70971|,GRCA-8210,Grand Canyon National Park,74324,Intermountain Region,70971,...,0.0,18.0,18.0,2016-10-03,2016-10-04,2016-10-01,2,1.0,2.0,18.0


## Testing Fields
---

In [61]:
datelist = pd.date_range(rez['StartDate'].min(), rez['EndDate'].max()).tolist()
print(datelist[94].date())
jan_1 = datelist[94]
jan_1_filter = (rez['StartDate'] <= jan_1) & (rez['EndDate'] >= jan_1)

2017-01-01


In [62]:
widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]

In [63]:
date_park_list = []
test_len = rez[jan_1_filter].groupby(['Park', 'RegionDescription']).mean().shape[0]
bar = progressbar.ProgressBar(maxval=test_len, widgets=widgets)

bar.start()
i = 0avgfee = rez[jan_1_filter].groupby(['FacilityID']).mean().loc[index[2], 'UseFee']
for index, row in rez[jan_1_filter].groupby(['Park', 'RegionDescription', 'FacilityID']).nunique().iterrows():
    stay_len = rez[jan_1_filter].groupby(['FacilityID']).mean().loc[index[2], 'StayLen']
    book_hor = rez[jan_1_filter].groupby(['FacilityID']).mean().loc[index[2], 'BookingHorizon']
    avgfee = rez[jan_1_filter].groupby(['FacilityID']).mean().loc[index[2], 'UseFee']
    
    
    num_people = rez[jan_1_filter].groupby(['FacilityID']).sum().loc[index[2], 'NumberOfPeople']
    day_revenue = rez[jan_1_filter].groupby(['FacilityID']).sum().loc[index[2], 'DailyRate']
    
    date_park_list.append([jan_1, index[2], index[0], index[1], row['OrderNumber'], num_people, stay_len, book_hor, avgfee, day_revenue])
    bar.update(i+1)
    sleep(0.0001)
    i += 1
bar.finish()



In [67]:
pd.DataFrame(date_park_list, columns = ['Date', 'FacilityID', 'Site', 
                                        'Park', 'Reservations', 'NumberOfPeople', 
                                        'AvgStayLen', 'AvgBookingHorizon', 'AverageFee', 'DailyRevenue'])

Unnamed: 0,Date,FacilityID,Site,Park,Reservations,NumberOfPeople,AvgStayLen,AvgBookingHorizon,AverageFee,DailyRevenue
0,2017-01-01,234038.0,CHISOS BASIN (BIG BEND),Big Bend National Park,18,39,1.5,37.388889,19.444444,244.0
1,2017-01-01,234013.0,CHISOS BASIN GROUP CAMPGROUND,Big Bend National Park,6,55,2.5,40.5,31.5,107.666667
2,2017-01-01,234078.0,COTTONWOOD (TX),Big Bend National Park,1,9,1.0,88.0,27.0,27.0
3,2017-01-01,232471.0,COTTONWOOD GROUP,Joshua Tree National Park,5,46,1.2,15.6,47.0,195.0
4,2017-01-01,250796.0,Davis Bayou Campground,Gulf Islands National Seashore,52,118,5.807692,24.25,57.961538,604.595238
5,2017-01-01,232496.0,FURNACE CREEK,Death Valley National Park,104,389,2.567308,40.586538,39.788269,2108.22
6,2017-01-01,232474.0,GREENBELT CAMPGROUND,Greenbelt Park,3,7,4.666667,18.333333,48.666667,27.5
7,2017-01-01,232472.0,INDIAN COVE CAMPGROUND,Joshua Tree National Park,88,347,1.806818,22.386364,30.511364,1660.0
8,2017-01-01,234035.0,RIO GRANDE VILLAGE (BIG BEND),Big Bend National Park,50,153,2.54,44.04,27.72,634.166667
9,2017-01-01,234014.0,RIO GRANDE VILLAGE GROUP CAMPGROUND,Big Bend National Park,3,28,2.0,39.333333,55.0,84.0


## Site Aggregation for Modeling
---

In [84]:
def nps_site_aggregator(df):
    
    # establish list for each site's daily stats
    site_list = []
    
    # create list of days to run through
    datelist = pd.date_range(df['StartDate'].min(), df['EndDate'].max()).tolist()
    
    # create progress bar object
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]
    
    bar = progressbar.ProgressBar(maxval=len(datelist), widgets=widgets)
    
    bar.start()
    count = 0
    
    # cycle through all days
    for date in datelist:
        
        #create a mask to find only reservations that inlcude the specified date
        date_mask = (df['StartDate'] <= date) & (df['EndDate'] >= date)
        
        # cycle through all resvervations, as grouped by the site and the park
        for index, row in df[date_mask].groupby(['Park', 'RegionDescription', 'FacilityID']).nunique().iterrows():
            
            # get the average stay length, booking horizon, and use fee for the reservations for the date
            stay_len = df[date_mask].groupby(['FacilityID']).mean().loc[index[2], 'StayLen']
            book_hor = df[date_mask].groupby(['FacilityID']).mean().loc[index[2], 'BookingHorizon']
            avgfee = df[date_mask].groupby(['FacilityID']).mean().loc[index[2], 'UseFee']
            
            # get the total number of people per the grouped reservations
            num_people = df[date_mask].groupby(['FacilityID']).sum().loc[index[2], 'NumberOfPeople']
            
            # get the daily revenue from the daily rate for each reservation
            day_revenue = df[date_mask].groupby(['FacilityID']).sum().loc[index[2], 'DailyRate']
            
            # add daily reservation information to the list
            site_list.append([date.date(), index[2], index[0], index[1], row['OrderNumber'], num_people, stay_len, book_hor, avgfee, day_revenue])
            
        
        count += 1
        bar.update(count)
    
    bar.finish()
    return pd.DataFrame(site_list, columns = ['Date', 'FacilityID', 'Site', 'Park', 'Reservations', 
                                              'NumberOfPeople', 'AvgStayLen', 'AvgBookingHorizon', 'AverageFee', 'DailyRevenue'])

In [85]:
# run site aggregator
# note: this may take a while depending on the size and parameters of the dataframe
rez_18 = nps_site_aggregator(optimized_rez)



In [86]:
rez_18[rez_18['Park'].str.contains('Acadia')]

Unnamed: 0,Date,FacilityID,Site,Park,Reservations,NumberOfPeople,AvgStayLen,AvgBookingHorizon,AverageFee,DailyRevenue
2570,2017-05-01,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,26,53.0,3.153846,15.500000,80.769234,647.500000
2587,2017-05-02,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,39,77.0,2.871795,15.487180,73.076920,1012.500000
2603,2017-05-03,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,54,107.0,2.814815,14.222222,74.166664,1457.500000
2623,2017-05-04,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,58,113.0,2.948276,21.293104,76.810349,1549.166626
2649,2017-05-05,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,68,154.0,2.882353,30.735294,66.397057,1657.500000
2680,2017-05-06,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,73,191.0,2.589041,32.150684,59.178082,1824.166626
2710,2017-05-07,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,86,213.0,2.674419,29.569767,57.383720,2121.130859
2739,2017-05-08,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,99,213.0,2.878788,29.343435,63.484848,2531.130859
2760,2017-05-09,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,106,201.0,2.886792,28.150944,62.971699,2631.964355
2780,2017-05-10,232508.0,BLACKWOODS CAMPGROUND,Acadia National Park,103,219.0,2.990291,29.533981,66.407768,2591.964355


In [87]:
rez_18.to_csv("./data/Aggregated-Data/nps_agg_2018.csv")

In [90]:
rez_18.groupby('Site').mean()

Unnamed: 0_level_0,FacilityID,Reservations,NumberOfPeople,AvgStayLen,AvgBookingHorizon,AverageFee,DailyRevenue
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AKERS,234442.0,2.493506,53.688312,2.073810,62.476840,78.616667,94.712121
ALLEY SPRING,234046.0,23.492611,126.950739,4.279566,110.018161,33.590445,inf
BIG MEADOWS,232459.0,155.432039,540.461165,2.966263,68.070272,41.214509,2554.130023
BIG SPRING,234044.0,12.465753,63.534247,4.795123,108.546548,4.149990,inf
BLACKWOODS CAMPGROUND,232508.0,332.027322,999.153005,3.500477,67.376807,81.281190,8382.777213
BUTTE LAKE,234156.0,40.843750,145.989583,2.797190,55.537986,28.150376,489.901448
BUTTE LAKE GROUP,234162.0,3.891089,78.326733,3.608938,165.660600,108.223047,112.121405
BUTTE LAKE STOCK CORRAL,233356.0,1.215686,6.000000,4.588235,81.653595,58.470588,22.241830
Black Canyon of the Gunnison NP South Rim Campground,234052.0,75.478992,223.563025,2.414026,70.890164,25.384824,1126.975888
CENTRAL GROUP CAMP (OK) CHICKASAW NRA,234050.0,5.595092,92.337423,2.313675,65.389892,55.234446,158.054704
