In [4]:
import chardet
import numpy as np
import pandas as pd
import datetime
import codecs

import os
import glob

import progressbar
from time import sleep

import matplotlib.pyplot as plt

## Load and Format the Data
---

In [5]:
# specify filename to load for processing and aggregation
filename = './data/Raw-Data/reservations2018.csv'

In [6]:
# attempt to load the data using various codecs
types_of_encoding = ["utf8", "cp1252"]
for encoding_type in types_of_encoding:
    with codecs.open(filename, encoding = encoding_type, errors ='replace') as csvfile:
        rez = pd.read_csv(csvfile)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# function to format the imported data, drop all categories of reservations except campsites
def nps_site_format(df):
    # get only reservations from the National Park Service, OrdID of 128
    df = df[df['OrgID'] == 128]
    
    # get only site type entities
    df = df[df['EntityType'] == 'Site']
    
    # convert date columns to datetime, y-m-d
    df['EndDate'] = pd.to_datetime(df['EndDate'], format= '%Y-%m-%d')
    df['StartDate'] = pd.to_datetime(df['StartDate'], format= '%Y-%m-%d')
    df['OrderDate'] = pd.to_datetime(df['OrderDate'], format= '%Y-%m-%d')
    
    # calculate the stay length of the reservation
    df['StayLen'] = df['EndDate'] - df['StartDate']
    df['StayLen'] = df['StayLen'].dt.days + 1
    
    # calculate the days in advance the resveration was made
    # note: some reservations were recorded after the start date, those values are imputed as zero
    df['BookingHorizon'] = df['StartDate'] - df['OrderDate']
    df['BookingHorizon'] = df['BookingHorizon'].dt.days.clip(lower=0)
    
    # calculate the daily rate for each reservation
    df['DailyRate'] = df['TotalBeforeTax'] / df['StayLen']
    
    # fill nulls in fee columns with zeroes
    df[['UseFee', 'TranFee', 'AttrFee']] = df[['UseFee', 'TranFee', 'AttrFee']].fillna(0)
    
    # drop facility attribute columns
    df = df.drop(axis = 1, columns = ['FacilityState', 'FacilityLongitude', 'FacilityLatitude', 'UseType',
                                      'CustomerZIP', 'CustomerState', 'CustomerCountry', 'FacilityZIP', 'EntityID'])
    
    # drop categorical columns
    df = df.drop(axis = 1, columns = ['Tent', 'Popup', 'Trailer', 
                              'RVMotorhome', 'Boat', 'HorseTrailer', 'Car', 'FifthWheel', 
                              'Van', 'CanoeKayak', 'BoatTrailer', 'Motorcycle', 'Truck', 
                              'Bus', 'Bicycle', 'Snowmobile', 'OffRoadlAllTerrainVehicle', 
                              'PowerBoat', 'PickupCamper', 'LargeTentOver9x12', 'SmallTent', 'Marinaboat'])
    
    # drop nulls in important columns
    df = df.dropna(axis = 0, subset = ['FacilityID', 'StartDate', 'EndDate'])
    
    # drop duplicate order numbers
    df = df.drop_duplicates(['OrderNumber'])
    
    return df

In [8]:
# format imported data
rez = nps_site_format(rez)

In [9]:
rez.describe()

Unnamed: 0,HistoricalReservationID,OrgID,ParentLocationID,LegacyFacilityID,ProductID,FacilityID,Tax,UseFee,TranFee,AttrFee,TotalBeforeTax,NumberOfPeople,StayLen,BookingHorizon,DailyRate
count,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0,662449.0
mean,3025935000.0,128.0,74315.097582,81470.510688,270612.725366,236953.500748,0.0,0.0,0.0,0.0,43.11785,4.190324,3.230303,71.972419,13.10123
std,116719200.0,0.0,22.234118,24266.117767,95702.673307,10330.865669,0.0,0.0,0.0,0.0,50.691111,8.147858,1.702771,69.160976,15.58209
min,2319620000.0,128.0,74265.0,70851.0,139983.0,232432.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,2927693000.0,128.0,74324.0,70940.0,203302.0,232461.0,0.0,0.0,0.0,0.0,18.0,2.0,2.0,9.0,8.0
50%,3018052000.0,128.0,74325.0,70971.0,207039.0,232490.0,0.0,0.0,0.0,0.0,30.0,3.0,3.0,48.0,12.0
75%,3124163000.0,128.0,74327.0,74045.0,318327.0,234059.0,0.0,0.0,0.0,0.0,52.0,5.0,4.0,132.0,16.0
max,3234612000.0,128.0,74330.0,161392.0,484727.0,273848.0,0.0,0.0,0.0,0.0,4410.0,600.0,56.0,659.0,800.0


In [10]:
rez.isnull().sum()

HistoricalReservationID    0
OrderNumber                0
Agency                     0
OrgID                      0
CodeHierarchy              0
RegionCode                 0
RegionDescription          0
ParentLocationID           0
ParentLocation             0
LegacyFacilityID           0
Park                       0
SiteType                   0
ProductID                  0
EntityType                 0
FacilityID                 0
Tax                        0
UseFee                     0
TranFee                    0
AttrFee                    0
TotalBeforeTax             0
TotalPaid                  0
StartDate                  0
EndDate                    0
OrderDate                  0
NumberOfPeople             0
LatLongPoint               0
StayLen                    0
BookingHorizon             0
DailyRate                  0
dtype: int64

In [11]:
rez.dtypes

HistoricalReservationID             int64
OrderNumber                        object
Agency                             object
OrgID                             float64
CodeHierarchy                      object
RegionCode                         object
RegionDescription                  object
ParentLocationID                    int64
ParentLocation                     object
LegacyFacilityID                    int64
Park                               object
SiteType                           object
ProductID                           int64
EntityType                         object
FacilityID                        float64
Tax                               float64
UseFee                            float64
TranFee                           float64
AttrFee                           float64
TotalBeforeTax                    float64
TotalPaid                          object
StartDate                  datetime64[ns]
EndDate                    datetime64[ns]
OrderDate                  datetim

In [12]:
for dtype in ['float','int','object']:
    selected_dtype = rez.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for float columns: 5.05 MB
Average memory usage for int columns: 5.05 MB
Average memory usage for object columns: 42.11 MB


In [13]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [14]:
rez_int = rez.select_dtypes(include=['int'])
converted_int = rez_int.apply(pd.to_numeric,downcast='unsigned')
print(mem_usage(rez_int))
print(mem_usage(converted_int))
compare_ints = pd.concat([rez_int.dtypes, converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

5.05 MB
5.05 MB


Unnamed: 0,before,after


In [15]:
rez_float = rez.select_dtypes(include=['float'])
converted_float = rez_float.apply(pd.to_numeric,downcast='float')
print(mem_usage(rez_float))
print(mem_usage(converted_float))
compare_floats = pd.concat([rez_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

50.54 MB
27.80 MB


Unnamed: 0,before,after
float32,,9.0
float64,9.0,


In [16]:
optimized_rez = rez.copy()
optimized_rez[converted_int.columns] = converted_int
optimized_rez[converted_float.columns] = converted_float
print(mem_usage(rez))
print(mem_usage(optimized_rez))

596.25 MB
573.51 MB


In [17]:
rez.to_csv("./Data/Cleaned-Data/nps_res_2018.csv", index = False)
optimized_rez.to_csv("./Data/Cleaned-Data/nps_optimized_2018.csv", index = False)

In [18]:
optimized_rez.head()

Unnamed: 0,HistoricalReservationID,OrderNumber,Agency,OrgID,CodeHierarchy,RegionCode,RegionDescription,ParentLocationID,ParentLocation,LegacyFacilityID,...,TotalBeforeTax,TotalPaid,StartDate,EndDate,OrderDate,NumberOfPeople,LatLongPoint,StayLen,BookingHorizon,DailyRate
9198,2832024458,2-39287897,NPS,128.0,|1|70904|74268|74293|70852|,SHEN-4840,Shenandoah National Park,74268,Northeast Region,70852,...,15.0,15,2017-10-22,2017-10-23,2017-10-02 17:27:34,2.0,<e6100000 010cf241 cf66d51f 43403c03 c1a7eaaa ...,2,19,7.5
9199,2833873375,2-39298455,NPS,128.0,|1|70904|74268|74293|70852|,SHEN-4840,Shenandoah National Park,74268,Northeast Region,70852,...,30.0,30,2017-10-13,2017-10-15,2017-10-05 11:39:37,2.0,<e6100000 010cf241 cf66d51f 43403c03 c1a7eaaa ...,3,7,10.0
9200,2844146454,2-39349948,NPS,128.0,|1|70904|74268|74293|70852|,SHEN-4840,Shenandoah National Park,74268,Northeast Region,70852,...,15.0,15,2017-10-23,2017-10-24,2017-10-20 16:38:17,2.0,<e6100000 010cf241 cf66d51f 43403c03 c1a7eaaa ...,2,2,7.5
9201,2871898640,2-39503019,NPS,128.0,|1|70904|74268|74293|70852|,SHEN-4840,Shenandoah National Park,74268,Northeast Region,70852,...,40.0,40,2018-10-05,2018-10-07,2018-05-22 13:49:09,2.0,<e6100000 010cf241 cf66d51f 43403c03 c1a7eaaa ...,3,135,13.333333
9202,2871968354,2-39503295,NPS,128.0,|1|70904|74268|74293|70852|,SHEN-4840,Shenandoah National Park,74268,Northeast Region,70852,...,30.0,30,2018-05-26,2018-05-28,2017-12-19 13:14:36,3.0,<e6100000 010cf241 cf66d51f 43403c03 c1a7eaaa ...,3,157,10.0


In [19]:
optimized_rez.groupby('Park').agg({'ProductID': 'nunique'})

Unnamed: 0_level_0,ProductID
Park,Unnamed: 1_level_1
ABRAM'S CREEK CAMPGROUND,16
APGAR CAMPGROUND GROUP SITES (MT),5
APPALACHIAN CLUBHOUSE (TN),1
ASSATEAGUE ISLAND NATIONAL SEASHORE CAMPGROUND,158
Adirondack Shelters,2
Akers (MO),4
Alley Spring (MO),101
Alosa Campsites,6
Anacapa Island (CA),7
Anthony Creek Horse Camp (TN),3


## Testing Fields
---

In [21]:
datelist = pd.date_range(optimized_rez['StartDate'].min(), optimized_rez['EndDate'].max()).tolist()
jan_1 = datelist[datelist.index(pd.Timestamp('2018-01-01'))]
print(jan_1.date())
jan_1_filter = (rez['StartDate'] <= jan_1) & (rez['EndDate'] >= jan_1)

2018-01-01


In [22]:
day = optimized_rez[jan_1_filter].groupby('FacilityID').agg({'StayLen': np.mean, 'BookingHorizon': np.mean,
                                                       'UseFee' : np.mean, 'NumberOfPeople' : np.sum, 
                                                       'DailyRate' : np.sum})

In [23]:
widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]

In [24]:
date_park_list = []
test_len = optimized_rez[jan_1_filter].groupby(['Park', 'RegionDescription']).mean().shape[0]
bar = progressbar.ProgressBar(maxval=test_len, widgets=widgets)

bar.start()
i = 0

for index, row in optimized_rez[jan_1_filter].groupby(['Park', 'RegionDescription', 'FacilityID']).nunique().iterrows():
    
    day_group = optimized_rez[jan_1_filter].groupby('FacilityID').agg({'StayLen': np.mean, 'BookingHorizon': np.mean,
                                                       'UseFee' : np.mean, 'NumberOfPeople' : np.sum, 
                                                       'DailyRate' : np.sum, 'ProductID': 'nunique'})
    
    stay_len = day_group.loc[index[2], 'StayLen']
    book_hor = day_group.loc[index[2], 'BookingHorizon']
    avg_fee = day_group.loc[index[2], 'UseFee']
    num_people = day_group.loc[index[2], 'NumberOfPeople']
    day_revenue = day_group.loc[index[2], 'DailyRate']
    sites_booked = day_group.loc[index[2], 'ProductID']
    
    
    date_park_list.append([jan_1, index[2], index[0], index[1], row['OrderNumber'], num_people, sites_booked, stay_len, book_hor, avg_fee, day_revenue])
    sleep(0.00001)
    
    i += 1
    bar.update(i)
bar.finish()



In [27]:
rez_test = pd.DataFrame(date_park_list, columns = ['Date', 'FacilityID', 'Site', 
                                        'Park', 'Reservations', 'NumberOfPeople', 'SitesBooked',
                                        'AvgStayLen', 'AvgBookingHorizon', 'AverageFee', 'DailyRevenue'])

rez_test.head()

Unnamed: 0,Date,FacilityID,Site,Park,Reservations,NumberOfPeople,SitesBooked,AvgStayLen,AvgBookingHorizon,AverageFee,DailyRevenue
0,2018-01-01,233321.0,Adirondack Shelters,Catoctin Mountain Park,2,7.0,2,2.5,27.5,0.0,7.5
1,2018-01-01,232502.0,Anacapa Island (CA),Channel Islands National Park,3,8.0,3,2.0,14.333333,0.0,20.0
2,2018-01-01,234723.0,BLACK ROCK EQUESTRIAN CAMPGROUND,Joshua Tree National Park,6,29.0,1,3.333333,20.166667,0.0,48.333332
3,2018-01-01,232473.0,Black Rock (CA),Joshua Tree National Park,119,452.0,76,2.966387,40.941176,0.0,1274.444458
4,2018-01-01,250901.0,Bonita Canyon Campground,Chiricahua National Monument,29,70.0,19,3.172414,16.758621,0.0,197.0


## Site Aggregation for Modeling
---

In [28]:
# new function for aggregating the data by campsite and date
def nps_site_aggregator(df):
    
    # establish list for each site's daily stats
    site_list = []
    
    # create list of days to run through
    datelist = pd.date_range(df['StartDate'].min(), df['EndDate'].max()).tolist()
    
    # create progress bar object
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]
    
    bar = progressbar.ProgressBar(maxval=len(datelist), widgets=widgets)
    
    bar.start()
    count = 0
    
    # cycle through all days
    for date in datelist:
        
        #create a mask to find only reservations that inlcude the specified date
        date_mask = (df['StartDate'] <= date) & (df['EndDate'] >= date)
        
        # cycle through all resvervations, as grouped by the site and the park
        for index, row in df[date_mask].groupby(['Park', 'RegionDescription', 'FacilityID']).nunique().iterrows():
            
            # group the filtered dataframe by unique site ID, aggregate the columns appropriately
            day_group = df[date_mask].groupby('FacilityID').agg({'StayLen': np.mean, 'BookingHorizon': np.mean,
                                                       'UseFee' : np.mean, 'NumberOfPeople' : np.sum, 
                                                       'DailyRate' : np.sum, 'ProductID': 'nunique'})
            # assign variables for the daily 
            stay_len = day_group.loc[index[2], 'StayLen']
            book_hor = day_group.loc[index[2], 'BookingHorizon']
            avg_fee = day_group.loc[index[2], 'UseFee']
            num_people = day_group.loc[index[2], 'NumberOfPeople']
            day_revenue = day_group.loc[index[2], 'DailyRate']
            sites_booked = day_group.loc[index[2], 'ProductID']
            
            # add daily reservation information to the list
            site_list.append([date.date(), index[2], index[0], index[1], 
                              row['OrderNumber'], num_people, sites_booked, stay_len, 
                              book_hor, avg_fee, day_revenue])
            
        
        count += 1
        bar.update(count)
    
    bar.finish()
    return pd.DataFrame(site_list, columns = ['Date', 'FacilityID', 'Site', 'Park', 'Reservations', 
                                              'NumberOfPeople', 'SitesBooked', 'AvgStayLen', 'AvgBookingHorizon', 
                                              'AverageFee', 'DailyRevenue'])

In [None]:
# run site aggregator
# note: this may take some time depending on the size and parameters of the dataframe
rez_agg = nps_site_aggregator(optimized_rez)

In [None]:
rez_agg['Park'].unique()

In [None]:
rez_agg.to_csv("./data/Aggregated-Data/nps_agg_2014.csv", index = False)
print(mem_usage(rez_agg))

In [None]:
rez_agg.groupby('Site').mean()

## Combined Data EDA and Preprocessing
---

In [None]:
# set directory to collect aggregated csv files
os.chdir("./data/Aggregated-Data")

In [None]:
# get filenames of csv files in the aggregated data folder
extension = 'csv'

# create list of filenames
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
print(all_filenames)

In [None]:
#combine all files in the list
combined_rez = pd.concat([pd.read_csv(f) for f in all_filenames ])

# change directory back to repository base
os.chdir("../..")

#export to csv
combined_rez.to_csv( "./data/nps_combined_agg.csv", index=False, encoding='utf-8-sig')

In [None]:
# load aggregated dataframe
#combined_rez = pd.read_csv('./data/nps_combined_agg.csv')

In [None]:
# check size and shape of combined dataframe
print(mem_usage(combined_rez))s
print(combined_rez.shape)

In [None]:
# load in campsite data collected from RIDB
site_data = pd.read_csv('./data/nps_site_names.csv')