In [1]:
import chardet
import numpy as np
import pandas as pd
import datetime
import codecs

import progressbar
from time import sleep

import matplotlib.pyplot as plt

In [33]:
filename = './data/Raw-Data/reservations2016.csv'

In [34]:
# attempt to load the data using various codecs
types_of_encoding = ["utf8", "cp1252"]
for encoding_type in types_of_encoding:
    with codecs.open(filename, encoding = encoding_type, errors ='replace') as csvfile:
        rez = pd.read_csv(csvfile)

In [36]:
# function to format the imported data, drop all categories of reservations except campsites
def nps_site_format(df):
    # get only reservations from the National Park Service, OrdID of 128
    df = df[df['OrgID'] == 128]
    
    # get only site type entities
    df = df[df['EntityType'] == 'Site']
    
    # convert date columns to datetime, y-m-d
    df['EndDate'] = pd.to_datetime(df['EndDate'], format= '%Y-%m-%d')
    df['StartDate'] = pd.to_datetime(df['StartDate'], format= '%Y-%m-%d')
    df['OrderDate'] = pd.to_datetime(df['OrderDate'], format= '%Y-%m-%d')
    
    # calculate the stay length of the reservation
    df['StayLen'] = df['EndDate'] - df['StartDate']
    df['StayLen'] = df['StayLen'].dt.days + 1
    
    # calculate the days in advance the resveration was made
    # note: some reservations were recorded after the start date, those values are imputed as zero
    df['BookingHorizon'] = df['StartDate'] - df['OrderDate']
    df['BookingHorizon'] = df['BookingHorizon'].dt.days.clip(lower=0)
    
    # calculate the daily rate for each reservation
    df['DailyRate'] = df['TotalBeforeTax'] / df['StayLen']
    
    # fill nulls in fee columns with zeroes
    df[['UseFee', 'TranFee', 'AttrFee']] = df[['UseFee', 'TranFee', 'AttrFee']].fillna(0)
    
    # drop facility attribute columns
    df = df.drop(axis = 1, columns = ['FacilityState', 'FacilityLongitude', 'FacilityLatitude', 'UseType',
                                      'CustomerZIP', 'CustomerState', 'CustomerCountry', 'FacilityZIP', 'EntityID'])
    
    # drop categorical columns
    df = df.drop(axis = 1, columns = ['Tent', 'Popup', 'Trailer', 
                              'RVMotorhome', 'Boat', 'HorseTrailer', 'Car', 'FifthWheel', 
                              'Van', 'CanoeKayak', 'BoatTrailer', 'Motorcycle', 'Truck', 
                              'Bus', 'Bicycle', 'Snowmobile', 'OffRoadlAllTerrainVehicle', 
                              'PowerBoat', 'PickupCamper', 'LargeTentOver9x12', 'SmallTent', 'Marinaboat'])
    
    # drop nulls in important columns
    df = df.dropna(axis = 0, subset = ['FacilityID', 'StartDate', 'EndDate'])
    
    # drop duplicate order numbers
    df = df.drop_duplicates(['OrderNumber'])
    
    return df

In [37]:
# format imported data
rez = nps_site_format(rez)

In [39]:
rez.describe()

Unnamed: 0,HistoricalReservationID,OrgID,ParentLocationID,LegacyFacilityID,ProductID,FacilityID,Tax,UseFee,TranFee,AttrFee,TotalBeforeTax,TotalPaid,NumberOfPeople,StayLen,BookingHorizon,DailyRate
count,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0,563652.0
mean,2259945000.0,128.0,74316.074422,75137.555577,244222.35125,234226.592271,0.050863,40.355499,1.933514,0.269913,43.216837,43.269493,4.347383,3.239749,69.645636,13.154881
std,101340400.0,0.0,21.304032,11797.613295,68644.092446,4793.422482,0.790875,49.669378,4.501525,3.509444,48.745901,48.837979,8.435554,1.690116,67.763487,14.807397
min,2078084000.0,128.0,74265.0,70851.0,139983.0,232432.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,2174039000.0,128.0,74324.0,70939.0,203108.0,232459.0,0.0,18.0,0.0,0.0,18.0,18.0,2.0,2.0,9.0,8.0
50%,2257959000.0,128.0,74325.0,70971.0,205623.0,232490.0,0.0,27.0,0.0,0.0,30.0,30.0,3.0,3.0,46.0,12.0
75%,2344666000.0,128.0,74327.0,72393.0,295132.0,233379.0,0.0,52.0,0.0,0.0,52.0,52.0,5.0,4.0,128.0,16.0
max,2434325000.0,128.0,74330.0,147290.0,443021.0,253502.0,62.4,2940.0,410.0,196.0,2940.0,2940.0,600.0,31.0,852.0,800.0


In [40]:
rez.isnull().sum()

HistoricalReservationID    0
OrderNumber                0
Agency                     0
OrgID                      0
CodeHierarchy              0
RegionCode                 0
RegionDescription          0
ParentLocationID           0
ParentLocation             0
LegacyFacilityID           0
Park                       0
SiteType                   0
ProductID                  0
EntityType                 0
FacilityID                 0
Tax                        0
UseFee                     0
TranFee                    0
AttrFee                    0
TotalBeforeTax             0
TotalPaid                  0
StartDate                  0
EndDate                    0
OrderDate                  0
NumberOfPeople             0
StayLen                    0
BookingHorizon             0
DailyRate                  0
dtype: int64

In [41]:
rez.dtypes

HistoricalReservationID             int64
OrderNumber                        object
Agency                             object
OrgID                               int64
CodeHierarchy                      object
RegionCode                         object
RegionDescription                  object
ParentLocationID                    int64
ParentLocation                     object
LegacyFacilityID                    int64
Park                               object
SiteType                           object
ProductID                           int64
EntityType                         object
FacilityID                        float64
Tax                               float64
UseFee                            float64
TranFee                           float64
AttrFee                           float64
TotalBeforeTax                    float64
TotalPaid                         float64
StartDate                  datetime64[ns]
EndDate                    datetime64[ns]
OrderDate                  datetim

In [42]:
for dtype in ['float','int','object']:
    selected_dtype = rez.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for float columns: 4.30 MB
Average memory usage for int columns: 4.30 MB
Average memory usage for object columns: 35.40 MB


In [43]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [44]:
rez_int = rez.select_dtypes(include=['int'])
converted_int = rez_int.apply(pd.to_numeric,downcast='unsigned')
print(mem_usage(rez_int))
print(mem_usage(converted_int))
compare_ints = pd.concat([rez_int.dtypes, converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

4.30 MB
4.30 MB


Unnamed: 0,before,after


In [45]:
rez_float = rez.select_dtypes(include=['float'])
converted_float = rez_float.apply(pd.to_numeric,downcast='float')
print(mem_usage(rez_float))
print(mem_usage(converted_float))
compare_floats = pd.concat([rez_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

47.30 MB
25.80 MB


Unnamed: 0,before,after
float32,,10.0
float64,10.0,


In [46]:
optimized_rez = rez.copy()
optimized_rez[converted_int.columns] = converted_int
optimized_rez[converted_float.columns] = converted_float
print(mem_usage(rez))
print(mem_usage(optimized_rez))

435.67 MB
414.17 MB


In [47]:
rez.to_csv("./Data/Cleaned-Data/nps_res_2016.csv", index = False)
optimized_rez.to_csv("./Data/Cleaned-Data/nps_optimized_2016.csv", index = False)

In [48]:
optimized_rez.head()

Unnamed: 0,HistoricalReservationID,OrderNumber,Agency,OrgID,CodeHierarchy,RegionCode,RegionDescription,ParentLocationID,ParentLocation,LegacyFacilityID,...,AttrFee,TotalBeforeTax,TotalPaid,StartDate,EndDate,OrderDate,NumberOfPeople,StayLen,BookingHorizon,DailyRate
5,2078083673,2-32732554,NPS,128,|1|70904|74327|74277|70978|,DEVA-8130,Death Valley National Park,74327,Pacific West Region,70978,...,0.0,10.0,10.0,2016-03-31,2016-04-03,2015-10-01,6,4.0,182.0,2.5
6,2078083710,2-32732555,NPS,128,|1|70904|74324|74282|70971|,GRCA-8210,Grand Canyon National Park,74324,Intermountain Region,70971,...,0.0,18.0,18.0,2015-10-04,2015-10-06,2015-10-01,2,3.0,3.0,6.0
7,2078083748,2-32732556,NPS,128,|1|70904|74268|74271|70990|,ACAD-1700,Acadia National Park,74268,Northeast Region,70990,...,0.0,60.0,60.0,2015-10-19,2015-10-21,2015-10-01,2,3.0,18.0,20.0
9,2078085589,2-32733468,NPS,128,|1|70904|74327|73983|73984|,PINN-8450,Pinnacles National Park,74327,Pacific West Region,73984,...,0.0,10.0,10.0,2015-11-26,2015-11-29,2015-10-01,4,4.0,56.0,2.5
16,2078089333,2-32733111,NPS,128,|1|70904|74327|74296|70928|,YOSE-8800,Yosemite National Park,74327,Pacific West Region,70928,...,0.0,10.0,10.0,2015-10-04,2015-10-05,2015-10-01,3,2.0,3.0,5.0


## Testing Fields
---

In [49]:
datelist = pd.date_range(optimized_rez['StartDate'].min(), optimized_rez['EndDate'].max()).tolist()
jan_1 = datelist[datelist.index(pd.Timestamp('2016-01-01'))]
print(jan_1.date())
jan_1_filter = (rez['StartDate'] <= jan_1) & (rez['EndDate'] >= jan_1)

2016-01-01


In [50]:
day = optimized_rez[jan_1_filter].groupby('FacilityID').agg({'StayLen': np.mean, 'BookingHorizon': np.mean,
                                                       'UseFee' : np.mean, 'NumberOfPeople' : np.sum, 
                                                       'DailyRate' : np.sum})

In [51]:
widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]

In [52]:
date_park_list = []
test_len = optimized_rez[jan_1_filter].groupby(['Park', 'RegionDescription']).mean().shape[0]
bar = progressbar.ProgressBar(maxval=test_len, widgets=widgets)

bar.start()
i = 0

for index, row in optimized_rez[jan_1_filter].groupby(['Park', 'RegionDescription', 'FacilityID']).nunique().iterrows():
    
    day_group = optimized_rez[jan_1_filter].groupby('FacilityID').agg({'StayLen': np.mean, 'BookingHorizon': np.mean,
                                                       'UseFee' : np.mean, 'NumberOfPeople' : np.sum, 
                                                       'DailyRate' : np.sum})
    
    stay_len = day_group.loc[index[2], 'StayLen']
    book_hor = day_group.loc[index[2], 'BookingHorizon']
    avg_fee = day_group.loc[index[2], 'UseFee']
    num_people = day_group.loc[index[2], 'NumberOfPeople']
    day_revenue = day_group.loc[index[2], 'DailyRate']
    
    
    date_park_list.append([jan_1, index[2], index[0], index[1], row['OrderNumber'], num_people, stay_len, book_hor, avg_fee, day_revenue])
    sleep(0.00001)
    
    i += 1
    bar.update(i)
bar.finish()



In [53]:
rez_test = pd.DataFrame(date_park_list, columns = ['Date', 'FacilityID', 'Site', 
                                        'Park', 'Reservations', 'NumberOfPeople', 
                                        'AvgStayLen', 'AvgBookingHorizon', 'AverageFee', 'DailyRevenue'])

rez_test.head(20)

Unnamed: 0,Date,FacilityID,Site,Park,Reservations,NumberOfPeople,AvgStayLen,AvgBookingHorizon,AverageFee,DailyRevenue
0,2016-01-01,232502.0,ANACAPA ISLAND,Channel Islands National Park,1,2,3.0,22.0,30.0,10.0
1,2016-01-01,232507.0,ASSATEAGUE ISLAND NATIONAL SEASHORE CAMPGROUND,Assateague Island National Seashore,1,2,3.0,5.0,100.0,33.333332
2,2016-01-01,233321.0,Adirondack Shelters,Catoctin Mountain Park,2,7,2.5,3.5,0.0,7.5
3,2016-01-01,232473.0,BLACK ROCK CAMPGROUND,Joshua Tree National Park,114,466,3.333333,38.894737,30.0,1153.511841
4,2016-01-01,234723.0,BLACK ROCK EQUESTRIAN CAMPGROUND,Joshua Tree National Park,2,5,2.5,2.0,25.0,18.333332
5,2016-01-01,246889.0,Bear Island Campground,Big Cypress National Preserve,31,94,3.903226,10.193548,27.258064,211.279755
6,2016-01-01,250901.0,Bonita Canyon Campground,Chiricahua National Monument,15,34,3.0,16.266666,16.799999,92.0
7,2016-01-01,246890.0,Burns Lake Campground,Big Cypress National Preserve,12,33,3.25,4.583333,41.0,164.933334
8,2016-01-01,233309.0,CAMP GATEWAY- BROOKLYN NY,Gateway National Recreation Area,6,9,7.0,2.666667,110.0,105.5
9,2016-01-01,234038.0,CHISOS BASIN (BIG BEND),Big Bend National Park,24,72,2.625,34.958332,17.208334,181.516663


## Site Aggregation for Modeling
---

In [57]:
# initial function for aggregating the data by date and campsite
# output format is correct, not as fast
def nps_site_aggregator_alternate(df):
    
    # establish list for each site's daily stats
    site_list = []
    
    # create list of days to run through
    datelist = pd.date_range(df['StartDate'].min(), df['EndDate'].max()).tolist()
    
    # create progress bar object
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]
    
    bar = progressbar.ProgressBar(maxval=len(datelist), widgets=widgets)
    
    bar.start()
    count = 0
    
    # cycle through all days
    for date in datelist:
        
        #create a mask to find only reservations that inlcude the specified date
        date_mask = (df['StartDate'] <= date) & (df['EndDate'] >= date)
        
        # cycle through all resvervations, as grouped by the site and the park
        for index, row in df[date_mask].groupby(['Park', 'RegionDescription', 'FacilityID']).nunique().iterrows():
            
           # get the average stay length, booking horizon, and use fee for the reservations for the date
            stay_len = df[date_mask].groupby(['FacilityID']).mean().loc[index[2], 'StayLen']
            book_hor = df[date_mask].groupby(['FacilityID']).mean().loc[index[2], 'BookingHorizon']
            avgfee = df[date_mask].groupby(['FacilityID']).mean().loc[index[2], 'UseFee']
            
            # get the total number of people per the grouped reservations
            num_people = df[date_mask].groupby(['FacilityID']).sum().loc[index[2], 'NumberOfPeople']
            
            # get the daily revenue from the daily rate for each reservation
            day_revenue = df[date_mask].groupby(['FacilityID']).sum().loc[index[2], 'DailyRate']
            
            # add daily reservation information to the list
            site_list.append([date.date(), index[2], index[0], index[1], row['OrderNumber'], num_people, stay_len, book_hor, avgfee, day_revenue])
            
        
        count += 1
        bar.update(count)
    
    bar.finish()
    return pd.DataFrame(site_list, columns = ['Date', 'FacilityID', 'Site', 'Park', 'Reservations', 
                                              'NumberOfPeople', 'AvgStayLen', 'AvgBookingHorizon', 'AverageFee', 'DailyRevenue'])

In [58]:
# new function for aggregating the data by campsite and date
def nps_site_aggregator(df):
    
    # establish list for each site's daily stats
    site_list = []
    
    # create list of days to run through
    datelist = pd.date_range(df['StartDate'].min(), df['EndDate'].max()).tolist()
    
    # create progress bar object
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(),
               ' ', progressbar.ETA(),
               ' ', progressbar.AdaptiveETA()]
    
    bar = progressbar.ProgressBar(maxval=len(datelist), widgets=widgets)
    
    bar.start()
    count = 0
    
    # cycle through all days
    for date in datelist:
        
        #create a mask to find only reservations that inlcude the specified date
        date_mask = (df['StartDate'] <= date) & (df['EndDate'] >= date)
        
        # cycle through all resvervations, as grouped by the site and the park
        for index, row in df[date_mask].groupby(['Park', 'RegionDescription', 'FacilityID']).nunique().iterrows():
            
            # group the filtered dataframe by unique site ID, aggregate the columns appropriately
            day_group = df[date_mask].groupby('FacilityID').agg({'StayLen': np.mean, 'BookingHorizon': np.mean,
                                                       'UseFee' : np.mean, 'NumberOfPeople' : np.sum, 
                                                       'DailyRate' : np.sum})
            # assign variables for the daily 
            stay_len = day_group.loc[index[2], 'StayLen']
            book_hor = day_group.loc[index[2], 'BookingHorizon']
            avg_fee = day_group.loc[index[2], 'UseFee']
            num_people = day_group.loc[index[2], 'NumberOfPeople']
            day_revenue = day_group.loc[index[2], 'DailyRate']
            
            # add daily reservation information to the list
            site_list.append([date.date(), index[2], index[0], index[1], 
                              row['OrderNumber'], num_people, stay_len, 
                              book_hor, avg_fee, day_revenue])
            
        
        count += 1
        bar.update(count)
    
    bar.finish()
    return pd.DataFrame(site_list, columns = ['Date', 'FacilityID', 'Site', 'Park', 'Reservations', 
                                              'NumberOfPeople', 'AvgStayLen', 'AvgBookingHorizon', 
                                              'AverageFee', 'DailyRevenue'])

In [59]:
# run site aggregator
# note: this may take a while depending on the size and parameters of the dataframe
rez_agg = nps_site_aggregator(optimized_rez)



In [61]:
rez_agg['Park'].unique()

array(['Chiricahua National Monument', 'Gateway National Recreation Area',
       'Big South Fork National River', 'Gulf Islands National Seashore',
       'Grand Canyon National Park', 'Acadia National Park',
       'Great Smoky Mountains National Park', 'Yosemite National Park',
       'Catoctin National Park', 'Point Reyes National Seashore',
       'Zion National Park', 'Channel Islands National Park',
       'Chickasaw National Recreation Area', 'Greenbelt Park',
       'Prince William Forest Park', 'Joshua Tree National Park',
       'Chaco Culture National Historic Park', 'Blue Ridge Parkway',
       'Shenandoah National Park', 'Big Cypress National Preserve',
       'Pinnacles National Park',
       'Sleeping Bear Dunes National Lakeshore',
       'Colorado National Monument',
       'Headquarters - Mammoth Cave National Park',
       'Lassen Volcanic National Park', 'Cape Lookout National Seashore',
       'Assateague Island National Seashore',
       'Headquarters - Arches Na

In [62]:
rez_agg.to_csv("./data/Aggregated-Data/nps_agg_2016.csv", index = False)
print(mem_usage(rez_agg))

10.08 MB


In [63]:
rez_agg.groupby('Site').mean()

Unnamed: 0_level_0,FacilityID,Reservations,NumberOfPeople,AvgStayLen,AvgBookingHorizon,AverageFee,DailyRevenue
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AKERS,234442.0,2.098765,42.506173,3.232922,71.677572,97.013992,66.666665
ALLEY SPRING,234046.0,24.448113,148.584906,4.783342,100.949797,55.430137,350.653306
ANACAPA ISLAND,232502.0,4.831897,16.599138,2.744207,43.393513,14.666176,34.310345
ANTHONY CREEK HORSE CAMP,232485.0,2.232394,7.654930,4.678504,47.393008,48.131455,23.521127
APGAR GROUP SITES,234669.0,5.437500,73.973958,4.819356,182.863553,126.683752,146.947917
APPALACHIAN CLUBHOUSE,233299.0,1.000000,70.422222,1.177778,196.888889,365.555556,338.444444
ASPENGLEN CAMPGROUND,233187.0,90.024194,283.935484,3.530329,96.549109,41.192783,1147.217747
ASSATEAGUE ISLAND NATIONAL SEASHORE CAMPGROUND,232507.0,124.245902,494.857923,4.206580,89.192492,63.917148,2007.875683
Adirondack Shelters,233321.0,1.810526,4.978947,2.580439,22.255000,0.000000,7.068421
Alosa Campsites,251575.0,4.457831,21.518072,2.000000,19.300096,0.000000,21.530120
