# NATIONAL PARKS VISITATION ANALYSIS
### Table of Contents

01. Libraries Import
02. Data Import
03. Data Wrangling
- 3.1 Dropping Columns
- 3.2 Data Consistency Checks
- 3.3 Basic Statistics & Data Type Changing
04. Export Data

## 01. Libraries Import

In [1]:
# Libraries Imported

import pandas as pd
import numpy as np
import os

## 02. Data Import

In [2]:
# Root data folder for the project.
# Defaults to the original local folder; set the NATL_PARKS_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.

path = os.environ.get(
    'NATL_PARKS_DATA_DIR',
    r'C:\Users\rasmu\Documents\CareerFoundry\A6 National Parks Analysis\02 Data',
)

In [3]:
# Load the raw visitation data from the 'Orig Data' subfolder of the data root

raw_csv_path = os.path.join(path, 'Orig Data', 'NatlParksRawData.csv')
df = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first rows (head() shows 5 by default) to confirm the import parsed as expected

df.head()

Unnamed: 0,ParkName,UnitCode,ParkType,Region,State,Year,Month,RecreationVisits,NonRecreationVisits,RecreationHours,...,NonRecreationVisitsTotal,RecreationHoursTotal,NonRecreationHoursTotal,ConcessionerLodgingTotal,ConcessionerCampingTotal,TentCampersTotal,RVCampersTotal,BackcountryTotal,NonRecreationOvernightStaysTotal,MiscellaneousOvernightStaysTotal
0,Acadia NP,ACAD,National Park,Northeast,ME,2012,1,11930,600,78738,...,47100,12458831,47100,0,0,124482,26760,1316,0,8083
1,Acadia NP,ACAD,National Park,Northeast,ME,2012,2,12388,600,81758,...,47100,12458831,47100,0,0,124482,26760,1316,0,8083
2,Acadia NP,ACAD,National Park,Northeast,ME,2012,3,22684,600,149716,...,47100,12458831,47100,0,0,124482,26760,1316,0,8083
3,Acadia NP,ACAD,National Park,Northeast,ME,2012,4,59471,600,344929,...,47100,12458831,47100,0,0,124482,26760,1316,0,8083
4,Acadia NP,ACAD,National Park,Northeast,ME,2012,5,157406,7500,894751,...,47100,12458831,47100,0,0,124482,26760,1316,0,8083


In [5]:
# Preview the last rows to confirm the file was read through to the end

df.tail()

Unnamed: 0,ParkName,UnitCode,ParkType,Region,State,Year,Month,RecreationVisits,NonRecreationVisits,RecreationHours,...,NonRecreationVisitsTotal,RecreationHoursTotal,NonRecreationHoursTotal,ConcessionerLodgingTotal,ConcessionerCampingTotal,TentCampersTotal,RVCampersTotal,BackcountryTotal,NonRecreationOvernightStaysTotal,MiscellaneousOvernightStaysTotal
7555,Zion NP,ZION,National Park,Intermountain,UT,2021,8,466928,1860,3105093,...,21900,31553998,5475,55454,0,138730,89919,15071,0,7969
7556,Zion NP,ZION,National Park,Intermountain,UT,2021,9,483180,1800,3076210,...,21900,31553998,5475,55454,0,138730,89919,15071,0,7969
7557,Zion NP,ZION,National Park,Intermountain,UT,2021,10,469379,1860,2902495,...,21900,31553998,5475,55454,0,138730,89919,15071,0,7969
7558,Zion NP,ZION,National Park,Intermountain,UT,2021,11,308560,1800,1932994,...,21900,31553998,5475,55454,0,138730,89919,15071,0,7969
7559,Zion NP,ZION,National Park,Intermountain,UT,2021,12,211983,1860,1321213,...,21900,31553998,5475,55454,0,138730,89919,15071,0,7969


In [6]:
# Size of the raw frame as (rows, columns)

df.shape

(7560, 35)

In [7]:
# List every column with its dtype.
# The visit/hour/camper measures come back as 'object' here, so they need
# converting before numeric statistics can be computed on them.

df.dtypes

ParkName                            object
UnitCode                            object
ParkType                            object
Region                              object
State                               object
Year                                 int64
Month                                int64
RecreationVisits                    object
NonRecreationVisits                 object
RecreationHours                     object
NonRecreationHours                  object
ConcessionerLodging                 object
ConcessionerCamping                 object
TentCampers                         object
RVCampers                           object
Backcountry                         object
NonRecreationOvernightStays         object
MiscellaneousOvernightStays         object
ParkNameTotal                       object
UnitCodeTotal                       object
ParkTypeTotal                       object
RegionTotal                         object
StateTotal                          object
YearTotal  

## 03. Data Wrangling

#### 3.1 Dropping Columns

In [8]:
# Trim the frame down to the park-usage measures of interest.
# - The "...Total" columns are not needed and cause more confusion than they are worth.
# - 'ParkType', 'ConcessionerLodging', 'ConcessionerCamping', 'NonRecreationOvernightStays'
#   and 'MiscellaneousOvernightStays' don't contribute to the park usage stats that I'm
#   interested in after reading the stats definition document.

summary_total_cols = [
    'ParkNameTotal', 'UnitCodeTotal', 'ParkTypeTotal', 'RegionTotal', 'StateTotal',
    'YearTotal', 'RecreationVisitsTotal', 'NonRecreationVisitsTotal',
    'RecreationHoursTotal', 'NonRecreationHoursTotal', 'ConcessionerLodgingTotal',
    'ConcessionerCampingTotal', 'TentCampersTotal', 'RVCampersTotal', 'BackcountryTotal',
    'NonRecreationOvernightStaysTotal', 'MiscellaneousOvernightStaysTotal',
]
unused_measure_cols = [
    'ParkType', 'ConcessionerLodging', 'ConcessionerCamping',
    'NonRecreationOvernightStays', 'MiscellaneousOvernightStays',
]

df = df.drop(columns = summary_total_cols + unused_measure_cols)

In [9]:
# Confirm the drop worked: only the columns kept for the analysis should be listed

df.dtypes

ParkName               object
UnitCode               object
Region                 object
State                  object
Year                    int64
Month                   int64
RecreationVisits       object
NonRecreationVisits    object
RecreationHours        object
NonRecreationHours     object
TentCampers            object
RVCampers              object
Backcountry            object
dtype: object

#### 3.2 Data Consistency Checks

In [10]:
# Checking for columns with mixed data types.
# Each cell is tagged with its Python type; a column holding more than one distinct
# type is mixed and its name is printed. No printed output means no mixed columns.
# Series.map is used instead of DataFrame.applymap, which is deprecated in recent
# pandas, and the check no longer indexes row 0, so an empty frame does not raise.

for col in df.columns:
    if df[col].map(type).nunique() > 1:
        print(col)

##### There are no mixed data types in this dataframe.

In [11]:
# Count missing (null/NaN) values per column; all zeros means nothing needs imputing or dropping

df.isnull().sum()

ParkName               0
UnitCode               0
Region                 0
State                  0
Year                   0
Month                  0
RecreationVisits       0
NonRecreationVisits    0
RecreationHours        0
NonRecreationHours     0
TentCampers            0
RVCampers              0
Backcountry            0
dtype: int64

##### There are no missing values in this dataframe.

In [12]:
# Collect fully duplicated rows into their own subset.
# duplicated() flags every repeat of an earlier identical row (the first occurrence is not flagged).

dup_mask = df.duplicated()
df_dups = df.loc[dup_mask]

In [13]:
# Display the duplicates subset; an empty frame (header only) means no duplicate rows were found

df_dups

Unnamed: 0,ParkName,UnitCode,Region,State,Year,Month,RecreationVisits,NonRecreationVisits,RecreationHours,NonRecreationHours,TentCampers,RVCampers,Backcountry


##### No duplicates to report.

#### 3.3 Basic Statistics & Data Type Changing

In [14]:
# Checking basic statistics after the consistency checks.
# describe() summarises only numeric columns by default, so only Year and Month
# appear here; the visit/hour/camper measures are still 'object' at this point.

df.describe()

Unnamed: 0,Year,Month
count,7560.0,7560.0
mean,2016.5,6.5
std,2.872471,3.452281
min,2012.0,1.0
25%,2014.0,3.75
50%,2016.5,6.5
75%,2019.0,9.25
max,2021.0,12.0


In [15]:
# The count columns are stored as text with thousands separators, so the commas have
# to be removed before the values can be cast to int64.
# Start with RecreationVisits as a trial run (literal replacement, no regex).

visits_col = 'RecreationVisits'
df[visits_col] = df[visits_col].str.replace(',', '', regex=False)

In [16]:
# Checking that comma removal worked: values should be plain digit strings.
# The dtype is still 'object' here; the cast to int64 happens in the next step.

df['RecreationVisits']

0        11930
1        12388
2        22684
3        59471
4       157406
         ...  
7555    466928
7556    483180
7557    469379
7558    308560
7559    211983
Name: RecreationVisits, Length: 7560, dtype: object

In [17]:
# Cast the cleaned RecreationVisits strings to int64 so numeric statistics can be pulled

df = df.astype({'RecreationVisits': 'int64'})

In [18]:
# Completing comma removal for the rest of the count columns.
# One loop replaces six copy-pasted cells; each column gets the same literal
# (non-regex) replacement of ',' with ''.

comma_formatted_cols = [
    'NonRecreationVisits',
    'RecreationHours',
    'NonRecreationHours',
    'TentCampers',
    'RVCampers',
    'Backcountry',
]

for col in comma_formatted_cols:
    df[col] = df[col].str.replace(',', '', regex=False)

In [24]:
# Completing the data type changes needed to pull statistics.
# A single astype mapping replaces eight copy-pasted cells.
# - Count columns: cleaned digit strings -> int64.
# - Year and Month: int64 -> str, so they are treated as labels and left out of describe().
#   NOTE(review): as strings, months sort lexicographically ('10' < '2'); keep that in
#   mind for any later ordering by Month.

df = df.astype({
    'NonRecreationVisits': 'int64',
    'RecreationHours': 'int64',
    'NonRecreationHours': 'int64',
    'TentCampers': 'int64',
    'RVCampers': 'int64',
    'Backcountry': 'int64',
    'Year': 'str',
    'Month': 'str',
})

In [32]:
# Checking that the datatype updates were applied:
# the seven count columns should be int64, and Year/Month should now be object (strings)

df.dtypes

ParkName               object
UnitCode               object
Region                 object
State                  object
Year                   object
Month                  object
RecreationVisits        int64
NonRecreationVisits     int64
RecreationHours         int64
NonRecreationHours      int64
TentCampers             int64
RVCampers               int64
Backcountry             int64
dtype: object

In [33]:
# Rechecking basic statistics now that the count columns are numeric.
# Year and Month no longer appear because they were converted to strings.
# NOTE(review): NonRecreationVisits and NonRecreationHours show identical summary
# statistics in the output below - verify against the raw file that the two columns
# are not duplicates of each other.

df.describe()

Unnamed: 0,RecreationVisits,NonRecreationVisits,RecreationHours,NonRecreationHours,TentCampers,RVCampers,Backcountry
count,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0,7560.0
mean,107189.5,31476.85,903346.7,31476.85,3263.246032,1853.780159,1758.758333
std,187292.8,123362.5,2129197.0,123362.5,9027.343838,4882.539775,5362.823389
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10435.25,0.0,48584.5,0.0,0.0,0.0,0.0
50%,42279.5,291.5,177915.0,291.5,78.0,0.0,84.0
75%,115002.2,4900.0,648552.8,4900.0,1717.75,1168.0,621.25
max,1739720.0,1334690.0,20212390.0,1334690.0,145536.0,78060.0,62848.0


## 04. Export Data

In [34]:
# Write the cleaned frame to the 'Prepd Data' folder.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# pass index=False here if downstream notebooks do not need it.

clean_csv_path = os.path.join(path, 'Prepd Data', 'NatlParks_clean.csv')
df.to_csv(clean_csv_path)