## Data Cleaning
Data provided is in a wide-format and needs to be converted to a [tidy long-format](https://vita.had.co.nz/papers/tidy-data.html). Weather date will also be split off into own file. Both these changes will make the data easier to work with in Python and Tableau.

All processed dataframes will be saved as CSV files under data/processed/ directory.

In [83]:
from pathlib import Path
import pandas as pd

In [84]:
# Create base paths to the data directories
raw_data_path = Path('../data/raw')
processed_data_path = Path('../data/processed')

# Read in the provided parks data
df = pd.read_csv(raw_data_path / 'parks-data.csv')
df.head()


Unnamed: 0,Date,Springfield Oaks Golf Course Golf Rounds Played,Springfield Oaks Golf Course Golf Revenue,Glen Oaks Golf Course Golf Rounds Played,Glen Oaks Golf Course Golf Revenue,Red Oaks Waterpark Visitors,Red Oaks Waterpark Revenue,Waterford Oaks Waterpark Visitors,Waterford Oaks Waterpark Revenue,Groveland Oaks Campers,Groveland Oaks Campground Revenue,Addison Oaks Campers,Addison Oaks Campground Revenue,Temperature (F),Precipitation (inches)
0,6/1/2022,37,1184,32,1248,134,402,213,852,9,108,10,80,75,0.42
1,6/2/2022,45,1485,42,1344,287,1148,329,1316,13,169,11,110,90,0.07
2,6/3/2022,22,880,26,962,170,680,178,534,11,143,5,50,82,0.36
3,6/4/2022,23,782,30,1050,105,315,164,820,8,88,10,100,77,0.35
4,6/5/2022,46,1794,43,1376,380,1140,289,1156,13,169,17,204,87,0.14


In [85]:
# Initial data cleaning

def tweak_df(df: pd.DataFrame) -> pd.DataFrame:
    return (df
            .rename(columns={'Date':'date',
                             'Springfield Oaks Golf Course Golf Rounds Played':'springfield_oaks_golf_rounds_played',
                             'Springfield Oaks Golf Course Golf Revenue': 'springfield_oaks_golf_revenue',
                             'Glen Oaks Golf Course Golf Rounds Played': 'glen_oaks_golf_rounds_played',
                             'Glen Oaks Golf Course Golf Revenue': 'glen_oaks_golf_revenue',
                             'Red Oaks Waterpark Visitors': 'red_oaks_waterpark_visitors',
                             'Red Oaks Waterpark Revenue': 'red_oaks_waterpark_revenue',
                             'Waterford Oaks Waterpark Visitors': 'waterford_oaks_waterpark_visitors',
                             'Waterford Oaks Waterpark Revenue': 'waterford_oaks_waterpark_revenue',
                             'Groveland Oaks Campers': 'groveland_oaks_campers',
                             'Groveland Oaks Campground Revenue': 'groveland_oaks_campground_revenue',
                             'Addison Oaks Campers': 'addison_oaks_campers',
                             'Addison Oaks Campground Revenue': 'addison_oaks_campground_revenue',
                             'Temperature (F)': 'temp_f',
                             'Precipitation (inches)': 'precip_in',})
            .assign(date=lambda x: pd.to_datetime(x['date'], format='%m/%d/%Y')
            )
    )
      
parks_data = tweak_df(df)
parks_data.head()
         

Unnamed: 0,date,springfield_oaks_golf_rounds_played,springfield_oaks_golf_revenue,glen_oaks_golf_rounds_played,glen_oaks_golf_revenue,red_oaks_waterpark_visitors,red_oaks_waterpark_revenue,waterford_oaks_waterpark_visitors,waterford_oaks_waterpark_revenue,groveland_oaks_campers,groveland_oaks_campground_revenue,addison_oaks_campers,addison_oaks_campground_revenue,temp_f,precip_in
0,2022-06-01,37,1184,32,1248,134,402,213,852,9,108,10,80,75,0.42
1,2022-06-02,45,1485,42,1344,287,1148,329,1316,13,169,11,110,90,0.07
2,2022-06-03,22,880,26,962,170,680,178,534,11,143,5,50,82,0.36
3,2022-06-04,23,782,30,1050,105,315,164,820,8,88,10,100,77,0.35
4,2022-06-05,46,1794,43,1376,380,1140,289,1156,13,169,17,204,87,0.14


In [86]:
# Split off weather data from the main parks data
weather = parks_data[['date', 'temp_f', 'precip_in']]
parks_data = parks_data.drop(columns=['temp_f', 'precip_in'])
weather.head()

Unnamed: 0,date,temp_f,precip_in
0,2022-06-01,75,0.42
1,2022-06-02,90,0.07
2,2022-06-03,82,0.36
3,2022-06-04,77,0.35
4,2022-06-05,87,0.14


In [87]:
# Converting parks data to a long format
def parks_data_wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    return (df
            .melt(id_vars='date', var_name='intermediate', value_name='value')
            .assign(park_name=lambda df_: (df_['intermediate']
                                   .str.split('_')
                                   .str[0:2]
                                   .str.join(' ').str.title()
                                   ),
                    facility=lambda df_: df_['intermediate'].apply(lambda v: ('golf' if 'golf' in v
                                                                      else 'waterpark' if 'waterpark'in v
                                                                      else 'campground'
                                                                      )
                                                           ),
                    variable=lambda df_: df_['intermediate'].apply(lambda v: ('rounds played' if 'rounds' in v
                                                                      else 'revenue' if 'revenue' in v
                                                                      else 'visitors' if 'visitors' in v
                                                                      else 'campers')
                                                           )
            )
    .drop(columns=['intermediate'])
    .sort_values(['date', 'facility', 'variable'])
    .reset_index(drop=True)
    .reindex(columns=['date', 'park_name', 'facility', 'variable', 'value'])
    )  

parks_data_long = parks_data_wide_to_long(parks_data)
parks_data_long.head()

Unnamed: 0,date,park_name,facility,variable,value
0,2022-06-01,Groveland Oaks,campground,campers,9
1,2022-06-01,Addison Oaks,campground,campers,10
2,2022-06-01,Groveland Oaks,campground,revenue,108
3,2022-06-01,Addison Oaks,campground,revenue,80
4,2022-06-01,Springfield Oaks,golf,revenue,1184


In [88]:
# Repivot the data to a wide format that is more consistent with our new long dataframe
parks_data_pivot = parks_data_long.pivot(index=['date', 'park_name'], columns='variable', values='value').reset_index()
parks_data_pivot.head()

variable,date,park_name,campers,revenue,rounds played,visitors
0,2022-06-01,Addison Oaks,10.0,80.0,,
1,2022-06-01,Glen Oaks,,1248.0,32.0,
2,2022-06-01,Groveland Oaks,9.0,108.0,,
3,2022-06-01,Red Oaks,,402.0,,134.0
4,2022-06-01,Springfield Oaks,,1184.0,37.0,


In [10]:
# Save processed data to processed data directory
parks_data_long.to_csv(processed_data_path / 'parks-data-long.csv', index=False)
parks_data_pivot.to_csv(processed_data_path / 'parks-data-pivot.csv', index=False)
weather.to_csv(processed_data_path / 'weather.csv', index=False)

In [17]:
parks_data_revenue = parks_data_long[parks_data_long['variable'] == 'revenue']

In [19]:
parks_data_revenue.to_csv(processed_data_path / 'parks-data-revenue.csv', index=False)

In [20]:
def pivot(df: pd.DataFrame) -> pd.DataFrame:
    # Helper function for properly pivoting data
    index = ['date', 'park_name']
    columns = 'variable'
    values = 'value'
    return (df
            .pivot(index=index, columns=columns, values=values)
            .reset_index()
            .assign(date=lambda x: pd.to_datetime(x['date']),
                    month=lambda x: x['date'].dt.month_name(),
                    year=lambda x: x['date'].dt.year,)
            .set_index('date')
    )

golf = pivot(parks_data[parks_data['facility'] == 'golf'])
waterpark = pivot(parks_data[parks_data['facility'] == 'waterpark'])
campground = pivot(parks_data[parks_data['facility'] == 'campground'])

KeyError: 'facility'

In [None]:
# Read in original data
# Split off weather data from parks data
# Transform parks data into long format
# Create a revenue dataframe for all parks and facilities 
# Create a user/visitation dataframe for each facility
# Save all dataframes to processed data directory

# Separate EDA for each facility type: golf, waterpark, and campground
# Tableau dashboard for revenue of all parks and facilities

# Assumptions Being Made:
# 1. Data is simulated and not real data given that it includes dates in the future and is being provided
# to multiple candidates
# 2. Data may be randomly generated with not underlying correlations, and man purpose is to demonstrate data handling
# and visualiztion skills and so primary focuse for this project. Given time constraints the 
# primarty focus will be on the presentation of summary statistcs and trend visualizations as opposed to statistical modeling.
# 3. Golf course rounds played may not be the same as number of visitors. So to avoid any apples to oranges comparisons with
# campers and waterpark visitor numbers we will separatge this and make a Tableau dashboard that is for revenue only.
# 4. If there is more time will perform individual EDA for each facility type in Python, and present each separatlely.