In [48]:
from pathlib import Path
import pandas as pd
import numpy as np

In [118]:
# Base directories for raw inputs and processed outputs, relative to this notebook
data_root = Path('..') / 'data'
raw_data_path = data_root / 'raw'
processed_data_path = data_root / 'processed'

# Load the provided parks dataset into a dataframe
df = pd.read_csv(raw_data_path.joinpath('parks-data.csv'))

In [4]:
# Preview the first rows of the raw data to check the column layout
df.head()

Unnamed: 0,Date,Springfield Oaks Golf Course Golf Rounds Played,Springfield Oaks Golf Course Golf Revenue,Glen Oaks Golf Course Golf Rounds Played,Glen Oaks Golf Course Golf Revenue,Red Oaks Waterpark Visitors,Red Oaks Waterpark Revenue,Waterford Oaks Waterpark Visitors,Waterford Oaks Waterpark Revenue,Groveland Oaks Campers,Groveland Oaks Campground Revenue,Addison Oaks Campers,Addison Oaks Campground Revenue,Temperature (F),Precipitation (inches)
0,6/1/2022,37,1184,32,1248,134,402,213,852,9,108,10,80,75,0.42
1,6/2/2022,45,1485,42,1344,287,1148,329,1316,13,169,11,110,90,0.07
2,6/3/2022,22,880,26,962,170,680,178,534,11,143,5,50,82,0.36
3,6/4/2022,23,782,30,1050,105,315,164,820,8,88,10,100,77,0.35
4,6/5/2022,46,1794,43,1376,380,1140,289,1156,13,169,17,204,87,0.14


In [5]:
# Inspect the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 15 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Date                                             276 non-null    object 
 1   Springfield Oaks Golf Course Golf Rounds Played  276 non-null    int64  
 2   Springfield Oaks Golf Course Golf Revenue        276 non-null    int64  
 3   Glen Oaks Golf Course Golf Rounds Played         276 non-null    int64  
 4   Glen Oaks Golf Course Golf Revenue               276 non-null    int64  
 5   Red Oaks Waterpark Visitors                      276 non-null    int64  
 6   Red Oaks Waterpark Revenue                       276 non-null    int64  
 7   Waterford Oaks Waterpark Visitors                276 non-null    int64  
 8   Waterford Oaks Waterpark Revenue                 276 non-null    int64  
 9   Groveland Oaks Campers          

## Data Cleaning
Columns will be renamed to be more concise and program friendly. There are no null values and no free-text data to clean; the only conversion needed is parsing the `Date` strings into datetimes. The dataset will be split into a separate dataframe for each type of facility, and each dataframe will be saved to its own CSV file. These include:
- Golf Courses
- Waterparks
- Campgrounds

Precipitation and temperature data will also be split into their own dataframe and CSV file.
All facility data will also be transformed into a long format, which may be more useful for analysis and plotting.

In [121]:
def tweak_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw parks frame.

    The verbose source headers are replaced with short snake_case names and
    the ``date`` column is parsed from ``month/day/year`` strings into
    datetimes. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Raw parks data using the original column headers.

    Returns
    -------
    pd.DataFrame
        New frame with renamed columns and a datetime ``date`` column.
    """
    # Original header -> concise, program-friendly name
    column_names = {
        'Date': 'date',
        'Springfield Oaks Golf Course Golf Rounds Played': 'springfield_oaks_golf_rounds_played',
        'Springfield Oaks Golf Course Golf Revenue': 'springfield_oaks_golf_revenue',
        'Glen Oaks Golf Course Golf Rounds Played': 'glen_oaks_golf_rounds_played',
        'Glen Oaks Golf Course Golf Revenue': 'glen_oaks_golf_revenue',
        'Red Oaks Waterpark Visitors': 'red_oaks_waterpark_visitors',
        'Red Oaks Waterpark Revenue': 'red_oaks_waterpark_revenue',
        'Waterford Oaks Waterpark Visitors': 'waterford_oaks_waterpark_visitors',
        'Waterford Oaks Waterpark Revenue': 'waterford_oaks_waterpark_revenue',
        'Groveland Oaks Campers': 'groveland_oaks_campers',
        'Groveland Oaks Campground Revenue': 'groveland_oaks_campground_revenue',
        'Addison Oaks Campers': 'addison_oaks_campers',
        'Addison Oaks Campground Revenue': 'addison_oaks_campground_revenue',
        'Temperature (F)': 'temp_f',
        'Precipitation (inches)': 'precip_in',
    }

    renamed = df.rename(columns=column_names)
    # Source dates look like 6/1/2022 (month/day/year)
    parsed_dates = pd.to_datetime(renamed['date'], format='%m/%d/%Y')
    return renamed.assign(date=parsed_dates)
    
    
# Apply the cleaning step to the raw frame and preview the result
parks_data = tweak_df(df)
parks_data.head()

Unnamed: 0,date,springfield_oaks_golf_rounds_played,springfield_oaks_golf_revenue,glen_oaks_golf_rounds_played,glen_oaks_golf_revenue,red_oaks_waterpark_visitors,red_oaks_waterpark_revenue,waterford_oaks_waterpark_visitors,waterford_oaks_waterpark_revenue,groveland_oaks_campers,groveland_oaks_campground_revenue,addison_oaks_campers,addison_oaks_campground_revenue,temp_f,precip_in
0,2022-06-01,37,1184,32,1248,134,402,213,852,9,108,10,80,75,0.42
1,2022-06-02,45,1485,42,1344,287,1148,329,1316,13,169,11,110,90,0.07
2,2022-06-03,22,880,26,962,170,680,178,534,11,143,5,50,82,0.36
3,2022-06-04,23,782,30,1050,105,315,164,820,8,88,10,100,77,0.35
4,2022-06-05,46,1794,43,1376,380,1140,289,1156,13,169,17,204,87,0.14


In [123]:
# Weather readings get their own dataframe, one row per date
weather_columns = ['date', 'temp_f', 'precip_in']
weather = parks_data.loc[:, weather_columns]
weather.head()

Unnamed: 0,date,temp_f,precip_in
0,2022-06-01,75,0.42
1,2022-06-02,90,0.07
2,2022-06-03,82,0.36
3,2022-06-04,77,0.35
4,2022-06-05,87,0.14


In [125]:
def create_facility_df(df: pd.DataFrame, facility_substr: str) -> pd.DataFrame:
    """Select the ``date`` column plus every column whose name contains a substring.

    Parameters
    ----------
    df : pd.DataFrame
        Wide frame that has a ``date`` column.
    facility_substr : str
        Text to look for in the column names. It is matched case-insensitively
        and literally, so regex metacharacters such as ``(`` or ``.`` have no
        special meaning.

    Returns
    -------
    pd.DataFrame
        ``date`` followed by the matching columns in their original order.
        ``date`` appears exactly once, even when its own name contains the
        substring. If nothing matches, only ``date`` is returned.
    """
    matches = df.columns.str.contains(facility_substr, case=False, regex=False)
    # 'date' is always prepended below; exclude it here so it is never selected twice.
    cols = df.columns[matches & (df.columns != 'date')]
    return df[['date'] + cols.tolist()]


# Build one wide frame per facility type, selected by a substring of the column names
golf = create_facility_df(parks_data, 'golf')
waterpark = create_facility_df(parks_data, 'waterpark')
campground = create_facility_df(parks_data, 'camp')  # 'camp' matches both the '*_campers' and '*_campground_revenue' columns
golf.head()

Unnamed: 0,date,springfield_oaks_golf_rounds_played,springfield_oaks_golf_revenue,glen_oaks_golf_rounds_played,glen_oaks_golf_revenue
0,2022-06-01,37,1184,32,1248
1,2022-06-02,45,1485,42,1344
2,2022-06-03,22,880,26,962
3,2022-06-04,23,782,30,1050
4,2022-06-05,46,1794,43,1376


In [126]:
# Preview the waterpark frame
waterpark.head()

Unnamed: 0,date,red_oaks_waterpark_visitors,red_oaks_waterpark_revenue,waterford_oaks_waterpark_visitors,waterford_oaks_waterpark_revenue
0,2022-06-01,134,402,213,852
1,2022-06-02,287,1148,329,1316
2,2022-06-03,170,680,178,534
3,2022-06-04,105,315,164,820
4,2022-06-05,380,1140,289,1156


In [127]:

# Preview the campground frame
campground.head()

Unnamed: 0,date,groveland_oaks_campers,groveland_oaks_campground_revenue,addison_oaks_campers,addison_oaks_campground_revenue
0,2022-06-01,9,108,10,80
1,2022-06-02,13,169,11,110
2,2022-06-03,11,143,5,50
3,2022-06-04,8,88,10,100
4,2022-06-05,13,169,17,204


In [108]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide facility frame into one row per date and park.

    Every non-``date`` column name is split on ``_``: the first two tokens
    become the park name and the remaining tokens become the measure name.

    Parameters
    ----------
    df : pd.DataFrame
        Wide frame with a ``date`` column and one column per park/measure pair.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``date`` with a ``park_name`` column followed by one
        column per measure.
    """
    melted = df.melt(id_vars='date', var_name='intermediate', value_name='value')

    # Split each original column name once and reuse the tokens for both labels
    tokens = melted['intermediate'].str.split('_')
    labelled = melted.assign(
        park_name=tokens.str[:2].str.join('_'),
        variable=tokens.str[2:].str.join('_'),
    ).drop(columns='intermediate')

    per_park = labelled.pivot(index=['date', 'park_name'], columns='variable', values='value')
    # Keep 'date' as the index and move 'park_name' back to a regular column
    return per_park.reset_index(level='park_name')
    

# Build the long-format version of each facility frame
campground_long = wide_to_long(campground)
golf_long = wide_to_long(golf)
waterpark_long = wide_to_long(waterpark)

In [129]:
# Display the long-format waterpark frame (indexed by date, one row per date and park)
waterpark_long

variable,park_name,waterpark_revenue,waterpark_visitors
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-06-01,red_oaks,402,134
2022-06-01,waterford_oaks,852,213
2022-06-02,red_oaks,1148,287
2022-06-02,waterford_oaks,1316,329
2022-06-03,red_oaks,680,170
...,...,...,...
2024-08-29,waterford_oaks,1084,271
2024-08-30,red_oaks,570,114
2024-08-30,waterford_oaks,1032,258
2024-08-31,red_oaks,1280,256


In [130]:
# Save the dataframes to csv files
# Make sure the output directory exists so the cell also works on a fresh checkout
processed_data_path.mkdir(parents=True, exist_ok=True)

# Wide frames hold 'date' as a regular column, so the default integer index is not written
parks_data.to_csv(processed_data_path / 'parks-data.csv', index=False)
weather.to_csv(processed_data_path / 'weather.csv', index=False)

campground.to_csv(processed_data_path / 'campground.csv', index=False)
golf.to_csv(processed_data_path / 'golf.csv', index=False)
waterpark.to_csv(processed_data_path / 'waterpark.csv', index=False)

# Long frames carry 'date' in the index, so the index must be written;
# with index=False every row would lose its date
campground_long.to_csv(processed_data_path / 'campground-long.csv', index=True)
golf_long.to_csv(processed_data_path / 'golf-long.csv', index=True)
waterpark_long.to_csv(processed_data_path / 'waterpark-long.csv', index=True)