In [8]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
from datetime import datetime, timedelta

In [601]:
class visitClean:
    def __init__(self, data):
        self.data = data

    def getDate(self, year, week):
        if week == 1:
            r = year % 7
            d = (12 - r) % 7
            if d == 0:
                return datetime(year, 6, 7)
            else:
                return datetime(year, 6, d)
        else:
            return self.getDate(year, week - 1) + timedelta(weeks = 1)

    def fromWeeksToDates(self):
        self.data['Dates'] = self.data.apply(lambda row: self.getDate(row['Year'], row['Week']), axis=1)
        self.data = self.data.drop(columns=['Year', 'Week'])
        self.data = self.data[['Dates'] + [col for col in self.data.columns if col != 'Dates']]

    def getResort(self, i):
        self.data = self.data.iloc[:, [0, i]]

In [603]:
class climateClean:
    def __init__(self, data):
        self.data = data

    def fromYMDtoDates(self):
        self.data['Dates'] = self.data.apply(lambda row: datetime(int(row['Year']), int(row['Month']), int(row['Day'])), axis=1)
        self.data.drop(columns=['Year', 'Month', 'Day'], inplace=True)
        self.data = self.data[['Dates'] + [col for col in self.data.columns if col != 'Dates']]

    def filterStation(self, station):
        self.data = self.data[self.data['Bureau of Meteorology station number'] == station]

    def getAve(self):
        self.data['WeekStart'] = self.data['Dates'] - pd.to_timedelta(self.data['Dates'].dt.weekday, unit='d')
        
        weekly = self.data.groupby('WeekStart').agg({
            'Maximum temperature (Degree C)': 'mean',
            'Minimum temperature (Degree C)': 'mean',
            'Rainfall amount (millimetres)': 'sum'
        }).rename(columns={
            'Maximum temperature (Degree C)': 'MaxTemp',
            'Minimum temperature (Degree C)': 'MinTemp',
            'Rainfall amount (millimetres)': 'Rain'
        }).reset_index()
        
        self.data = self.data.merge(weekly, on='WeekStart', how='left')
        
        self.data['Maximum temperature (Degree C)'] = self.data['MaxTemp']
        self.data['Minimum temperature (Degree C)'] = self.data['MinTemp']
        self.data['Rainfall amount (millimetres)'] = self.data['Rain']

        self.data.drop(columns=['WeekStart', 'MaxTemp', 'MinTemp', 'Rain'], inplace=True)


    def keepRelevantData(self, data):
        self.data = self.data[self.data['Dates'].isin(data['Dates'])]

    def combineData(self, data):
        self.data = pd.DataFrame(np.hstack([data, self.data.drop(['Dates'], axis = 1)]))

In [605]:
class Clean:
    def __init__(self, data):
        self.data = data

    def aboveYear(self, year):
        self.data = self.data[self.data[0].dt.year >= year]

In [607]:
xls = pd.ExcelFile("2025 Allianz Datathon Dataset.xlsx")
xls.sheet_names

['readme', 'Visitation Data', 'Climate Data']

In [608]:
visitDataRaw = xls.parse('Visitation Data')
baw = visitClean(visitDataRaw)

In [609]:
baw.fromWeeksToDates()

In [610]:
baw.getResort(1)


In [611]:
baw.data

Unnamed: 0,Dates,Mt. Baw Baw
0,2014-06-07,555
1,2014-06-14,804
2,2014-06-21,993
3,2014-06-28,2976
4,2014-07-05,11112
...,...,...
160,2024-08-13,5977
161,2024-08-20,3597
162,2024-08-27,1500
163,2024-09-03,0


In [612]:
climateDataRaw = xls.parse('Climate Data')
climateData = climateClean(climateDataRaw)
climateData.filterStation(85291)

In [613]:
climateData.fromYMDtoDates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['Dates'] = self.data.apply(lambda row: datetime(int(row['Year']), int(row['Month']), int(row['Day'])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.drop(columns=['Year', 'Month', 'Day'], inplace=True)


In [619]:
climateData.getAve()

In [620]:
climateData.keepRelevantData(baw.data)

In [621]:
climateData.data

Unnamed: 0,Dates,Bureau of Meteorology station number,Maximum temperature (Degree C),Minimum temperature (Degree C),Rainfall amount (millimetres)
1618,2014-06-07,85291,5.100000,1.042857,127.4
1625,2014-06-14,85291,4.957143,1.100000,24.4
1632,2014-06-21,85291,5.485714,0.814286,21.0
1639,2014-06-28,85291,1.042857,-1.814286,76.4
1646,2014-07-05,85291,2.528571,-0.957143,82.0
...,...,...,...,...,...
5338,2024-08-13,85291,7.885714,3.383333,11.4
5345,2024-08-20,85291,7.042857,2.128571,19.4
5352,2024-08-27,85291,4.685714,0.342857,25.6
5359,2024-09-03,85291,7.071429,-0.214286,48.4


In [627]:
climateData.combineData(baw.data)

In [629]:
cleanData = Clean(pd.DataFrame(climateData.data).drop(2, axis = 1).dropna())

In [633]:
cleanData.aboveYear(2020)

In [635]:
cleanData.data

Unnamed: 0,0,1,3,4,5
90,2020-06-01,1074,3.06,-1.671429,18.2
91,2020-06-08,544,5.6,-0.38,17.8
92,2020-06-15,603,4.142857,0.271429,18.6
93,2020-06-22,2168,2.242857,-1.342857,59.2
94,2020-06-29,4055,3.283333,-1.55,55.8
...,...,...,...,...,...
160,2024-08-13,5977,7.885714,3.383333,11.4
161,2024-08-20,3597,7.042857,2.128571,19.4
162,2024-08-27,1500,4.685714,0.342857,25.6
163,2024-09-03,0,7.071429,-0.214286,48.4
