## Install package requirements and import dependencies

In [2]:
!pip install -r requirements.txt --quiet

import openmeteo_requests
import numpy as np
from dotenv import load_dotenv
import pandas as pd
import requests_cache
import subprocess
from retry_requests import retry
from io import StringIO
import hopsworks
import great_expectations as ge
from datetime import date
import json
import time
import statistics
import time
import copy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


#### Enable debug mode?

In [3]:
debug = False

## Load environment variables from the .env file

In [4]:
load_dotenv()

True

## Connect to hopsworks

In [5]:
project = hopsworks.login()

2026-01-03 14:37:49,704 INFO: Initializing external client
2026-01-03 14:37:49,704 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-03 14:37:51,815 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1271967


In [6]:
fs = project.get_feature_store()

### Define expectation suites for ski weather

In [7]:
min_year = 1940
max_year = date.today().year - 1
# the columns for the pandas dataframe which this notebook will upload to hopsworks
yw_columns = ["date", "ski_resort_id", "closed", "mean_week_temperature"]

In [8]:
id_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column":"ski_resort_id",
        "min_value":0,
        "max_value": 1000000000
    }
)

temperature_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column":"mean_week_temperature",
        "min_value":-100,
        "max_value":20
    }
)

temperature_expectation_2 = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_max_to_be_between",
    kwargs={
        "column":"mean_week_temperature",
        "min_value":0,
        "max_value":100
    }
)

In [9]:
ski_weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="ski_weather_expectation_suite"
)
ski_weather_expectation_suite.add_expectation(id_expectation)
ski_weather_expectation_suite.add_expectation(temperature_expectation)
ski_weather_expectation_suite.add_expectation(temperature_expectation_2)

{"expectation_type": "expect_column_max_to_be_between", "kwargs": {"column": "mean_week_temperature", "min_value": 0, "max_value": 100}, "meta": {}}

In [10]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)
meteo_url = "https://archive-api.open-meteo.com/v1/archive"

## synchronize former ski resorts data

In [11]:
closed_resorts_fg = fs.get_feature_group(name='former_resorts', version=1)
cr_df = closed_resorts_fg.read(dataframe_type="pandas")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.04s) 


In [12]:
cr_df

Unnamed: 0,id,name,year_closed,latitude,longitude
0,262,Lutsen North Shore Ski Area,1989,47.6676,-90.7590
1,159,Schroon Lake Ski Center,1975,43.8081,-73.7594
2,186,Manning Park Ski Area,1982,48.7770,-121.4070
3,131,Mount Whittier Ski Area,1985,43.8000,-71.2000
4,523,Bielatal Ski Area,1999,50.8800,14.2100
...,...,...,...,...,...
236,216,Mount Joseph Ski Area,1984,44.3721,-72.3459
237,171,Chalet Ski Area,1985,43.7086,-85.0530
238,43,Bovensmolen Ski Resort,2005,50.9083,5.8333
239,284,White Birch Ski Area,1976,43.9712,-71.7084


### Get start year and end year

For measuring temperature. Every closed down ski resort

In [13]:
cr_df.insert(2, "start_year", min_year)
cr_df.insert(4, "end_year", max_year)

In [14]:
# set start_year and end_year for every resort
for i, resort in cr_df.iterrows():
    resort["start_year"] = resort["year_closed"] - 15
    resort["end_year"] = resort["year_closed"] + 15

    # enforce year interval to be in between dates which have data on open-meteo
    if resort["start_year"] < min_year:
        resort["start_year"] = min_year
    if resort["end_year"] > max_year:
        resort["end_year"] = max_year

    cr_df.loc[cr_df['id']==resort["id"], 'start_year'] = resort["start_year"]
    cr_df.loc[cr_df['id']==resort["id"], 'end_year'] = resort["end_year"]

print(cr_df)

      id                                  name  start_year  year_closed  \
0    262           Lutsen North Shore Ski Area        1974         1989   
1    159               Schroon Lake Ski Center        1960         1975   
2    186                 Manning Park Ski Area        1967         1982   
3    131               Mount Whittier Ski Area        1970         1985   
4    523                     Bielatal Ski Area        1984         1999   
..   ...                                   ...         ...          ...   
236  216                 Mount Joseph Ski Area        1969         1984   
237  171                       Chalet Ski Area        1970         1985   
238   43                Bovensmolen Ski Resort        1990         2005   
239  284                  White Birch Ski Area        1961         1976   
240  248  Whitefish Mountain Resort (Old Area)        1965         1980   

     end_year  latitude  longitude  
0        2004   47.6676   -90.7590  
1        1990   43.8081  

## Synchronize current ski resort data

In [15]:
current_resorts_fg = fs.get_feature_group(name='current_resorts', version=1)
or_df = current_resorts_fg.read(dataframe_type="pandas")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.26s) 


In [16]:
or_df

Unnamed: 0,id,name,latitude,longitude
0,1226505097,Torgnon,45.814452,7.554285
1,601135063,Font d'Urle Chaud Clapier,44.910152,5.323491
2,1254287966,Ristolas en Queyras,44.771783,6.960893
3,601115623,Alpe Devero,46.307671,8.252052
4,7752047,San Martino di Castrozza - Passo Rolle,46.268927,11.792439
...,...,...,...,...
834,45409595,Antagnod,45.822300,7.682800
835,1227121146,Gitschenen – Isenthal,46.899355,8.501497
836,601131935,Saint Luc - Chandolin,46.236511,7.625363
837,642545662,Seefeld - Gschwandtkopf,47.317186,11.171586


In [17]:
# only fetch the first 100 resorts. Temporary change so that we don't run out of api requests
for i in range(100, len(or_df.id-100)):
    or_df = or_df.drop([i])

In [18]:
or_df

Unnamed: 0,id,name,latitude,longitude
0,1226505097,Torgnon,45.814452,7.554285
1,601135063,Font d'Urle Chaud Clapier,44.910152,5.323491
2,1254287966,Ristolas en Queyras,44.771783,6.960893
3,601115623,Alpe Devero,46.307671,8.252052
4,7752047,San Martino di Castrozza - Passo Rolle,46.268927,11.792439
...,...,...,...,...
95,542214331,La Thuile,45.694910,6.919895
96,1130708403,Raten,47.139807,8.667219
97,1353981911,Skiweltcup Garmisch-Kandahar Rennen,47.470066,11.060259
98,540665061,Santa Caterina,46.398667,10.488794


## Prepare request to open meteo

For closed resorts, we call open-meteo once for every year from 1940 to the year before the current year. For each year call, we specify what ski resorts (coordinates) that need temperature measurements for the winter season.

For open resorts, we call for every year the last 15 years.

In [19]:
# for each year, get all resorts which need temperature for each date
def get_years_lat_long(start_year: int, end_year: int, resort_df: pd.DataFrame, closed: bool):
    years = range(start_year, end_year)
    years_lat_long = pd.DataFrame()
    for year in years:
        pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31")
        ids = []
        latitudes = []
        longitudes = []
        for i, resort in resort_df.iterrows():
            # open resorts
            if(not closed):
                ids.append(resort["id"])
                latitudes.append(resort["latitude"])
                longitudes.append(resort["longitude"])
            # closed resorts
            elif year >= resort["start_year"] and year <= resort["end_year"]:
                ids.append(resort["id"])
                latitudes.append(resort["latitude"])
                longitudes.append(resort["longitude"])
    
        years_lat_long_elem = { 
             "year": year,
             "ski_resort_ids": ids,
             "latitudes": latitudes, 
             "longitudes": longitudes
        }
        years_lat_long_elem = pd.DataFrame(years_lat_long_elem)
        if years_lat_long.empty:
            years_lat_long = years_lat_long_elem
        else:
            years_lat_long = pd.concat([years_lat_long, years_lat_long_elem])
    
    if debug:
        years_lat_long_json = json.dumps(years_lat_long)
        print(years_lat_long_json)
    return years_lat_long

### Get latitudes and longitudes for dates for open and closed resorts

In [20]:
closed_years_lat_long = get_years_lat_long(min_year, max_year+1, cr_df, True)
closed_years_lat_long

Unnamed: 0,year,ski_resort_ids,latitudes,longitudes
0,1941,151,46.6378,-121.3910
0,1942,151,46.6378,-121.3910
0,1943,151,46.6378,-121.3910
0,1944,151,46.6378,-121.3910
0,1945,151,46.6378,-121.3910
...,...,...,...,...
10,2025,129,44.2022,-72.9411
11,2025,125,42.8128,-76.0214
12,2025,841,44.5000,6.1500
13,2025,306,43.0059,-72.2193


In [21]:
temp = closed_years_lat_long[closed_years_lat_long["year"] == 1979]
temp.size

624

In [22]:
num_hist_years_or = 30
open_years_lat_long = get_years_lat_long(max_year-num_hist_years_or, max_year+1, or_df, False)
open_years_lat_long

Unnamed: 0,year,ski_resort_ids,latitudes,longitudes
0,1995,1226505097,45.814452,7.554285
1,1995,601135063,44.910152,5.323491
2,1995,1254287966,44.771783,6.960893
3,1995,601115623,46.307671,8.252052
4,1995,7752047,46.268927,11.792439
...,...,...,...,...
95,2025,542214331,45.694910,6.919895
96,2025,1130708403,47.139807,8.667219
97,2025,1353981911,47.470066,11.060259
98,2025,540665061,46.398667,10.488794


### Function definition for converson from daily temperatures to weekly

* Each month gets divided into 4-5 weeks.
* The first week of a month always starts on the first date of the month (contrary to how weeks work in the usual sense). 
* The fifth week of the month either consists of 0, 1, 2 or 3 days, depending on the number of days of the month.
* Leap years are taken into account (i.e all february 29:th days are included)

In [23]:

# Convert the year_daily_ski_weather_dataframe to weekly weather
def daily_to_weekly(yd_df: pd.DataFrame, current_year: int, yw_columns: list):
    months = [1, 2, 3, 11, 12]
    weeks = [1,2,3,4,5]
    yw_df = pd.DataFrame(columns=yw_columns)
    leap_year = False
    
    yd_df = yd_df.sort_values(by=['resort_id', 'date'])
    num_days_in_year = len(yd_df.loc[yd_df['resort_id'] == yd_df["resort_id"].iloc[:1].item()].index)
    
    if num_days_in_year == 151:
        leap_year = False
    elif num_days_in_year == 152:
        leap_year = True
    else:
        raise Exception(f"Error. Winter year consists of {num_days_in_year} days. Must be 151 or 152")
    
    for sr_id_index in range(0, len(yd_df["resort_id"].unique())):
        current_day = 0
        sr_id = (yd_df["resort_id"].unique())[sr_id_index]
        for month in months:
            for week in weeks:
                # capture all days, no matter which month
                if  week < weeks[len(weeks)-1]:
                    days_in_week = range(7*(week-1)+1, 7*(week)+1)
                else:
                    if month == 2:
                        if not leap_year:
                            days_in_week = []
                        else:
                            days_in_week = [29]
                    elif month == 11:
                        days_in_week = [29,30]
                    else:
                        days_in_week = [29,30,31]
                # calculate the mean week temperature and insert it into dataframe   
                week_temps = []
                for day in days_in_week: 
                    today_index = sr_id_index*num_days_in_year+current_day
                    # santiy check. Paranoid code is bad practice though...
                    if not yd_df.date.iloc[today_index].day == date(current_year, month, day).day:
                        raise Exception(f"Error. Something went wrong when calculating dates. Expected {yd_df.date.iloc[today_index].date}, got {date(current_year, month, day)}")
                    if not yd_df.resort_id.iloc[today_index] == sr_id:
                        raise Exception(f"Error. Something went wrong in ski resort index. Expected {yd_df.resort_id.iloc[today_index]}, got {sr_id}")
                    
                    temperature_today = yd_df.temperature_2m_mean.iloc[today_index]
                    end = time.time()
                    
                    if not temperature_today.size == 1:
                        if debug:
                            print(f"The week is: {week}. current date: {current_year}-{month}-{day}")
                            print("Temperature today:", temperature_today)
                        raise Exception("Multiple or no temperature values found for resort_id and date!")
                    week_temps.append(temperature_today.item())
                    current_day += 1
                # edge case for february on non leap years, who won't have a fifth week
                if len(week_temps) == 0:
                    continue
                mean_week_temp = statistics.mean(week_temps)
                week_date = f"{current_year}-{month}-{week}"
                
                elem_data = np.array([[week_date, sr_id, True, mean_week_temp]])
                elem_row = pd.DataFrame(data=elem_data, columns=yw_columns)
                yw_df = pd.concat([yw_df, elem_row])
    return yw_df
        


### Function definition for calling open meteo API and reformat to weekly dates

In [24]:


# get weekly temperatures for the winter season for a specific year for a number of ski resorts
# current_year: the year to get daily temperatures from
# lat_long: a dictionary containing the fields "latitude" and "longitude", which list coordinates for all ski resorts
def get_winter_year_temperatures(lat_long: dict, current_year: int, is_closed_resorts: bool):
    num_requests = 0
    # parameters to send to open-meteo
    params_1 = {
        "latitude": lat_long["latitudes"],
        "longitude": lat_long["longitudes"],
        "start_date": f"{current_year}-01-01",
        "end_date": f"{current_year}-03-31",
        "daily": "temperature_2m_mean",
    }
    params_2 = copy.deepcopy(params_1)
    params_2["start_date"] = f"{current_year}-11-01"
    params_2["end_date"] = f"{current_year}-12-31"
    params = [params_1, params_2]
    y_d_ski_weather_df = pd.DataFrame()

    # loop once for january-march dates, and once for november-december dates
    for i in range(0, len(params)):
        param = params[i]
        # send api requests
        successful_request = False
        num_requests += 1
        if debug:
            print(f"attempting request for param {param}")
        num_unsuccessful_requests = 0
        while(not successful_request):
            try:
                responses = openmeteo.weather_api(meteo_url, params=param)
                successful_request = True
            except:
                num_unsuccessful_requests += 1
                print(f"rate limit exceeded at {num_requests} requests. Sleeping for 60 seconds...")
                time.sleep(60)
                print("Sleep done")
            if num_unsuccessful_requests >= 10:
                print("Too many failed API requests. Aborting")
                raise Exception("Failed to fetch data from OpenMeteo API")
        if debug:
            print("current year:", current_year, ", num responses: ", len(responses))
        # Process
        for resort_index in range(0, len(responses)):
            response = responses[resort_index]
            daily = response.Daily()
            daily_temperature_2m_mean = daily.Variables(0).ValuesAsNumpy()
        
            daily_data = {"date": pd.date_range(
            	start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
            	end =  pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
            	freq = pd.Timedelta(seconds = daily.Interval()),
            	inclusive = "left"
            )}
            daily_data["temperature_2m_mean"] = daily_temperature_2m_mean
            daily_data["closed"] = is_closed_resorts
            daily_data["resort_id"] = year_lat_long["ski_resort_ids"][resort_index]
            daily_dataframe = pd.DataFrame(data = daily_data)
            
            y_d_ski_weather_df = pd.concat([y_d_ski_weather_df, daily_dataframe])
            
    y_w_ski_weather_df = daily_to_weekly(y_d_ski_weather_df, current_year, yw_columns)
    
    # set correct types for columns
    y_w_ski_weather_df = y_w_ski_weather_df.astype({'date': 'datetime64[s]', 'ski_resort_id': 'int64', 'closed': 'bool', 'mean_week_temperature': 'float32'})
    y_w_ski_weather_df["closed"] = is_closed_resorts
    
    return y_w_ski_weather_df

### Check and fetch if any years for ski resorts are missing

In [25]:
# create/get feature store
ski_weather_fg = fs.get_or_create_feature_group(
    name='ski_weather',
    description='weekly ski resort weather for both closed and open resorts',
    version=1,
    primary_key=['ski_resort_id', 'closed', 'date'],
    expectation_suite=ski_weather_expectation_suite
)

In [96]:
hw_ski_weather_df = ski_weather_fg.read(dataframe_type="pandas")
hw_ski_weather_df

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (27.84s) 


Unnamed: 0,date,ski_resort_id,closed,mean_week_temperature
0,1968-12-05 00:00:00+00:00,842,True,-14.206722
1,1978-03-05 00:00:00+00:00,273,True,2.645361
2,1962-03-02 00:00:00+00:00,150,True,-11.163786
3,1974-11-04 00:00:00+00:00,146,True,-0.513095
4,1972-01-04 00:00:00+00:00,273,True,-4.874976
...,...,...,...,...
251140,2008-12-05 00:00:00+00:00,341889677,False,-13.445277
251141,2008-02-04 00:00:00+00:00,665166827,False,1.835917
251142,2008-03-05 00:00:00+00:00,341889677,False,2.288055
251143,2008-01-01 00:00:00+00:00,665124161,False,-7.188928


In [97]:
fetched_year = -1
fetched_resorts = "closed"
found_empty_year = False
closed_open = True

# loop through closed resorts first, then open resorts
for years_lat_long in [closed_years_lat_long, open_years_lat_long]:
    for year in range(min_year, max_year+1):
        # filter year and closed/open resort
        year_elems = hw_ski_weather_df[hw_ski_weather_df['date'].dt.year == year]
        year_elems = year_elems[year_elems['closed'] == closed_open]
        if debug:
            print(f"current year: {year} year_elems current size:", year_elems.size)
        # If data not present for current year on hopsworks and
        # if the year SHOULD exist on hopsworks
        if year_elems.empty and ((years_lat_long["year"] == year).any()):
            # year does not exist on hopsworks and there exists resorts which need historical
            # data from that year. Start fetching data from that year
            fetched_year = year
            
            year_lat_long = years_lat_long.loc[years_lat_long['year']==year]
            if debug:
                print(f"calling get_winter_year_temperatures with year: {fetched_year} for {fetched_resorts} resorts")
            found_empty_year = True
            ski_weather_df = get_winter_year_temperatures(year_lat_long, year, closed_open)
            break
                
    if found_empty_year:
        break
    # change variables for next iteration, where we will check open resorts
    fetched_resorts = "open"
    closed_open = False

print(f"done with fetching weekly temperature data on year {fetched_year} for {fetched_resorts} resorts")

done with fetching weekly temperature data on year 2009 for open resorts


In [98]:
ski_weather_df

Unnamed: 0,date,ski_resort_id,closed,mean_week_temperature
0,2009-01-01,7752047,False,-8.224833
0,2009-01-02,7752047,False,-5.136441
0,2009-01-03,7752047,False,-2.873940
0,2009-01-04,7752047,False,-3.373048
0,2009-01-05,7752047,False,-3.657472
...,...,...,...,...
0,2009-12-01,1456843659,False,-2.857405
0,2009-12-02,1456843659,False,-4.728833
0,2009-12-03,1456843659,False,-11.538059
0,2009-12-04,1456843659,False,-3.607702


In [84]:
ski_weather_df

Unnamed: 0,date,ski_resort_id,closed,mean_week_temperature
0,2007-01-01,7752047,False,-2.371262
0,2007-01-02,7752047,False,1.223679
0,2007-01-03,7752047,False,1.662964
0,2007-01-04,7752047,False,-5.713524
0,2007-01-05,7752047,False,-3.891500
...,...,...,...,...
0,2007-12-01,1456843659,False,-0.781512
0,2007-12-02,1456843659,False,-4.345500
0,2007-12-03,1456843659,False,-9.166036
0,2007-12-04,1456843659,False,-5.651750


## Upload fetched year data to hopsworks

In [99]:
if not fetched_year == -1:
    ski_weather_fg.insert(ski_weather_df)

2026-01-03 15:43:04,939 INFO: 	3 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1271967/fs/1258570/fg/1876457


Uploading Dataframe: 100.00% |██| Rows 2400/2400 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: ski_weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1271967/jobs/named/ski_weather_1_offline_fg_materialization/executions
