### Install package requirements and import dependencies

In [1]:
!pip install -r requirements.txt --quiet


from dotenv import load_dotenv
import pandas as pd
import requests_cache
import subprocess
from retry_requests import retry
from io import StringIO
import hopsworks
import great_expectations as ge
from datetime import date
import json


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Load Environment variables from the .env file

In [2]:
load_dotenv()

True

## Download&format ski resort data

### Fetch former ski resorts data

In [8]:
# create supabase credential to authenticate towards endpoint
result = subprocess.run(
    ["curl", "https://abandonedskitowns.com/get_key.php"],    
    capture_output=True,
    text=True,
    check=True
)
api_key = result.stdout

# query the supabase instance for all ski resorts
command = [
    "curl",
    "https://uffrhqrrlipovcnrmgcz.supabase.co/rest/v1/main?select=*",
    "-H",
    f"apikey:{api_key}"
]

result = subprocess.run(
    command,
    capture_output=True,
    text=True,
    check=True
)

# save the output json
closed_resorts_string = result.stdout

### Format&filter former ski resorts data

In [9]:
# convert closed resorts JSON to pandas object
df_cr = pd.read_json(StringIO(closed_resorts_string))

# filter out all resorts that don't have a closing date
df_cr = df_cr[~df_cr["year_closed"].isna()]
df_cr = df_cr[~df_cr["year_closed"].str.contains("Unknown")]

# filter out all resorts that specify decade instead of exact year
df_cr = df_cr[~df_cr["year_closed"].str.contains("s")]

# convert closed year to int
df_cr['year_closed'] = pd.to_numeric(df_cr['year_closed'], downcast='integer', errors='coerce')

# filter out all resorts which are not in Europe or North America
df_cr = df_cr[(df_cr["area"] == "Europe") | (df_cr["area"] == "North America")]

# filter out all columns except id, name, closing year, latitude, longitude
df_cr = df_cr.filter(items=['id', 'name', 'year_closed', 'latitude', 'longitude'])

In [5]:
df_cr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 241 entries, 0 to 387
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           241 non-null    int64  
 1   name         241 non-null    object 
 2   year_closed  241 non-null    int16  
 3   latitude     241 non-null    float64
 4   longitude    241 non-null    float64
dtypes: float64(2), int16(1), int64(1), object(1)
memory usage: 9.9+ KB


### Fetch current ski resorts data

In [14]:
# query the Overpass API for all winter sports areas
result = subprocess.run(
    [
        "curl", 
        "-X", 
        "POST", 
        "https://overpass-api.de/api/interpreter", 
        "--data", 
        '[out:json][timeout:1000];area(id:3602698607)->.searchArea;(way["landuse"="winter_sports"](area.searchArea);relation["landuse"="winter_sports"](area.searchArea););out geom;'
    ],    
    capture_output=True,
    text=True,
    check=True
)

open_resorts_string = result.stdout

### Format&filter current ski resorts data

In [20]:
# need to convert to dicitionary so that pandas doesn't get confused by the metadata
open_resorts_dict = json.loads(open_resorts_string)
ids = []
lats = []
lons = []
names = []

# extract relevant data from the response
for elem in open_resorts_dict["elements"]:
    # filter out all datapoints that do not have a name
    if "name" in elem["tags"]:
        names.append(elem["tags"]["name"])    
        ids.append(elem["id"])
        med_lat = (elem["bounds"]["minlat"] + elem["bounds"]["maxlat"]) / 2
        med_lon = (elem["bounds"]["minlon"] + elem["bounds"]["maxlon"]) / 2
        lats.append(med_lat)
        lons.append(med_lon)
        
# input relevant data to a new list of dicts
rel_data = []
for i in range(0, len(ids)):
    obj = {"id": ids[i], "name": names[i], "latitude": lats[i], "longitude": lons[i]}
    rel_data.append(obj)

# convert open resorts JSON to pandas object
df_or = pd.DataFrame.from_dict(rel_data)

print(df_or)



           id                            name   latitude  longitude
0    23079840                      Isola 2000  44.187091   7.147525
1    25094717   Orlen Arena Oberstdorf Allgäu  47.405426  10.293518
2    29188395               Katschberg-Aineck  47.064452  13.643185
3    29463355                        Nassfeld  46.584380  13.270765
4    32150269             Sella Nevea - Kanin  46.362536  13.498806
..        ...                             ...        ...        ...
834  17261990  Espace Haute Maurienne Vanoise  45.255045   6.853167
835  17273806       Domaine Autrans - Méaudre  45.178097   5.543480
836  19357160                          Arolla  46.025974   7.462880
837  19433085                          Colere  45.975059  10.056069
838  19457834                Portes du Soleil  46.221523   6.716058

[839 rows x 4 columns]


## Hopsworks validation&upload

### Define data validation rules

#### both open and closed resorts
Year should not be lower than 1900 and not higher than the current year

#### closed resorts
Latitude should not be lower than 0 and not higher than 90 (alpine and north american region)
Longitude should not be lower than -180 and not higher than 180 (alpine and north american region)

#### open resorts
Latitude should not be lower than 30 and not higher than 40 (alpine region)
Longitude should not be lower than 0 and not higher than 20 (alpine region)


In [29]:
year_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column":"year_closed",
        "min_value":1900,
        "max_value":date.today().year
    }
)

latitude_closed_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column":"latitude",
        "min_value":0,
        "max_value":90,
    }
)

longitude_closed_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column":"longitude",
        "min_value":-180,
        "max_value":180,
    }
)

latitude_open_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column":"latitude",
        "min_value":40,
        "max_value":50,
    }
)

longitude_open_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column":"longitude",
        "min_value":0,
        "max_value":20,
    }
)

In [37]:
closed_resort_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="closed_resort_expectation_suite"
)
closed_resort_expectation_suite.add_expectation(year_expectation)
closed_resort_expectation_suite.add_expectation(latitude_closed_expectation)
closed_resort_expectation_suite.add_expectation(longitude_closed_expectation)
    
open_resort_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="open_resort_expectation_suite"
)
open_resort_expectation_suite.add_expectation(latitude_open_expectation)
open_resort_expectation_suite.add_expectation(longitude_open_expectation)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "longitude", "min_value": 0, "max_value": 20}, "meta": {}}

### Log in to hopsworks

In [23]:
project = hopsworks.login()

2025-12-26 15:02:41,439 INFO: Initializing external client
2025-12-26 15:02:41,441 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2025-12-26 15:02:44,502 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1271967


In [24]:
fs = project.get_feature_store()

### Send data to hopsworks feature store

#### Closed resorts

In [41]:
# create/get feature store
closed_resorts_fg = fs.get_or_create_feature_group(
    name='former_resorts',
    description='Ski resorts which have closed down for buisness',
    version=1,
    primary_key=['id'],
    expectation_suite=closed_resort_expectation_suite
)

In [42]:
# Insert Dataframe into feature group
closed_resorts_fg.insert(df_cr)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1271967/fs/1258570/fg/1878441
2025-12-26 15:17:15,774 INFO: 	3 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1271967/fs/1258570/fg/1878441


Uploading Dataframe: 100.00% |████| Rows 241/241 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: former_resorts_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1271967/jobs/named/former_resorts_1_offline_fg_materialization/executions


(Job('former_resorts_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "latitude",
           "min_value": 0,
           "max_value": 90
         },
         "meta": {
           "expectationId": 800781
         }
       },
       "result": {
         "observed_value": 32.7494,
         "element_count": 241,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-12-26T02:17:15.000774Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwar

#### Open resorts

In [31]:
# create/get feature store
open_resorts_fg = fs.get_or_create_feature_group(
    name='current_resorts',
    description='Ski resorts which are still in business',
    version=1,
    primary_key=['id'],
    expectation_suite=open_resort_expectation_suite
)

In [32]:
# Insert Dataframe into feature group
open_resorts_fg.insert(df_or)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1271967/fs/1258570/fg/1876393
2025-12-26 15:06:36,800 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1271967/fs/1258570/fg/1876393


Uploading Dataframe: 100.00% |████| Rows 839/839 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: current_resorts_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1271967/jobs/named/current_resorts_1_offline_fg_materialization/executions


(Job('current_resorts_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "longitude",
           "min_value": 0,
           "max_value": 20
         },
         "meta": {
           "expectationId": 799748
         }
       },
       "result": {
         "observed_value": 5.267084499999999,
         "element_count": 839,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-12-26T02:06:36.000800Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
  