In [1]:
!pip3 install -U sagemaker
!pip3 install polars

Collecting sagemaker
  Using cached sagemaker-2.199.0-py2.py3-none-any.whl
Collecting boto3<2.0,>=1.33.3 (from sagemaker)
  Downloading boto3-1.33.7-py3-none-any.whl.metadata (6.7 kB)
Collecting uvicorn==0.22.0 (from sagemaker)
  Using cached uvicorn-0.22.0-py3-none-any.whl (58 kB)
Collecting fastapi==0.95.2 (from sagemaker)
  Using cached fastapi-0.95.2-py3-none-any.whl.metadata (24 kB)
Collecting docker (from sagemaker)
  Using cached docker-6.1.3-py3-none-any.whl.metadata (3.5 kB)
Collecting starlette<0.28.0,>=0.27.0 (from fastapi==0.95.2->sagemaker)
  Using cached starlette-0.27.0-py3-none-any.whl.metadata (5.8 kB)
Collecting h11>=0.8 (from uvicorn==0.22.0->sagemaker)
  Using cached h11-0.14.0-py3-none-any.whl (58 kB)
Collecting botocore<1.34.0,>=1.33.7 (from boto3<2.0,>=1.33.3->sagemaker)
  Downloading botocore-1.33.7-py3-none-any.whl.metadata (6.1 kB)
Collecting s3transfer<0.9.0,>=0.8.2 (from boto3<2.0,>=1.33.3->sagemaker)
  Using cached s3transfer-0.8.2-py3-none-any.whl.metadata

In [3]:
import xgboost as xgb
import pandas as pd
import sys
import os

In [4]:
# Create an empty regressor and restore the trained booster from the JSON file
# under models/. `verbosity=0` suppresses XGBoost log output; it supersedes the
# legacy `silent` flag, so that keyword is not passed.
model = xgb.XGBRegressor(verbosity=0)
model.load_model(os.path.join("models", "model_xgboost.json"))

In [5]:
# Lookup tables from the staging area. Each table's generic "id"/"name" columns
# are renamed to table-specific names so they stay distinguishable after merging.
stations = (
    pd.read_csv(os.path.join("staging_data", "station.csv"))
    .rename(columns={"id": "stationid", "name": "station_name"})
)

locations = (
    pd.read_csv(os.path.join("staging_data", "location.csv"))
    .rename(columns={"id": "locationid", "name": "location_name"})
)

locationcategories = (
    pd.read_csv(os.path.join("staging_data", "locationcategory.csv"))
    .rename(columns={"id": "locationcategoryid", "name": "locationcategory_name"})
)

# Relation table enriched with station, location and category attributes.
# Inner joins drop relation rows whose key is missing from any lookup table.
station_relations = (
    pd.read_csv(os.path.join("staging_data", "stationrelation.csv"))
    .merge(stations, on="stationid", how="inner")
    .merge(locations, on="locationid", how="inner")
    .merge(locationcategories, on="locationcategoryid", how="inner")
)

In [7]:
def predict(locationcategory_name, location_name, start_date, duration, station_relations, estimator=None):
    """Predict daily weather values for every location matching a name query.

    Stations are selected from ``station_relations`` by case-insensitive,
    literal substring matches on the location-category name and the location
    name. One feature row is built per (date, station) pair, the estimator is
    applied, and the per-station predictions are averaged per location and date.

    Parameters
    ----------
    locationcategory_name : str
        Substring to look for in the ``locationcategory_name`` column.
    location_name : str
        Substring to look for in the ``location_name`` column.
    start_date : str or datetime-like
        First day of the forecast; parsed with ``pd.to_datetime``.
    duration : int or str
        Number of days to add to ``start_date``; converted with ``int``. The
        date range includes both endpoints, so ``duration + 1`` dates are
        produced.
    station_relations : pandas.DataFrame
        Must provide the columns ``stationid``, ``latitude``, ``longitude``,
        ``elevation``, ``location_name`` and ``locationcategory_name``.
    estimator : object, optional
        Object whose ``predict(X)`` returns one row per feature row and five
        columns (tmax, tmin, prcp, snow, snwd). Defaults to the notebook-level
        ``model``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Location``, ``Date``, ``TMAX (C)``, ``TMIN (C)``,
        ``PRCP (cm)``, ``SNOW (cm)``, ``SNWD (cm)``; one row per location and
        date. Empty (with these columns) when no station matches the query.

    Raises
    ------
    ValueError
        If ``duration`` is negative.
    """
    category_key = locationcategory_name.lower()
    location_key = location_name.lower()
    start_date = pd.to_datetime(start_date)
    duration = int(duration)
    if duration < 0:
        raise ValueError(f"duration must be non-negative, got {duration}")

    value_columns = ["TMAX (C)", "TMIN (C)", "PRCP (cm)", "SNOW (cm)", "SNWD (cm)"]
    # Order in which features are handed to the estimator: calendar features
    # first, then station coordinates.
    feature_columns = [
        "year", "quarter", "month", "week", "day_of_year", "is_leap_year",
        "latitude", "longitude", "elevation",
    ]

    # regex=False: the query is a plain name, so characters such as "(" or "."
    # must match literally. na=False: rows with a missing name never match
    # (a NaN in the mask would make .loc raise).
    category_match = (
        station_relations["locationcategory_name"].str.lower().str.contains(category_key, regex=False, na=False)
    )
    location_match = (
        station_relations["location_name"].str.lower().str.contains(location_key, regex=False, na=False)
    )
    filtered_stations = station_relations.loc[
        category_match & location_match,
        ["stationid", "latitude", "longitude", "elevation", "location_name"],
    ].drop_duplicates()

    if filtered_stations.empty:
        # Nothing to predict for; return a well-formed empty result.
        return pd.DataFrame(columns=["Location", "Date"] + value_columns)

    # Both endpoints are included, i.e. duration + 1 calendar days.
    date_index = pd.date_range(start_date, start_date + pd.Timedelta(days=duration), freq="D")
    dates = pd.DataFrame(
        [
            {
                "date": day.date().isoformat(),
                "year": day.year - 2010,  # year expressed as an offset from 2010
                "quarter": day.quarter,
                "month": day.month,
                "week": day.week,
                "day_of_year": day.day_of_year,
                "is_leap_year": int(day.is_leap_year),
            }
            for day in date_index
        ]
    )

    # One row per (date, station) combination.
    ref = dates.merge(filtered_stations, how="cross")
    X_test = ref[feature_columns]

    regressor = model if estimator is None else estimator
    # Raw predictions are divided by 10 -- presumably the targets are stored in
    # tenths of the reported unit; TODO confirm against the training data.
    scaled = regressor.predict(X_test) / 10
    Y_pred = pd.DataFrame(scaled, columns=value_columns, index=ref.index)

    labels = ref[["date", "location_name"]].rename(
        columns={"date": "Date", "location_name": "Location"}
    )
    out = pd.concat([labels, Y_pred], axis=1)
    # Average over all stations belonging to the same location.
    return out.groupby(["Location", "Date"]).mean().reset_index()

In [16]:
# Example query: one week starting 2023-12-30 for San Francisco in the CITY
# category. `duration` is given as a string; predict() converts it with int().
locationcategory_name, location_name = "CITY", "San Francisco"
start_date, duration = "2023-12-30", "7"

predict(
    locationcategory_name=locationcategory_name,
    location_name=location_name,
    start_date=start_date,
    duration=duration,
    station_relations=station_relations,
)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,Location,Date,TMAX (C),TMIN (C),PRCP (cm),SNOW (cm),SNWD (cm)
0,"San Francisco, CA US",2023-12-30,7.196602,4.007489,16.250647,0.009818,-0.681265
1,"San Francisco, CA US",2023-12-31,7.196602,4.007489,16.250647,0.009818,-0.681265
2,"San Francisco, CA US",2024-01-01,8.431298,2.279749,3.353752,-0.052752,1.54534
3,"San Francisco, CA US",2024-01-02,8.431298,2.279749,3.353752,-0.052752,1.54534
4,"San Francisco, CA US",2024-01-03,8.431298,2.279749,4.14371,-0.009639,1.54534
5,"San Francisco, CA US",2024-01-04,9.103164,3.13793,3.75198,-0.00376,1.54534
6,"San Francisco, CA US",2024-01-05,9.103164,3.13793,3.75198,-0.00376,1.54534
7,"San Francisco, CA US",2024-01-06,9.103164,3.13793,3.75198,0.386499,1.460063
