In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [16]:
def transform(df):
    """Clean one raw weather export in place and return it.

    Steps, in order:
      1. Drop the columns that are not used downstream.
      2. Replace missing ``windgust`` values with the same row's ``windspeed``.
      3. Replace missing ``snow`` and ``snowdepth`` values with 0.
      4. Rename ``sealevelpressure`` -> ``pressure`` and ``datetime`` -> ``date``.
      5. Drop the ``name`` column.

    Missing pressure readings are deliberately left as NaN.

    Args:
        df: DataFrame containing at least the dropped columns, ``windgust``,
            ``windspeed`` and ``name``. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: if a column that must be dropped, ``windgust`` or
            ``windspeed`` is not present.
    """
    unused_columns = [
        'preciptype', 'severerisk', 'sunrise', 'sunset',
        'moonphase', 'description', 'icon', 'stations',
    ]
    df.drop(columns=unused_columns, inplace=True)

    # If windgust is missing, insert the windspeed of the same row.
    df.loc[df["windgust"].isna(), 'windgust'] = df['windspeed']

    # Only the snow columns are filled. The former `'pressure': np.NaN` entry
    # filled NaN with NaN (a no-op) and `np.NaN` no longer exists in NumPy 2.0.
    df.fillna({'snow': 0, 'snowdepth': 0}, inplace=True)

    # rename() silently skips keys that are not columns, so no membership check
    # is needed (the old `if "sealevelpressure" or ...` test was always true).
    df.rename(columns={"sealevelpressure": "pressure", "datetime": "date"}, inplace=True)

    df.drop(columns=['name'], inplace=True)
    return df

def assemble(path_to_file):
    """Read one comma-separated weather export and return it cleaned.

    Args:
        path_to_file: location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The DataFrame returned by ``transform`` for the loaded file.
    """
    return transform(pd.read_csv(path_to_file, sep=','))

In [17]:
# Start the combined frame from the 2013-2014 export; the other files are
# appended to it in the following cell.
# NOTE(review): absolute local path - this cell only runs on the author's machine.
final_df = assemble(
    path_to_file="C://Users//Admin//Desktop//github//ID2223//project//vienna_2013_2014.csv"
)

In [18]:
# Stack the remaining exports under the frame built so far, in the order listed.
# NOTE(review): final_df is both input and output, so re-running this cell on
# its own appends the same six files a second time.
final_df = pd.concat(
    [
        final_df,
        *map(
            assemble,
            (
                "C://Users//Admin//Desktop//github//ID2223//project//vienna_2014_2016.csv",
                "C://Users//Admin//Desktop//github//ID2223//project//vienna_2016_2018.csv",
                "C://Users//Admin//Desktop//github//ID2223//project//vienna_2018_2020.csv",
                "C://Users//Admin//Desktop//github//ID2223//project//vienna_2020_2022.csv",
                "C://Users//Admin//Desktop//github//ID2223//project//vienna_2022_2022.csv",
                "C://Users//Admin//Desktop//github//ID2223//project//vienna_2022_2023.csv",
            ),
        ),
    ]
)

In [21]:
# Persist the combined frame. The index is written as the first (unnamed)
# column, which is the pandas default made explicit here.
final_df.to_csv(
    path_or_buf="C://Users//Admin//Desktop//github//ID2223//project//final_df.csv",
    index=True,
)

In [None]:
import hopsworks
import os

# Must be set before hopsworks.login() runs.
# NOTE(review): presumably a conda-on-Windows DLL lookup workaround - confirm it is still required.
os.environ['CONDA_DLL_SEARCH_MODIFICATION_ENABLE'] = '1'
# login() is called without arguments, so no credentials are stored in this cell.
client= hopsworks.login()
stream = client.get_feature_store()

# Look up version 6 of the 'weather_fg' feature group (created if it does not
# exist yet, as the method name suggests - verify against the hopsworks docs).
# The 'date' primary key is the column transform() produces by renaming 'datetime'.
weather_dt = stream.get_or_create_feature_group(
    name = 'weather_fg',
    description = 'Weather characteristics of each day',
    version=6,
    primary_key=['date'],
    online_enabled=True
)
# Upload the combined weather frame into the feature group.
weather_dt.insert(final_df)

## Historical weather data EDA summary

- with my current subscription plan, I can only download 1000 days' worth of data per 24 hours
- 33 different parameters
- I dropped these variables: preciptype, severerisk, sunrise, sunset, moonphase, description, icon, stations.
- no missing dates, but lots of missing values consistently in windgust
- the way to deal with windgust: if missing, set it to the windspeed
- the way to deal with snow and snowdepth: if missing, set them to zero

- 2020-2022: 1 NaN in pressure
- 2014-2016: all snow values missing; snowdepth has 30 non-NaN values
- 2013-2014: all snow values missing; snowdepth has 6 non-NaN values
