In [13]:
import pandas as pd
import hopsworks
import datetime

In [14]:
# Yearly CSV exports; going by the file names they cover consecutive,
# non-overlapping date ranges from 2022-02-02 through 2025-10-31.
files = [
    'Toronto-Daily-traffic-data-2022-02-02-to-2022-12-31-EN.csv',
    'Toronto-Daily-traffic-data-2023-01-01-to-2023-12-31-EN.csv',
    'Toronto-Daily-traffic-data-2024-01-01-to-2024-12-31-EN.csv',
    'Toronto-Daily-traffic-data-2025-01-01-to-2025-10-31-EN.csv'
]

# Read every export into its own DataFrame
dfs = [pd.read_csv(file) for file in files]

# Stack the yearly frames into a single table with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Convert the date strings to datetime objects so that sorting is
# chronological rather than lexicographic
df['Traffic day'] = pd.to_datetime(df['Traffic day'])

# Order the rows by date and renumber the index to match the new order
df = df.sort_values(by='Traffic day').reset_index(drop=True)

# Leave the frame as the cell's last expression so the notebook renders it
# with its rich (truncated head/tail) display instead of print()'s plain text
df


     Traffic day  Traffic count
0     2022-02-02            NaN
1     2022-02-03        38188.0
2     2022-02-04        54432.0
3     2022-02-05        27499.0
4     2022-02-06        28120.0
...          ...            ...
1363  2025-10-27        31315.0
1364  2025-10-28        53543.0
1365  2025-10-29        75788.0
1366  2025-10-30        98024.0
1367  2025-10-31       120728.0

[1368 rows x 2 columns]


In [15]:
# Store the traffic counts as 32-bit floats; every other column is left as is.
df = df.astype({'Traffic count': 'float32'})

In [16]:
# Drop any rows with missing data
df.dropna(inplace=True)
df

Unnamed: 0,Traffic day,Traffic count
1,2022-02-03,38188.0
2,2022-02-04,54432.0
3,2022-02-05,27499.0
4,2022-02-06,28120.0
5,2022-02-07,49245.0
...,...,...
1363,2025-10-27,31315.0
1364,2025-10-28,53543.0
1365,2025-10-29,75788.0
1366,2025-10-30,98024.0


In [17]:
# Print the frame's index range, per-column non-null counts, dtypes and
# memory usage; handy for checking the result of the cast and the NaN removal.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1220 entries, 1 to 1367
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Traffic day    1220 non-null   datetime64[ns]
 1   Traffic count  1220 non-null   float32       
dtypes: datetime64[ns](1), float32(1)
memory usage: 23.8 KB


In [33]:
# Log in to Hopsworks and keep the returned project handle; the feature
# store is obtained from it in the next cell.
project = hopsworks.login()






Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1272004


In [34]:
# Handle to the feature store of the logged-in project.
fs = project.get_feature_store()

# Switch to snake_case column names before registering the features
# (presumably because Hopsworks feature names may not contain spaces or
# capital letters -- TODO confirm against the Hopsworks docs).
df = df.rename(
    columns={
        'Traffic day': 'traffic_day',
        'Traffic count': 'traffic_count',
    }
)

# Look up version 1 of the feature group, or set it up if it is not there yet.
# The date column serves as both primary key and event time, so each day is
# expected to identify exactly one row.
traffic_flow_fg = fs.get_or_create_feature_group(
    name='traffic_flow_fg',
    version=1,
    description='Traffic flow in Toronto each day',
    primary_key=['traffic_day'],
    event_time='traffic_day',
)

In [35]:
# Upload the cleaned rows to the feature group. Per the recorded output this
# launches the `traffic_flow_fg_1_offline_fg_materialization` job and returns
# a (Job, None) tuple, which the notebook echoes as the cell result.
traffic_flow_fg.insert(df)

Uploading Dataframe: 100.00% |██████████| Rows 1220/1220 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: traffic_flow_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1272004/jobs/named/traffic_flow_fg_1_offline_fg_materialization/executions


(Job('traffic_flow_fg_1_offline_fg_materialization', 'SPARK'), None)