In [15]:
import sqlite3
import pandas as pd

# Open the WARP SQLite database, list the tables it contains, and load the raw
# NED observations into `df`. The work is wrapped in try/finally so the
# connection is closed even when one of the queries raises.
conn = sqlite3.connect('../Data/WARP.db')
try:
    # check which tables are in the database
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql_query(query, conn)
    print(tables)
    # The table is listed as 'raw_ned_obs_2'; SQLite resolves table names
    # case-insensitively, so the mixed-case spelling below still matches it.
    df = pd.read_sql_query("SELECT * FROM raw_NED_obs_2", conn)
finally:
    conn.close()

                               name
0                       raw_ned_obs
1                 transform_ned_obs
2                 raw_weather_preds
3                   raw_weather_obs
4             process_weather_preds
5               transform_ned_obs_2
6                     raw_NED_preds
7               processed_NED_preds
8                    raw_entsoe_obs
9              transform_entsoe_obs
10                       raw_ned_df
11                     dim_datetime
12           raw_meteo_forecast_now
13          raw_meteo_preds_history
14                    raw_meteo_obs
15     transform_meteo_forecast_now
16            transform_weather_obs
17  transform_weather_preds_history
18                      master_warp
19                    raw_ned_obs_2


In [16]:
# Quick structural look at the raw frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head())
print("Unique values in 'type':", df['type'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12764 entries, 0 to 12763
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   @id                  12764 non-null  object
 1   @type                12764 non-null  object
 2   id                   12764 non-null  object
 3   point                12764 non-null  object
 4   type                 12764 non-null  object
 5   granularity          12764 non-null  object
 6   granularitytimezone  12764 non-null  object
 7   activity             12764 non-null  object
 8   classification       12764 non-null  object
 9   capacity             12764 non-null  object
 10  volume               12764 non-null  object
 11  percentage           12764 non-null  object
 12  emission             12764 non-null  object
 13  emissionfactor       12764 non-null  object
 14  validfrom            12764 non-null  object
 15  validto              12764 non-null  object
 16  last

In [17]:
# How many rows does each distinct 'type' value contribute?
type_counts = df.loc[:, 'type'].value_counts()
print(type_counts)

type
/v1/types/1     3191
/v1/types/2     3191
/v1/types/17    3191
/v1/types/20    3191
Name: count, dtype: int64


In [18]:
import pandas as pd

# Per-type overview of the raw NED observations already loaded into `df`.

# Grouped summary per type: number of rows and number of distinct 'validfrom'
# timestamps. The column count is attached afterwards: selecting the rows of
# one type never changes how many columns the frame has, so it is simply
# df.shape[1] for every group (no per-group row filter is needed).
summary = (
    df.groupby('type')
    .agg(
        total_rows=('validfrom', 'count'),
        unique_timestamps=('validfrom', 'nunique'),
    )
    .reset_index()
    .assign(columns_count=df.shape[1])
)

print(summary)

# Time range based on validfrom, per 'type'.
# 'validfrom' is still a string column at this point, so min/max compare the
# timestamp strings lexicographically.
time_ranges = df.groupby('type').agg(
    min_validfrom=('validfrom', 'min'),
    max_validfrom=('validfrom', 'max')
).reset_index()

print(time_ranges)

           type  total_rows  unique_timestamps  columns_count
0   /v1/types/1        3191               3191             17
1  /v1/types/17        3191               3191             17
2   /v1/types/2        3191               3191             17
3  /v1/types/20        3191               3191             17
           type              min_validfrom              max_validfrom
0   /v1/types/1  2024-12-31T23:00:00+00:00  2025-05-13T21:00:00+00:00
1  /v1/types/17  2024-12-31T23:00:00+00:00  2025-05-13T21:00:00+00:00
2   /v1/types/2  2024-12-31T23:00:00+00:00  2025-05-13T21:00:00+00:00
3  /v1/types/20  2024-12-31T23:00:00+00:00  2025-05-13T21:00:00+00:00


In [19]:

# Build the processed frame from the raw observations.
# The three columns that are kept are selected explicitly (instead of listing
# every column to drop) so that a column added to the raw table later cannot
# silently end up in the processed output. assign() returns a new frame, so
# `df` itself is left untouched.
df_NED_obs_processed = (
    df.loc[:, ['type', 'volume', 'validto']]
    .assign(
        # '/v1/types/1' -> 'Type_1'; regex=False because the prefix is a
        # plain literal, not a pattern.
        type=lambda frame: frame['type'].str.replace('/v1/types/', 'Type_', regex=False)
    )
)

missing_values = df_NED_obs_processed.isnull().sum()
print("Missing values per column:\n", missing_values)
print(df_NED_obs_processed.head())

print("Unique values in 'type':", df_NED_obs_processed['type'].unique())



Missing values per column:
 type       0
volume     0
validto    0
dtype: int64
     type   volume                    validto
0  Type_1  6519520  2025-01-01T00:00:00+00:00
1  Type_1  5917659  2025-01-01T01:00:00+00:00
2  Type_1  4994553  2025-01-01T02:00:00+00:00
3  Type_1  4990896  2025-01-01T03:00:00+00:00
4  Type_1  4988425  2025-01-01T04:00:00+00:00
Unique values in 'type': ['Type_1' 'Type_2' 'Type_17' 'Type_20']


In [20]:
# Reshape from long to wide: every 'validto' timestamp becomes one row and
# every distinct 'type' value becomes its own column holding the 'volume'.
df_NED_obs_pivoted = pd.pivot_table(
    df_NED_obs_processed,
    values='volume',
    index=['validto'],
    columns='type',
    aggfunc='first',  # for repeated (validto, type) pairs keep the first volume
)

print("Pivoted DataFrame:\n", df_NED_obs_pivoted.head())

# Turn the 'validto' index back into an ordinary column.
df_NED_obs_pivoted = df_NED_obs_pivoted.reset_index()


Pivoted DataFrame:
 type                        Type_1  Type_17 Type_2 Type_20
validto                                                   
2025-01-01T00:00:00+00:00  6519520  4158000      0  486250
2025-01-01T01:00:00+00:00  5917659  4158000      0  487000
2025-01-01T02:00:00+00:00  4994553  4158000      0  487000
2025-01-01T03:00:00+00:00  4990896  3930300      0  487000
2025-01-01T04:00:00+00:00  4988425  3474900      0  487000


In [21]:
# Persist the processed DataFrame as table 'transform_ned_obs_2'.
# if_exists='replace' overwrites the table when it is already present, and
# try/finally guarantees the connection is closed even if the write fails.
# NOTE(review): this writes the long-format df_NED_obs_processed, not the
# wide df_NED_obs_pivoted built earlier - confirm that is the intended output.
conn = sqlite3.connect('../Data/WARP.db')
try:
    df_NED_obs_processed.to_sql('transform_ned_obs_2', conn, if_exists='replace', index=False)
finally:
    conn.close()

In [22]:
# Reopen the database and read the saved table back into a temporary DataFrame
# to check what was actually stored. try/finally closes the connection even if
# the query raises.
conn = sqlite3.connect('../Data/WARP.db')
try:
    df_temp = pd.read_sql_query("SELECT * FROM transform_ned_obs_2", conn)
finally:
    conn.close()

# Print feature overview.
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
df_temp.info()
print(df_temp.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12764 entries, 0 to 12763
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   type     12764 non-null  object
 1   volume   12764 non-null  object
 2   validto  12764 non-null  object
dtypes: object(3)
memory usage: 299.3+ KB
None
     type   volume                    validto
0  Type_1  6519520  2025-01-01T00:00:00+00:00
1  Type_1  5917659  2025-01-01T01:00:00+00:00
2  Type_1  4994553  2025-01-01T02:00:00+00:00
3  Type_1  4990896  2025-01-01T03:00:00+00:00
4  Type_1  4988425  2025-01-01T04:00:00+00:00
