Code to collect NED predictions (previously fetched by a remote thin client), plus code to run the actual 7-day-ahead predictions for energy production.

In [4]:
import pandas as pd

def _load_ned_preds(csv_path):
    """Read one NED predictions CSV and parse its 'current_datetime' column.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file, relative to the notebook's working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents, with 'current_datetime' converted via pd.to_datetime.
    """
    frame = pd.read_csv(csv_path)
    # Convert current_datetime to datetime type if it's not already
    frame['current_datetime'] = pd.to_datetime(frame['current_datetime'])
    return frame


# One frame per CSV file; the numeric suffix is the name the later cells use.
df_NED_preds_CSV_3 = _load_ned_preds('../data/ned-api/appended_NED_preds-5May.csv')
df_NED_preds_CSV_2 = _load_ned_preds('../data/ned-api/appended_NED_preds-1May2025.csv')
df_NED_preds_CSV_1 = _load_ned_preds('../data/ned-api/appended_NED_preds_2-10-April.csv')



In [11]:
# Report, for each loaded frame, the earliest and latest 'current_datetime'
# and then the number of rows. The frames are listed once, keyed by the
# variable name that should appear in the printed report.
ned_frames_by_name = {
    'df_NED_preds_CSV_1': df_NED_preds_CSV_1,
    'df_NED_preds_CSV_2': df_NED_preds_CSV_2,
    'df_NED_preds_CSV_3': df_NED_preds_CSV_3,
}

for frame_name, ned_frame in ned_frames_by_name.items():
    current_datetimes = ned_frame['current_datetime']
    print(f"\nDate ranges in {frame_name}:")
    print(f"Start: {current_datetimes.min()}")
    print(f"End: {current_datetimes.max()}")

print("\nNumber of records in each DataFrame:")
for frame_name, ned_frame in ned_frames_by_name.items():
    # ':,' adds thousands separators to the row count
    print(f"{frame_name}: {len(ned_frame):,} records")


Date ranges in df_NED_preds_CSV_1:
Start: 2025-04-02 08:17:40
End: 2025-04-10 12:00:05

Date ranges in df_NED_preds_CSV_2:
Start: 2025-04-10 14:07:18
End: 2025-05-01 00:00:05

Date ranges in df_NED_preds_CSV_3:
Start: 2025-05-01 12:00:05
End: 2025-05-05 12:00:05

Number of records in each DataFrame:
df_NED_preds_CSV_1: 68,646 records
df_NED_preds_CSV_2: 50,852 records
df_NED_preds_CSV_3: 11,004 records


In [21]:
# Stack the three per-file frames into one; ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each file's own.
df_NED_preds_CSV = pd.concat([df_NED_preds_CSV_1, df_NED_preds_CSV_2, df_NED_preds_CSV_3], ignore_index=True)

# Size of the combined frame, labelled with the combined frame's own name.
print("\nNumber of records in the concatenated DataFrame:")
print(f"df_NED_preds_CSV: {len(df_NED_preds_CSV):,} records")

# Two rows count as duplicates when all three of these columns match.
# Defined once so the count and the drop below cannot drift apart.
dedup_keys = ['current_datetime', 'type', 'validto']

# Check for duplicates before dropping
duplicate_count = df_NED_preds_CSV.duplicated(subset=dedup_keys).sum()
print(f"Found {duplicate_count} duplicate records")

# Drop duplicates (the first occurrence of each key combination is kept,
# which is the drop_duplicates default)
df_NED_preds_CSV = df_NED_preds_CSV.drop_duplicates(subset=dedup_keys)
print(f"After dropping duplicates, {len(df_NED_preds_CSV):,} records remain")





Number of records in each DataFrame:
df_NED_preds_CSV_1: 130,502 records
Found 0 duplicate records
After dropping duplicates, 130,502 records remain


In [25]:
# Boolean mask of null cells, computed once and reused for both summaries.
null_mask = df_NED_preds_CSV.isnull()

# Per-column null counts; only columns with at least one null are printed.
missing_values = null_mask.sum()
print("\nMissing values in each column:")
print(missing_values[missing_values > 0])

# Rows that contain a null in at least one column.
missing_rows = df_NED_preds_CSV[null_mask.any(axis=1)]
if missing_rows.empty:
    print("\nNo rows with missing values found.")
else:
    print(f"\nFound {len(missing_rows)} rows with missing values:")
    print(missing_rows)



Missing values in each column:
Series([], dtype: int64)

No rows with missing values found.


In [None]:
# Overview of the combined frame: covered time span and total row count.
current_datetimes = df_NED_preds_CSV['current_datetime']
print("Date range in the dataset:")
print(f"Start date: {current_datetimes.min()}")
print(f"End date: {current_datetimes.max()}")
total_records = len(df_NED_preds_CSV)
print(f"\nTotal number of records: {total_records}")

# Column dtypes and non-null counts; info() writes its report itself,
# so it is called bare rather than wrapped in print().
print("\nDataframe Info:")
df_NED_preds_CSV.info()

# describe() with default settings
print("\nNumeric Column Statistics:")
column_statistics = df_NED_preds_CSV.describe()
print(column_statistics)

# Number of rows per distinct value of the 'type' column
print("\nDistribution of types:")
type_counts = df_NED_preds_CSV['type'].value_counts()
print(type_counts)

Date range in the dataset:
Start date: 2025-04-02 08:17:40
End date: 2025-05-05 12:00:05

Total number of records: 130502

Number of records in each DataFrame:
df_NED_preds_CSV: 130,502 records

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130502 entries, 0 to 130501
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   @id                  130502 non-null  object        
 1   @type                130502 non-null  object        
 2   id                   130502 non-null  int64         
 3   point                130502 non-null  object        
 4   type                 130502 non-null  object        
 5   granularity          130502 non-null  object        
 6   granularitytimezone  130502 non-null  object        
 7   activity             130502 non-null  object        
 8   classification       130502 non-null  object        
 9   capacity             130502 non-nul

In [28]:
import sqlite3

# Connect to the SQLite database
db_path = '../data/WARP.db'
conn = sqlite3.connect(db_path)

try:
    # Write the DataFrame to the database table 'raw_NED_preds'.
    # if_exists='replace' drops and recreates the table when it already exists;
    # index=False keeps the DataFrame index out of the table.
    df_NED_preds_CSV.to_sql('raw_NED_preds', conn, if_exists='replace', index=False)
finally:
    # Close the connection even when the write fails, so a failed run does
    # not leave an open handle on the database file in the kernel.
    conn.close()

# Only reached when the write above did not raise.
record_count = df_NED_preds_CSV.shape[0]
print(f"{record_count} records successfully written to database table 'raw_NED_preds'")

130502 records successfully written to database table 'raw_NED_preds'


In [30]:
# connecting to the SQLite database and checking the most recent date in the raw_NED_preds table

import sqlite3
import pandas as pd


# Connect to the SQLite database
db_path = '../data/WARP.db'
conn = sqlite3.connect(db_path)

try:
    # collect the date column from the raw_NED_preds table
    NED_pred_dates = pd.read_sql_query("SELECT current_datetime FROM raw_NED_preds", conn)
finally:
    # Close the connection even when the query fails (e.g. the table is
    # missing), so the kernel does not keep an open handle on the file.
    conn.close()

# Convert to datetime and find max date; the result is kept as a
# 'YYYY-MM-DD' string (time of day is dropped by the format).
NED_pred_dates['current_datetime'] = pd.to_datetime(NED_pred_dates['current_datetime'])
most_recent_NED_pred_date = NED_pred_dates['current_datetime'].max().strftime('%Y-%m-%d')
print(f"The most recent date in raw_NED_preds is: {most_recent_NED_pred_date}")



The most recent date in raw_NED_preds is: 2025-05-05
