In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import requests
import warnings

# Per-sensor data-quality table; a later cell reads its 'Id' and
# '<year>_data_quality' columns to decide which sensors to download.
df = pd.read_csv('sensor_quality_2015-2024.csv')
# Year to process, kept as a string because it is interpolated into the
# timestamps below and into file paths and column names in later cells.
year = '2015'
# First and last points of the 15-minute grid for the year. Starting at 00:14
# puts every grid point on minute :14/:29/:44/:59, ending exactly at 23:59.
# These strings are also sliced positionally (day/month/year) to build the API
# URL in a later cell, so the 'YYYY-MM-DD HH:MM:SS' layout must not change.
start = f'{year}-01-01 00:14:00'
end = f'{year}-12-31 23:59:00'

In [2]:
def generate_fixed_timestamps(start, end):
    """Build the reference grid of 15-minute timestamps between two instants.

    Parameters
    ----------
    start, end : str or datetime-like
        Inclusive bounds handed straight to ``pd.date_range``.

    Returns
    -------
    pandas.DataFrame
        A single ``'Timestamp'`` column holding one row per 15-minute step,
        beginning at ``start`` and never exceeding ``end``.

    Notes
    -----
    The grid is built exactly as ``pd.date_range`` produces it: no timezone is
    attached here and no daylight-saving adjustment is applied.
    """
    return pd.DataFrame(
        {'Timestamp': pd.date_range(start=start, end=end, freq='15min')}
    )

In [3]:
def fetch_data_for_site(site_id, page_size=5000, timeout=60):
    """Yield one DataFrame per page of report rows for a single site.

    Pages are requested sequentially, starting at page 1, from the
    module-level ``base_url`` (which must be defined before the generator is
    consumed). Iteration ends on the first response that is not HTTP 200 or
    whose JSON body has no non-empty ``'Rows'`` entry.

    Parameters
    ----------
    site_id : int or str
        Site identifier placed in the ``sites`` query parameter.
    page_size : int, optional
        Rows requested per page (default 5000, the value previously
        hard-coded in the URL).
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection raises instead of
        blocking the whole run indefinitely.

    Yields
    ------
    pandas.DataFrame
        The ``'Rows'`` records of one page, in page order.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts).
    """
    page = 1
    while True:
        url = f"{base_url}?sites={site_id}&page={page}&page_size={page_size}"
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            # A non-200 answer ends the download either way; it is only
            # reported when not even the first page could be retrieved.
            if page == 1:
                print(f"Data download stopped at page {page-1} for site {site_id}")
            return

        payload = response.json()
        # Treat a body without a usable 'Rows' list the same as an empty page
        # rather than failing with KeyError/TypeError.
        rows = payload.get('Rows') if isinstance(payload, dict) else None
        if not rows:
            return

        yield pd.DataFrame(rows)
        page += 1


In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import logging

# Logging: every run appends INFO-and-above records to a per-year log file.
log_file = f'{year}_data_processing.log'
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_file,
)

# API endpoint: the report window is embedded in the path as DDMMYYYY, taken by
# position from the 'YYYY-MM-DD ...' strings held in `start` and `end`.
base_url = f'https://webtris.nationalhighways.co.uk/api/v1.0/reports/{start[8:10]+start[5:7]+start[0:4]}/to/{end[8:10]+end[5:7]+end[0:4]}/daily'

# Reference grid that every sensor's readings are aligned to, and its length.
fixed_timestamps_df = generate_fixed_timestamps(start, end)
t = len(fixed_timestamps_df)

# One output folder per year; reuse it if it is already there.
output_dir = f'./{year}/'
os.makedirs(output_dir, exist_ok=True)

# Keep only the sensors whose quality score for this year is strictly positive.
quality_col = f'{year}_data_quality'
valid_sensors = df.loc[df[quality_col] > 0, 'Id'].tolist()
logging.info(f"Filtered {len(valid_sensors)} valid sensors based on quality > 0.")

In [None]:
# Download, clean and save one CSV per valid sensor.
#
# The per-site frame is deliberately named `site_df`: the module-level `df`
# holds the sensor-quality table and must not be overwritten here, otherwise
# the sensor-filter cell cannot be re-run after this loop.

# Columns kept from every API page, and the subset that is coerced to numbers.
KEPT_COLUMNS = ['Site Name', 'Report Date', 'Time Period Ending', "0 - 520 cm", "521 - 660 cm", "661 - 1160 cm", "1160+ cm", 'Total Volume', 'Avg mph']
NUMERIC_COLUMNS = ["0 - 520 cm", "521 - 660 cm", "661 - 1160 cm", "1160+ cm", 'Total Volume', 'Avg mph']


def _nanmean(values):
    """Mean of a group ignoring NaNs (NaN when the whole group is NaN)."""
    return np.nanmean(values.astype(float))


def _average_duplicate_timestamps(frame):
    """Return `frame` with rows that share a Timestamp collapsed into one row.

    Numeric columns are NaN-aware averaged; the descriptive columns keep the
    first value of each group. Rows with a unique Timestamp are passed through
    unchanged. The frame is returned as-is when there are no duplicates.
    """
    is_duplicate = frame.duplicated(subset='Timestamp', keep=False)
    if not is_duplicate.any():
        return frame

    aggregations = {column: _nanmean for column in NUMERIC_COLUMNS}
    aggregations.update({
        'Site Name': 'first',
        'Report Date': 'first',
        'Time Period Ending': 'first',
    })
    with warnings.catch_warnings():
        # np.nanmean warns on all-NaN groups; NaN is the wanted result there.
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        averaged = frame[is_duplicate].groupby('Timestamp').agg(aggregations).reset_index()

    return pd.concat([frame[~is_duplicate], averaged], ignore_index=True)


def _impute_zero_volume_speed(frame):
    """Overwrite 'Avg mph' on zero-volume rows with the last usable speed.

    The "last usable speed" for a row is the most recent non-NaN 'Avg mph'
    found on an earlier row whose 'Total Volume' is not 0 (rows with NaN volume
    count as sources). Zero-volume rows with no such predecessor are left
    untouched. `frame` is modified in place and also returned.
    """
    zero_volume = frame['Total Volume'].eq(0)
    # Hide speeds on zero-volume rows so they can never act as a source, then
    # carry the remaining readings forward in row order.
    carried_speed = frame['Avg mph'].mask(zero_volume).ffill()
    fill_rows = zero_volume & carried_speed.notna()
    # Cast back to the column dtype so an all-integer column is not upcast.
    frame.loc[fill_rows, 'Avg mph'] = carried_speed[fill_rows].astype(frame['Avg mph'].dtype)
    return frame


for site_id in tqdm(valid_sensors, desc="Processing Sensors"):
    site_id = str(site_id)

    # An existing CSV marks the site as done, which makes the run resumable.
    csv_path = os.path.join(output_dir, f'{site_id}.csv')
    if os.path.exists(csv_path):
        logging.info(f'{csv_path} already exists. Skipping.')
        continue

    site_pages = [df_page[KEPT_COLUMNS] for df_page in fetch_data_for_site(site_id)]
    if not site_pages:
        logging.info(f'Sensor {site_id} has no valid data or is inactive.')
        continue

    site_df = pd.concat(site_pages, ignore_index=True)
    for column in NUMERIC_COLUMNS:
        # Anything that does not parse as a number becomes NaN.
        site_df[column] = pd.to_numeric(site_df[column], errors='coerce')

    # Combine the date part of 'Report Date' (first 10 characters) with the
    # HH:MM part of 'Time Period Ending' (first 5 characters).
    site_df['Timestamp'] = pd.to_datetime(site_df['Report Date'].str.slice(0, 10) + ' ' + site_df['Time Period Ending'].str.slice(0, 5))

    site_df = _average_duplicate_timestamps(site_df)

    # Align the readings to the fixed grid; slots without data become NaN rows.
    site_df = site_df.sort_values(by='Timestamp')
    merged_df = pd.merge(fixed_timestamps_df, site_df, on='Timestamp', how='left')
    merged_df = merged_df.drop(columns=['Report Date', 'Time Period Ending', 'Site Name'])

    merged_df = _impute_zero_volume_speed(merged_df)

    # Write to a temporary file and move it into place, so an interrupted run
    # never leaves a truncated CSV that the existence check above would treat
    # as a finished site.
    tmp_path = f'{csv_path}.tmp'
    merged_df.to_csv(tmp_path, index=False)
    os.replace(tmp_path, csv_path)
    logging.info(f'{csv_path} has been saved.')

logging.info("Data processing completed.")