# Device id(s) configuration.

There are 3 options to configure devices for analysis.
1. Single device id
2. A list of device ids
3. All devices associated with the API key configured in the .env file

In [1]:
# Option 1 - single device: replace None with the device id (in quotes), e.g. "D0123456789AB".
# Ids must pass the validity check performed further down in this notebook (one letter B-F
# followed by 12 characters from 0-9/A-F); ids that do not match are dropped before analysis.
DEVICE_ID = None 

# Option 2 - list of devices: place a csv file with the device id list (e.g. as exported from Django)
# in the `devices_collections` folder and enter the filename here (in quotes), e.g. "project_x_devices.csv".
# The file is read without a header row into a single `device_id` column.
# DEVICE_ID takes precedence, so make sure it is set to None when using a file.
DEVICE_IDS_FILENAME = "device_ids-20250225-062420.csv" 

# Option 3 - all devices of a device group: configure the API key of the device group in the .env file
# and set both DEVICE_ID and DEVICE_IDS_FILENAME above to None.

# Period configuration

Configure the time period to analyse.


In [2]:
# Both dates are interpreted as 00:00 in TIMEZONE. The by-day analysis stops one second before
# END_DATE 00:00, i.e. END_DATE itself is not part of the analysed period.
TIMEZONE = "Australia/Sydney"         # The timezone the period is defined in
START_DATE = "2025-02-20"             # First day of the period, date string in the format <YYYY-MM-DD> in the target timezone
END_DATE = "2025-02-25"               # Exclusive end of the period, date string in the format <YYYY-MM-DD> in the target timezone

# Threshold configuration

Configure the data completeness threshold (in percent) below which a device is considered problematic.

In [3]:
# Minimum data completeness in percent (0-100). Devices with fewer intervals than this share of the
# expected count are reported as under threshold, and completeness cells below it are shown in red.
DATA_COMPLETENESS_THRESHOLD = 99

In [4]:
import os
import logging
import re

import pandas as pd
import pendulum
import plotly.graph_objects as go
from itables import show
from itables import JavascriptFunction, JavascriptCode


from config import AppConfig
from api_clients import PublicApiClient
from logger import get_logger

In [5]:
# Build the application configuration from the process environment variables.
config = AppConfig(os.environ)
# Logger used for progress and error reporting throughout the notebook.
logger: logging.Logger = get_logger(config)
# Client used for the device list and LE data downloads below.
# NOTE(review): PUBLIC_API_MAX_TPS presumably caps requests per second - confirm in api_clients.
public_api_client = PublicApiClient(config.ENVIRONMENT, config.PUBLIC_API_KEY, config.PUBLIC_API_MAX_TPS, logger)

# Data downloading

In [None]:
def is_valid_device_id(device_id: str, raise_error: bool = False) -> bool:
  """
  Returns True if the device ID is valid, False if not.
  Simplified version based on lib_common

  A valid id is exactly 13 characters long: one letter B-F followed by
  12 characters from 0-9/A-F (upper case only).

  Args:
    device_id: The candidate id. Values that are not strings (e.g. the NaN
      pandas produces for an empty csv cell) are treated as invalid instead
      of raising a TypeError.
    raise_error: If True, raise a ValueError for an invalid id instead of
      returning False.

  Returns:
    True if the id is valid, otherwise False.

  Raises:
    ValueError: If the id is invalid and raise_error is True.
  """
  DEVICE_ID_PATTERN = "^[B-F]{1}[A-F0-9]{12}$"
  # re.match won't detect a trailing space/newline in the device id, but re.fullmatch will.
  is_valid = isinstance(device_id, str) and re.fullmatch(DEVICE_ID_PATTERN, device_id) is not None
  if raise_error and not is_valid:
    raise ValueError(f'invalid device id: {device_id!r}')
  return is_valid

# Determine devices to analyse
# Precedence: DEVICE_ID, then DEVICE_IDS_FILENAME, then all devices associated with the API key.
devices: list[str] = []
if DEVICE_ID is not None:
  devices = [DEVICE_ID]
elif DEVICE_IDS_FILENAME is not None:
  device_ids_path = f'./devices_collections/{DEVICE_IDS_FILENAME}'
  if not os.path.isfile(device_ids_path):
    # Fail early with the path in the message instead of a bare pandas error
    raise FileNotFoundError(f'device id file not found: {device_ids_path}')
  try:
    # dtype=str keeps every entry as text; blank cells become NaN and are dropped right away
    df = pd.read_csv(device_ids_path, header=None, names=['device_id'], dtype=str)
    devices = df['device_id'].dropna().tolist()
  except pd.errors.EmptyDataError:
    logger.error(f'device id file is empty: {device_ids_path}')
else:
  # get all devices associated with API key
  result, error = public_api_client.get_devices_list()
  if error is not None:
    logger.error(f'failed to load devices for API key: {error}')
  else:
    devices = result

# Normalise the candidates: keep string entries only, strip surrounding whitespace and
# drop duplicates (order preserved). The downloaded data is keyed by device id, so a
# duplicated id would be downloaded twice and counted twice in num_devices.
num_candidates = len(devices)
unique_candidates = list(dict.fromkeys(d.strip() for d in devices if isinstance(d, str)))

# filter out any invalid device ids
devices = [d for d in unique_candidates if is_valid_device_id(d)]

num_ignored = num_candidates - len(devices)
if num_ignored > 0:
  logger.warning(f'ignored {num_ignored} invalid or duplicate device id entries')

num_devices = len(devices)
logger.info(f'found {num_devices} devices to analyse')

In [7]:
# Determine timestamps for requests
# START_DATE and END_DATE are parsed in TIMEZONE; the resulting epoch seconds delimit the download period.
time_start = pendulum.parse(START_DATE, tz=TIMEZONE)
time_end = pendulum.parse(END_DATE, tz=TIMEZONE)

# The expected interval count is later derived from an absolute time difference, so a reversed
# period would silently produce plausible-looking numbers. Reject it here instead.
if time_end <= time_start:
  raise ValueError(f'END_DATE ({END_DATE}) must be after START_DATE ({START_DATE})')

timestamp_start = time_start.int_timestamp
timestamp_end = time_end.int_timestamp

In [8]:
# TODO: load device status for each device to exclude decommissioned devices
# Issue: only user-apps-api exposes this and we can't use that because of different auth system

In [None]:
# Download LE data
# Result: le_data maps device id -> downloaded LE data; devices whose request failed are left out.
le_data = {}
for index, device_id in enumerate(devices):
  logger.info(f'Downloading LE data for device {index+1}/{num_devices} - {device_id}')
  result, error = public_api_client.load_long_energy(device_id, timestamp_start, timestamp_end)
  if error is None:
    le_data[device_id] = result
    continue
  # Log the failure and carry on with the remaining devices
  logger.error(f'failed to load LE for device: {device_id}: {error}')

logger.info(f'Successfully downloaded LE data for {len(le_data)}/{num_devices} devices')

# Analysis


In [10]:
# Devices we couldn't download LE data for (included in devices list but not in LE dict)
devices_with_le_data = list(le_data.keys())
# Membership is tested against the dict itself (constant time) rather than the list
devices_without_le_data = [device_id for device_id in devices if device_id not in le_data]

devices_with_empty_le_data = [device_id for device_id, data in le_data.items() if len(data) == 0]

# Determine expected number of intervals
# Calculate the number of 5-minute intervals between the start and end timestamps using pendulum
num_intervals_expected = int((time_end.diff(time_start).in_minutes()) // 5)
if num_intervals_expected <= 0:
  # All completeness figures divide by this value
  raise ValueError('the analysis period must cover at least one 5-minute interval; check START_DATE and END_DATE')

# Devices with missing LE data
# TODO: add alternative analysis based on timestamp and duration of interval (only works for intervals between existing intervals, i.e. need to handle missing intervals at start or end of period separately)
# Could also just do a quick analysis to verify all intervals have a duration of 300s.
devices_with_missing_le_data = {device_id: data for device_id, data in le_data.items() if len(data) < num_intervals_expected}

# Devices not meeting data completeness threshold
# The minimum interval count is rounded UP (ceiling via negated floor division). Rounding down
# would miss devices just under the threshold: with 1440 expected intervals and a threshold of 99,
# a device with 1425 intervals (98.96 %) has to be reported, which requires a minimum of 1426.
num_intervals_completeness_threshold = -(-(DATA_COMPLETENESS_THRESHOLD * num_intervals_expected) // 100)
devices_not_meeting_threshold = {device_id: data for device_id, data in le_data.items() if len(data) < num_intervals_completeness_threshold}

# Devices with complete LE data
devices_with_complete_le_data = {device_id: data for device_id, data in le_data.items() if len(data) == num_intervals_expected}


In [11]:
# High level analysis

# One row per device with downloaded data: raw intervals plus interval counts and completeness ratios
le_data_df = pd.DataFrame({
    'device_id': list(le_data.keys()),
    'le_intervals': list(le_data.values()),
    'num_intervals': [len(data) for data in le_data.values()]
})

le_data_df['num_intervals_expected'] = num_intervals_expected
le_data_df['num_intervals_missing'] = le_data_df['num_intervals_expected'] - le_data_df['num_intervals']
le_data_df['interval_completeness'] = le_data_df['num_intervals'] / le_data_df['num_intervals_expected']
le_data_df['interval_missingness'] = 1 - le_data_df['interval_completeness']

# Worst devices first
le_data_df = le_data_df.sort_values(by='interval_completeness')

# Display table: completeness is converted from a ratio to percent
df_devices_table = le_data_df[['device_id', 'interval_completeness', 'num_intervals_missing']].copy()
df_devices_table['interval_completeness'] = df_devices_table['interval_completeness'] * 100

parameters = [{
  'start_time': time_start,
  'end_time': time_end,
  'num_expected_intervals': num_intervals_expected,
}]

# Overall completeness counts every selected device, including those whose download failed
# (they contribute zero intervals). Without any device or expected interval there is nothing
# to divide by, so report NaN instead of failing with a division by zero.
num_intervals_possible = num_devices * num_intervals_expected
if num_intervals_possible > 0:
  overall_completeness = le_data_df['num_intervals'].sum() / num_intervals_possible * 100
else:
  overall_completeness = float('nan')

top_level_stats = [{
  'num_devices': num_devices,
  'overall_completeness': overall_completeness,
  'devices_under_threshold': len(devices_not_meeting_threshold),
  'devices_with_missing_intervals': len(devices_with_missing_le_data),
  'devices_without_data': len(devices_with_empty_le_data),
  'devices_with_failed_retrieval': len(devices_without_le_data),
  'devices_with_complete_data': len(devices_with_complete_le_data),
}]

# Lists of records go straight into the DataFrame constructor
df_parameters = pd.DataFrame(parameters)
df_stats = pd.DataFrame(top_level_stats)

In [12]:
# Transform interval data

def flatten_arrays(item: dict) -> dict:
    """Expand list values of a dict into individually numbered keys.

    A list stored under `key` is replaced by one entry per element, named
    `key_0`, `key_1`, ... in element order. Values of any other type are
    copied unchanged. Only the top level is expanded: elements that are
    themselves lists are kept as they are.

    e.g. {key: [value0, value1, ...]} becomes {key_0: value0, key_1: value1, ...}.
    """
    result = {}
    for name, content in item.items():
        if not isinstance(content, list):
            result[name] = content
            continue
        result.update({f"{name}_{position}": element for position, element in enumerate(content)})
    return result


# Collect one flat record per downloaded interval, tagged with the device it belongs to
intervals = []

for device_id, device_intervals in zip(le_data_df['device_id'], le_data_df['le_intervals']):
    for item in device_intervals:
        # Separate name for the flattened record so the outer loop variables are not overwritten
        record = flatten_arrays(item)
        record["device_id"] = device_id
        intervals.append(record)

df_intervals = pd.DataFrame(intervals)
# Reorder the columns to move 'device_id', 'timestamp', and 'duration' to the front
leading_columns = ['device_id', 'timestamp', 'duration']
columns_order = leading_columns + [col for col in df_intervals.columns if col not in leading_columns]
# reindex instead of df[columns_order]: when no interval was downloaded at all the frame has no
# columns, and plain selection would raise a KeyError. reindex yields an empty frame that still
# has the leading columns.
df_intervals = df_intervals.reindex(columns=columns_order)



In [13]:
# By-day analysis

# Local (TIMEZONE) datetime of every interval, derived from its epoch-seconds timestamp
df_intervals['datetime'] = pd.to_datetime(df_intervals['timestamp'], unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE)

# Number of intervals per device and local calendar day
df_daily_counts = df_intervals.groupby(['device_id', df_intervals['datetime'].dt.date]).size().reset_index(name='entry_count')
df_daily_counts.columns = ['device_id', 'date', 'num_intervals']
df_daily_counts['date'] = pd.to_datetime(df_daily_counts['date']).dt.tz_localize(TIMEZONE)

# Add in missing intervals (set to 0)
# Every (device, day) combination of the period that has no data at all gets a row with 0 intervals
date_range = pd.date_range(start=time_start, end=time_end.subtract(seconds=1), freq='D')
all_device_dates = pd.MultiIndex.from_product([devices, date_range], names=['device_id', 'date'])

missing_entries = all_device_dates.difference(df_daily_counts.set_index(['device_id', 'date']).index)
missing_df = pd.DataFrame(list(missing_entries), columns=['device_id', 'date'])
missing_df['num_intervals'] = 0
df_daily_counts = pd.concat([df_daily_counts, missing_df], ignore_index=True)

# Reduce the timestamps to plain calendar dates for display
df_daily_counts['date'] = pd.to_datetime(df_daily_counts['date']).dt.date

# Expected number of 5-minute intervals per day.
# A local day is not always 24 hours long: on daylight-saving transition days in TIMEZONE it has
# 23 or 25 hours. A fixed 288 would report complete devices as incomplete (or above 100 %) on
# those days, so the expectation is derived from the actual length of each local day.
num_intervals_expected_daily = 24 * 12  # nominal value for a regular 24-hour day
local_dates = pd.to_datetime(df_daily_counts['date'])
day_starts = local_dates.dt.tz_localize(TIMEZONE)
day_ends = (local_dates + pd.Timedelta(days=1)).dt.tz_localize(TIMEZONE)
num_intervals_expected_by_day = ((day_ends - day_starts).dt.total_seconds() // 300).astype(int)

df_daily_counts['missing_intervals'] = num_intervals_expected_by_day - df_daily_counts['num_intervals']
df_daily_counts['interval_completeness'] = 100 * df_daily_counts['num_intervals'] / num_intervals_expected_by_day

# Move interval_completeness column to 3rd column
cols = df_daily_counts.columns.tolist()
cols.insert(2, cols.pop(cols.index('interval_completeness')))
df_daily_counts = df_daily_counts[cols]



# Outputs

## High level stats

In [None]:
# Parameters of this run
show(df_parameters)

# Cell callback: colour the value red when it is below DATA_COMPLETENESS_THRESHOLD
flag_below_threshold = JavascriptFunction(
                f"""
                    function (td, cellData, rowData, row, col) {{
                        if (cellData < {DATA_COMPLETENESS_THRESHOLD}) {{
                            $(td).css('color', 'red')
                        }}
                    }}
                """
)
# Renderer: number with 2 decimals followed by a % sign
format_as_percent = JavascriptCode("$.fn.dataTable.render.number(',', '.', 2, '', '%')")

# Both definitions target column index 1 (expected to be 'overall_completeness' - verify if columns change)
show(
    df_stats,
    columnDefs=[
        {"targets": [1], "createdCell": flag_below_threshold},
        {"targets": [1], "render": format_as_percent},
    ],
)



## Per device stats

In [None]:
# Cell callback: colour the value red when it is below DATA_COMPLETENESS_THRESHOLD
flag_below_threshold = JavascriptFunction(
                f"""
                    function (td, cellData, rowData, row, col) {{
                        if (cellData < {DATA_COMPLETENESS_THRESHOLD}) {{
                            $(td).css('color', 'red')
                        }}
                    }}
                """
)
# Renderer: number with 2 decimals followed by a % sign
format_as_percent = JavascriptCode("$.fn.dataTable.render.number(',', '.', 2, '', '%')")

# Both definitions target column index 1, the 'interval_completeness' column of the per-device table
show(
    df_devices_table,
    columnDefs=[
        {"targets": [1], "createdCell": flag_below_threshold},
        {"targets": [1], "render": format_as_percent},
    ],
    showIndex=False,
    buttons=["copyHtml5", "csvHtml5", "excelHtml5"],
)


## Per device per day stats

In [None]:
# Cell callback: colour the value red when it is below DATA_COMPLETENESS_THRESHOLD
flag_below_threshold = JavascriptFunction(
                f"""
                    function (td, cellData, rowData, row, col) {{
                        if (cellData < {DATA_COMPLETENESS_THRESHOLD}) {{
                            $(td).css('color', 'red')
                        }}
                    }}
                """
)
# Renderer: number with 2 decimals followed by a % sign
format_as_percent = JavascriptCode("$.fn.dataTable.render.number(',', '.', 2, '', '%')")

# Both definitions target column index 2, where 'interval_completeness' was moved to in the by-day table
show(
    df_daily_counts,
    columnDefs=[
        {"targets": [2], "createdCell": flag_below_threshold},
        {"targets": [2], "render": format_as_percent},
    ],
    showIndex=False,
    pageLength=20,
    buttons=["copyHtml5", "csvHtml5", "excelHtml5"],
)

## Interval data

In [None]:
# NOTE: For large datasets, running this cell may result in memory issues.

# Display options for the raw interval table.
# NOTE(review): maxBytes=0 presumably switches off itables' size-based downsampling so that
# every row is embedded in the output - confirm against the itables documentation.
interval_table_options = {
    "showIndex": False,
    "maxBytes": 0,
    "pageLength": 20,
    "buttons": ["copyHtml5", "csvHtml5", "excelHtml5"],
}
show(df_intervals, **interval_table_options)
