In [5]:
import pandas as pd
import os
import io
import datetime

In [4]:
# Bounds of the analysis week as ISO date strings (not referenced by the cells shown below).
start_time, end_time = '2014-04-01', '2014-04-07'

# Load the one-week CSV export and preview its first five rows.
one_week_data = pd.read_csv('one_week_data.csv')
one_week_data.head()

Unnamed: 0.1,Unnamed: 0,SensorID,Attribute1,Attribute2,Value,SensorType,Timestamp
0,6431809,BATP105,Ignore,Ignore,36,Control4-BatteryPercent,2014-04-01 02:43:32.162533
1,6431810,BATP103,Ignore,Ignore,48,Control4-BatteryPercent,2014-04-01 03:21:14.529900
2,6431811,T105,Ignore,KitchenTemp,25,Control4-Temperature,2014-04-01 04:12:48.248098
3,6431812,BATP022,Ignore,Ignore,42,Control4-BatteryPercent,2014-04-01 05:09:01.748707
4,6431813,BATP104,Ignore,Ignore,45,Control4-BatteryPercent,2014-04-01 05:48:18.736504


In [11]:
# Show the column labels of the loaded frame.
one_week_data.columns

Index(['Unnamed: 0', 'SensorID', 'Attribute1', 'Attribute2', 'Value',
       'SensorType', 'Timestamp'],
      dtype='object')

In [6]:
the_day = '2014-04-05'

# Midnight at the start of `the_day`, as a naive datetime (no timezone attached).
date_obj = datetime.datetime.strptime(the_day, '%Y-%m-%d')

# The analysis window runs from 21:00 on the previous day to 21:00 on `the_day`.
# Adding 21 hours to midnight is the same instant as replacing the time fields with 21:00:00.
nine_pm_current_day = date_obj + datetime.timedelta(hours=21)
one_day_earlier = date_obj - datetime.timedelta(days=1)
nine_pm_one_day_earlier = one_day_earlier + datetime.timedelta(hours=21)

# Unix timestamps for both bounds. Because the datetimes are naive, .timestamp()
# interprets them in the local timezone of the machine running the notebook.
timestamp_nine_pm_one_day_earlier, timestamp_nine_pm_current_day = (
    moment.timestamp() for moment in (nine_pm_one_day_earlier, nine_pm_current_day)
)

print(f"Date used: {the_day}")
print(f"9 PM one day earlier ({nine_pm_one_day_earlier}): {timestamp_nine_pm_one_day_earlier}")
print(f"9 PM of the date ({nine_pm_current_day}): {timestamp_nine_pm_current_day}")



Date used: 2014-04-05
9 PM one day earlier (2014-04-04 21:00:00): 1396616400.0
9 PM of the date (2014-04-05 21:00:00): 1396702800.0


In [9]:
# Parse the 'Timestamp' column into datetimes so it can be compared with the window bounds.
one_week_data['Timestamp'] = pd.to_datetime(one_week_data['Timestamp'])

# Keep the rows whose timestamp lies in the 21:00-to-21:00 window (both endpoints included).
one_day_data = one_week_data[
    one_week_data['Timestamp'].between(nine_pm_one_day_earlier, nine_pm_current_day)
]
one_day_data.head(10)

Unnamed: 0.1,Unnamed: 0,SensorID,Attribute1,Attribute2,Value,SensorType,Timestamp
2729,6434538,LS008,Ignore,Ignore,24,Control4-LightSensor,2014-04-04 21:00:36.206586
2730,6434539,BATP008,Ignore,Ignore,63,Control4-BatteryPercent,2014-04-04 21:00:36.258214
2731,6434540,LS007,Ignore,Ignore,8,Control4-LightSensor,2014-04-04 21:00:44.880928
2732,6434541,M007,Kitchen,Kitchen,ON,Control4-Motion,2014-04-04 21:00:44.941567
2733,6434542,LS007,Ignore,Ignore,9,Control4-LightSensor,2014-04-04 21:00:46.945409
2734,6434543,M007,Kitchen,Kitchen,OFF,Control4-Motion,2014-04-04 21:00:46.976946
2735,6434544,M007,Kitchen,Kitchen,ON,Control4-Motion,2014-04-04 21:00:53.932224
2736,6434545,LS007,Ignore,Ignore,8,Control4-LightSensor,2014-04-04 21:00:55.007836
2737,6434546,M007,Kitchen,Kitchen,OFF,Control4-Motion,2014-04-04 21:00:55.039010
2738,6434547,M007,Kitchen,Kitchen,ON,Control4-Motion,2014-04-04 21:00:56.700078


In [10]:
# Narrow the one-day window down to rows reported by door sensors and preview them.
door_data = one_day_data.loc[one_day_data['SensorType'].eq('Control4-Door')]
door_data.head()

Unnamed: 0.1,Unnamed: 0,SensorID,Attribute1,Attribute2,Value,SensorType,Timestamp
3500,6435309,D002,OutsideDoor,FrontDoor,OPEN,Control4-Door,2014-04-05 09:07:39.246137
3501,6435310,D002,OutsideDoor,FrontDoor,CLOSE,Control4-Door,2014-04-05 09:07:51.505899
3536,6435345,D002,OutsideDoor,FrontDoor,OPEN,Control4-Door,2014-04-05 10:04:49.800871
3546,6435355,D002,OutsideDoor,FrontDoor,CLOSE,Control4-Door,2014-04-05 10:06:48.451280
6457,6438266,D002,OutsideDoor,FrontDoor,OPEN,Control4-Door,2014-04-05 13:07:45.997261


### Check that every OPEN event has a matching CLOSE event
How it Works:
Vectorized Mapping: Instead of iterating over rows, door_data['Value'].map({'OPEN': 1, 'CLOSE': -1}) creates a numerical representation of each event type in one vectorized step; any other value is filled with 0 so it does not affect the sum.
groupby() and cumsum(): This is the core optimization.
door_data.groupby(['SensorID', 'Attribute1', 'Attribute2']) creates one group for each unique door.
.cumsum() is then applied within each group. This means for each door, it calculates a running total. An 'OPEN' adds 1, a 'CLOSE' subtracts 1.
The current_state column effectively tracks the "open balance" for each door at any given point in time.
Detecting Unmatched 'CLOSE' Events: If current_state ever drops below zero, it means a 'CLOSE' event occurred when the door was already closed (or there were too many 'CLOSE' events for the number of 'OPEN' events recorded).
Detecting Unmatched 'OPEN' Events: We find the last() current_state for each door. If this final_state is greater than zero, it means that many 'OPEN' events remain unmatched by a 'CLOSE' event.
Identifying Specific Unmatched 'OPEN' Events: If N 'OPEN' events are unmatched for a door (where N is the final_state), the most logical interpretation is that the last N 'OPEN' events (in chronological order) are the ones that were not closed. The code specifically retrieves these.

In [13]:
# 'Timestamp' was already parsed with pd.to_datetime when the one-day window was built,
# so no conversion is repeated here.

# Columns that together identify a single door.
door_key_cols = ['SensorID', 'Attribute1', 'Attribute2']

# 1. Sort by door, then by time, so the running sum below follows each door chronologically.
door_data = door_data.sort_values(by=door_key_cols + ['Timestamp']).reset_index(drop=True)

# 2. Assign numerical values: OPEN = 1, CLOSE = -1.
#    Any other 'Value' becomes 0 and therefore does not affect the sum.
door_data['event_val'] = door_data['Value'].map({'OPEN': 1, 'CLOSE': -1}).fillna(0)

# 3. Running balance per door: the net number of OPEN events not yet offset by a CLOSE.
door_data['current_state'] = door_data.groupby(door_key_cols)['event_val'].cumsum()

# --- Detection Logic ---

# 4. A negative balance means a CLOSE arrived with no OPEN left to pair it with.
unmatched_closes = door_data[door_data['current_state'] < 0]
if not unmatched_closes.empty:
    print("Warning: 'CLOSE' events found without a preceding 'OPEN' for the same door:")
    print(unmatched_closes[['SensorID', 'Attribute1', 'Attribute2', 'Value', 'Timestamp', 'current_state']].to_markdown(index=False, numalign="left", stralign="left"))
    # The balance is only reported here, not reset: after a negative dip, the final
    # balance of that door can under-count OPEN events that were never closed.

# 5. A positive final balance means that many OPEN events were never closed.
last_states_per_door = door_data.groupby(door_key_cols).agg(
    final_state=('current_state', 'last'),
    last_timestamp=('Timestamp', 'last')
)

unmatched_open_doors_summary = last_states_per_door[last_states_per_door['final_state'] > 0]

if not unmatched_open_doors_summary.empty:
    print("\nDoor(s) with unmatched 'OPEN' events (not all OPENs have a corresponding CLOSE):")
    print(unmatched_open_doors_summary[['final_state', 'last_timestamp']].to_markdown(numalign="left", stralign="left"))

    # To get the *specific* unmatched OPEN events: if a door's final_state is N > 0,
    # its last N OPEN events (chronologically) are treated as the unmatched ones.
    all_open_events = door_data[door_data['Value'] == 'OPEN']  # filtered once, reused for every door
    unmatched_open_events_list = []
    for (sensor_id, attr1, attr2), row_data in unmatched_open_doors_summary.iterrows():
        num_unmatched = int(row_data['final_state'])

        # OPEN events of this door, newest first. The original read from an undefined
        # name `df` here, which raised NameError as soon as this branch was reached.
        door_open_events = all_open_events[
            (all_open_events['SensorID'] == sensor_id) &
            (all_open_events['Attribute1'] == attr1) &
            (all_open_events['Attribute2'] == attr2)
        ].sort_values('Timestamp', ascending=False)

        # Keep the newest `num_unmatched` OPEN events for the detailed report.
        if not door_open_events.empty:
            unmatched_open_events_list.append(door_open_events.head(num_unmatched))

    if unmatched_open_events_list:
        detailed_unmatched_df = pd.concat(unmatched_open_events_list).sort_values('Timestamp')
        print("\nDetailed list of specific unmatched 'OPEN' events:")
        print(detailed_unmatched_df[['SensorID', 'Attribute1', 'Attribute2', 'Value', 'Timestamp']].to_markdown(index=False, numalign="left", stralign="left"))
elif unmatched_closes.empty:
    # Only claim a fully clean result when neither check found anything.
    print("\nAll 'OPEN' events have a corresponding 'CLOSE' event, and no 'CLOSE' events occurred without a preceding 'OPEN'.")
else:
    print("\nNo door ends the window with unmatched 'OPEN' events, but unmatched 'CLOSE' events were found (see the warning above).")


All 'OPEN' events have a corresponding 'CLOSE' event, and no 'CLOSE' events occurred without a preceding 'OPEN'.


In [15]:
# Inspect the per-door summary, showing at most the first 20 rows.
last_states_per_door.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_state,last_timestamp
SensorID,Attribute1,Attribute2,Unnamed: 3_level_1,Unnamed: 4_level_1
D002,OutsideDoor,FrontDoor,0,2014-04-05 18:42:21.015943


### Calculate the open duration for each OPEN/CLOSE pair, then check whether any door is still open at the end of the day

In [18]:
# 'Timestamp' already holds datetimes (parsed when the one-day window was built),
# so no conversion is needed in this cell.

# Only OPEN and CLOSE rows matter for durations; order them per door, oldest first,
# and renumber the rows from zero.
df_events = (
    door_data.loc[door_data['Value'].isin(['OPEN', 'CLOSE'])]
    .sort_values(by=['SensorID', 'Attribute1', 'Attribute2', 'Timestamp'])
    .reset_index(drop=True)
)

# 3. Define a function to process each door group and find open-close pairs
def calculate_door_durations(group):
    """Pair OPEN and CLOSE events of one door and compute how long it stayed open.

    Each CLOSE is paired with the earliest OPEN that has not been paired yet
    (first in, first out). A CLOSE with no pending OPEN is ignored, and OPEN
    events that are never closed produce no row.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single door, iterated in the order given (the caller sorts
        them by 'Timestamp'). Must provide 'Value' and 'Timestamp' columns, and
        ``group.name`` must be the ``(SensorID, Attribute1, Attribute2)`` tuple,
        as set by ``groupby(...).apply``.

    Returns
    -------
    pandas.DataFrame
        One row per OPEN/CLOSE pair with the columns SensorID, Attribute1,
        Attribute2, OpenTime, CloseTime and Duration. The columns are present
        even when no pair was found, so callers can always select 'Duration'.
    """
    # group.name is a tuple like ('D002', 'OutsideDoor', 'FrontDoor')
    sensor_id, attribute1, attribute2 = group.name

    result_columns = ['SensorID', 'Attribute1', 'Attribute2', 'OpenTime', 'CloseTime', 'Duration']
    pending_opens = []  # timestamps of OPEN events still waiting for a CLOSE
    durations = []

    # Walk the two needed columns directly instead of materialising a Series per row.
    for value, timestamp in zip(group['Value'], group['Timestamp']):
        if value == 'OPEN':
            pending_opens.append(timestamp)
        elif value == 'CLOSE' and pending_opens:
            open_time = pending_opens.pop(0)
            durations.append({
                'SensorID': sensor_id,
                'Attribute1': attribute1,
                'Attribute2': attribute2,
                'OpenTime': open_time,
                'CloseTime': timestamp,
                'Duration': timestamp - open_time
            })

    # Passing `columns` keeps the schema stable when `durations` is empty.
    return pd.DataFrame(durations, columns=result_columns)

# 4. Run the pairing helper once per door. `include_groups=False` keeps the key columns
#    out of each group frame and silences pandas' DeprecationWarning.
all_door_durations = (
    df_events
    .groupby(['SensorID', 'Attribute1', 'Attribute2'])
    .apply(calculate_door_durations, include_groups=False)
)

# Flatten the index left behind by groupby().apply() before printing the table.
if all_door_durations.empty:
    print("No complete OPEN-CLOSE pairs found to calculate durations.")
else:
    all_door_durations = all_door_durations.reset_index(drop=True)
    print("Calculated door open/close durations:")
    print(all_door_durations.to_markdown(index=False, numalign="left", stralign="left"))

# Optional: repeat the unmatched-OPEN check on the filtered events.
# Note that this overwrites `last_states_per_door` and `unmatched_open_doors_summary`
# from the earlier cell with versions that only carry `final_state`.
print("\n--- Checking for Unmatched OPEN Events (for completeness) ---")
df_events['event_val'] = df_events['Value'].map({'OPEN': 1, 'CLOSE': -1})
df_events['current_state'] = (
    df_events
    .groupby(['SensorID', 'Attribute1', 'Attribute2'])['event_val']
    .cumsum()
)

last_states_per_door = (
    df_events
    .groupby(['SensorID', 'Attribute1', 'Attribute2'])
    .agg(final_state=('current_state', 'last'))
)

unmatched_open_doors_summary = last_states_per_door[last_states_per_door['final_state'] > 0]

if unmatched_open_doors_summary.empty:
    print("\nAll 'OPEN' events have a corresponding 'CLOSE' event.")
else:
    print("\nNote: The following doors have 'OPEN' events that were not closed (as identified previously):")
    print(unmatched_open_doors_summary.to_markdown(numalign="left", stralign="left"))

Calculated door open/close durations:
| SensorID   | Attribute1   | Attribute2   | OpenTime                   | CloseTime                  | Duration               |
|:-----------|:-------------|:-------------|:---------------------------|:---------------------------|:-----------------------|
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 09:07:39.246137 | 2014-04-05 09:07:51.505899 | 0 days 00:00:12.259762 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 10:04:49.800871 | 2014-04-05 10:06:48.451280 | 0 days 00:01:58.650409 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:07:45.997261 | 2014-04-05 13:07:57.695227 | 0 days 00:00:11.697966 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:12:58.632338 | 2014-04-05 13:13:11.841618 | 0 days 00:00:13.209280 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:17:23.148399 | 2014-04-05 13:17:42.529245 | 0 days 00:00:19.380846 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:18:02.7

### Check whether any door stayed open longer than 5 minutes

In [19]:

# 'Timestamp' is already a datetime column, so it is used as-is.

# Rebuild the OPEN/CLOSE event table: filter, order per door by time, renumber rows.
df_events = (
    door_data.loc[door_data['Value'].isin(['OPEN', 'CLOSE'])]
    .sort_values(by=['SensorID', 'Attribute1', 'Attribute2', 'Timestamp'])
    .reset_index(drop=True)
)

def calculate_door_durations(group):
    """Pair OPEN and CLOSE events of one door and compute how long it stayed open.

    This shares its name with the helper defined in the earlier duration cell;
    once this cell runs, this definition is the one in effect.

    Each CLOSE is paired with the earliest OPEN that has not been paired yet
    (first in, first out). A CLOSE with no pending OPEN is ignored, and OPEN
    events that are never closed produce no row.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single door, iterated in the order given (the caller sorts
        them by 'Timestamp'). Must provide 'Value' and 'Timestamp' columns, and
        ``group.name`` must be the ``(SensorID, Attribute1, Attribute2)`` tuple,
        as set by ``groupby(...).apply``.

    Returns
    -------
    pandas.DataFrame
        One row per OPEN/CLOSE pair with the columns SensorID, Attribute1,
        Attribute2, OpenTime, CloseTime and Duration. The columns are present
        even when no pair was found, so callers can always select 'Duration'.
    """
    sensor_id, attribute1, attribute2 = group.name

    result_columns = ['SensorID', 'Attribute1', 'Attribute2', 'OpenTime', 'CloseTime', 'Duration']
    pending_opens = []  # timestamps of OPEN events still waiting for a CLOSE
    durations = []

    # Walk the two needed columns directly instead of materialising a Series per row.
    for value, timestamp in zip(group['Value'], group['Timestamp']):
        if value == 'OPEN':
            pending_opens.append(timestamp)
        elif value == 'CLOSE' and pending_opens:
            open_time = pending_opens.pop(0)
            durations.append({
                'SensorID': sensor_id,
                'Attribute1': attribute1,
                'Attribute2': attribute2,
                'OpenTime': open_time,
                'CloseTime': timestamp,
                'Duration': timestamp - open_time
            })

    # Passing `columns` keeps the schema stable when `durations` is empty.
    return pd.DataFrame(durations, columns=result_columns)

# Pair OPEN/CLOSE events per door; include_groups=False keeps the key columns out of each group.
all_door_durations = df_events.groupby(['SensorID', 'Attribute1', 'Attribute2']).apply(
    calculate_door_durations,
    include_groups=False
)

has_durations = not all_door_durations.empty
if has_durations:
    # Flatten the index left behind by groupby().apply() before printing.
    all_door_durations = all_door_durations.reset_index(drop=True)
    print("All calculated door open/close durations:")
    print(all_door_durations.to_markdown(index=False, numalign="left", stralign="left"))
else:
    print("No complete OPEN-CLOSE pairs found to calculate durations.")


# --- Code to get timestamps where door open time is longer than 5 minutes ---

# Threshold above which an open period counts as "long".
five_minutes = pd.Timedelta(minutes=5)

# Keep only the periods that exceed the threshold. When no pair was found the result
# frame may have no 'Duration' column at all, so the column is only touched when
# there is data; otherwise the empty frame is passed through unchanged.
if has_durations:
    long_open_periods = all_door_durations[all_door_durations['Duration'] > five_minutes]
else:
    long_open_periods = all_door_durations

if not long_open_periods.empty:
    print(f"\nDoor Open Periods Longer Than {five_minutes}:")
    print(long_open_periods.to_markdown(index=False, numalign="left", stralign="left"))

    # Only the moments at which each long open period began.
    long_open_timestamps = long_open_periods['OpenTime']

    print(f"\nTimestamps when door open time was longer than {five_minutes}:")
    print(long_open_timestamps.to_markdown(index=False, numalign="left", stralign="left"))
else:
    print(f"\nNo door open periods found that are longer than {five_minutes}.")

All calculated door open/close durations:
| SensorID   | Attribute1   | Attribute2   | OpenTime                   | CloseTime                  | Duration               |
|:-----------|:-------------|:-------------|:---------------------------|:---------------------------|:-----------------------|
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 09:07:39.246137 | 2014-04-05 09:07:51.505899 | 0 days 00:00:12.259762 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 10:04:49.800871 | 2014-04-05 10:06:48.451280 | 0 days 00:01:58.650409 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:07:45.997261 | 2014-04-05 13:07:57.695227 | 0 days 00:00:11.697966 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:12:58.632338 | 2014-04-05 13:13:11.841618 | 0 days 00:00:13.209280 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:17:23.148399 | 2014-04-05 13:17:42.529245 | 0 days 00:00:19.380846 |
| D002       | OutsideDoor  | FrontDoor    | 2014-04-05 13:18: