In [1]:
import pandas as pd
import numpy as np
from dateutil.parser import parse
from datetime import timedelta
import tqdm

In [38]:
# Toy witness reports: one row per report, holding the report time, a
# reliability score, and a weight for each of the regions R1..R4.
records = pd.DataFrame([
    {'time': '2019-12-03 12:01:00', 'reliability': 8, 'R1': 0.2, 'R2': 0.4, 'R3': 0, 'R4': 0.4},
    {'time': '2019-12-03 12:02:00', 'reliability': 7, 'R1': 0.3, 'R2': 0.2, 'R3': 0.4, 'R4': 0.1},
    {'time': '2019-12-03 12:10:01', 'reliability': 9, 'R1': 0.3, 'R2': 0.1, 'R3': 0.2, 'R4': 0.5},
])

In [39]:
# Parse the timestamp strings into real datetimes.
# Assign the column directly instead of through ``.loc[:, 'time']``: on recent
# pandas versions a ``.loc[:, col] = ...`` assignment writes into the existing
# (object-dtype) column in place, so the column would keep dtype ``object``
# instead of becoming ``datetime64[ns]``.
records['time'] = pd.to_datetime(records['time'])

records.head()

Unnamed: 0,time,reliability,R1,R2,R3,R4
0,2019-12-03 12:01:00,8,0.2,0.4,0.0,0.4
1,2019-12-03 12:02:00,7,0.3,0.2,0.4,0.1
2,2019-12-03 12:10:01,9,0.3,0.1,0.2,0.5


In [40]:
# Distinct reliability scores and how many reports carry each one.
np.unique(records['reliability'], return_counts=True)

(array([7, 8, 9]), array([1, 1, 1]))

### Estimate posterior probabilities every 5 min (keep doing this up to T' min)

In [41]:
# change to historical data values
# P(I=1,R1)
def region_prior(region, time):
    """Return the hard-coded prior probability for ``region``.

    Parameters
    ----------
    region : str
        One of ``'R1'``, ``'R2'``, ``'R3'``, ``'R4'``.
    time : any
        Accepted for interface compatibility; the current placeholder
        values do not depend on it.

    Raises
    ------
    KeyError
        If ``region`` is not one of the known region labels.
    """
    # use labels_prv to prevent overfitting
    priors_by_region = dict(R1=0.01, R2=0.03, R3=0.05, R4=0.04)
    return priors_by_region[region]

# Sanity check: total prior mass over every column whose name starts with 'R'.
np.sum([
    region_prior(name, 0)
    for name in np.unique(records.columns)
    if name.startswith('R')
])

0.13

In [42]:
def p(r):
    """Scale a reliability score down by a factor of ten.

    Presumably scores are on a 0-10 scale so the result reads as a
    probability -- TODO confirm against the real data.
    """
    score_scale = 10
    return r / score_scale

In [69]:
def calculate_posterior(df, r, time_step=5):
    """Sequentially update per-region posteriors over fixed time bins.

    The reports in ``df`` are grouped into ``time_step``-minute bins on the
    ``time`` column and every column is multiplied within each bin (a bin
    with no reports therefore has product 1 and leaves the estimate
    unchanged). For the first bin the priors come from ``region_prior``;
    every later bin uses the previous bin's posterior as its prior.

    For each bin, with ``p_w_1`` / ``p_w_0`` the products of the reliability
    ``p(reliability)`` and of its complement:

        P(I=1|w) = p_w_1 * P(I=1) / (p_w_1 * P(I=1) + p_w_0 * P(I=0))

    and the region values are the normalised ``weight * prior`` scaled by
    ``P(I=1|w)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time`` (datetime-like), ``reliability`` and the
        region columns. It is not modified.
    r : str
        Name of the region column whose posterior series is returned.
    time_step : int, default 5
        Bin width in minutes.

    Returns
    -------
    tuple
        ``(bin_index, posterior)`` where ``bin_index`` is the resampled time
        index and ``posterior`` is the posterior series for region ``r``.

    Notes
    -----
    Intermediate values are printed to stdout as a side effect.
    """
    # Build the working frame from the input instead of editing it in place:
    # callers pass slices of a larger frame (which triggers pandas'
    # SettingWithCopyWarning), and dropping ``reliability`` from the caller's
    # frame would make a second call on the same data fail.
    reliability_1 = df['reliability'].apply(p)
    working = df.drop(columns='reliability').assign(
        reliability_1=reliability_1,
        reliability_0=1 - reliability_1,
    )

    # 'min' is the supported minute alias ('T' is deprecated in newer pandas).
    # Cast to float so writing posterior probabilities into a row never hits
    # an integer column.
    matrix = working.set_index('time').resample(f'{time_step}min').prod().astype(float)

    # Split the reliability products off; only region columns stay in matrix.
    p_w_1 = matrix.pop('reliability_1')
    p_w_0 = matrix.pop('reliability_0')
    print(matrix)
    print()

    # Calculate region priors
    prior = np.array([region_prior(c, matrix.index[0]) for c in matrix])
    p_1 = np.sum(prior)
    p_0 = 1 - p_1
    print(f'p(I=1) = {p_1}')
    p_1_w = p_w_1.iloc[0] * p_1 / (p_w_1.iloc[0] * p_1 + p_w_0.iloc[0] * p_0)
    p_0_w = 1 - p_1_w
    print(f'P(I=1|w)={p_1_w}; P(I=0|w)={p_0_w}')
    weighted = matrix.iloc[0, :] * prior
    matrix.iloc[0, :] = p_1_w * (weighted / np.sum(weighted))
    print(f'Posterior (Total): \n{matrix.iloc[0, :]}')

    for ridx in range(1, matrix.shape[0]):
        print('\n' + '_' * 60)
        # The previous bin's posterior becomes this bin's prior.
        prior = matrix.iloc[ridx - 1, :]
        print(f'\nPriors:\n{prior}')
        p_1, p_0 = p_1_w, 1 - p_1_w
        print(f'\np(I=1) = {p_1}')
        p_1_w = p_w_1.iloc[ridx] * p_1 / (p_w_1.iloc[ridx] * p_1 + p_w_0.iloc[ridx] * p_0)
        print(f'P(I=1|w)={p_1_w}; P(I=0|w)={1-p_1_w}')
        weighted = matrix.iloc[ridx, :] * prior
        matrix.iloc[ridx, :] = p_1_w * (weighted / np.sum(weighted))
        print(f'\nPosterior (Total): \n{matrix.iloc[ridx, :]}')

    # Return posteriors
    return matrix.index, matrix.loc[:, r]

In [70]:
def incident_posterior(df, incident_interval=25, region_prefix='R1'):
    """Split reports into incident windows and collect posterior features.

    For every column of ``df`` whose name starts with ``region_prefix``, the
    rows with a positive value in that column are walked in consecutive
    windows of ``incident_interval`` minutes. Each window starts at the
    earliest not-yet-covered report time, is passed to
    ``calculate_posterior``, and yields one feature row per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column plus the columns that
        ``calculate_posterior`` needs.
    incident_interval : int, default 25
        Window length in minutes.
    region_prefix : str, default 'R1'
        Only columns starting with this prefix are processed. The default
        keeps the original behaviour (just ``R1``); pass ``'R'`` to process
        every region column.

    Returns
    -------
    pandas.DataFrame
        One row per (incident, time bin) with the columns ``incident_id``,
        ``start_time``, ``end_time``, ``time``, ``region`` and
        ``posterior_proba``. Empty if no column/row qualifies.
    """
    incident_interval = timedelta(minutes=incident_interval)
    features = []
    incident_id = 0
    for col in tqdm.tqdm(df.columns):
        # Only columns selected by ``region_prefix`` are treated as regions.
        if not col.startswith(region_prefix):
            continue
        df_region = df.loc[df[col] > 0, :]
        # The last report time does not change while the windows advance.
        last_time = df_region.time.max()
        incident_time = df_region.time.min()
        # ``min()`` of an empty selection is missing (NaT); stop explicitly
        # instead of relying on the comparison with a missing value.
        while pd.notna(incident_time) and incident_time <= last_time:
            print(incident_time)
            window_end = incident_time + incident_interval
            in_window = (incident_time <= df_region.time) & (df_region.time < window_end)
            # Hand over an independent copy so downstream column edits never
            # touch (or warn about) the frame this slice was taken from.
            incident_records = df_region[in_window].copy()
            time_steps, posterior_probs = calculate_posterior(incident_records, col)
            features += [
                {
                    'incident_id': incident_id,
                    'start_time': incident_time,
                    'end_time': window_end,
                    'time': time_step,
                    'region': col,
                    'posterior_proba': posterior_proba,
                }
                for time_step, posterior_proba in zip(time_steps, posterior_probs)
            ]
            # Next window starts at the first report at or after this window's end.
            incident_time = df_region.loc[window_end <= df_region.time, 'time'].min()
            incident_id = incident_id + 1
    return pd.DataFrame(features)

# Run the incident windowing on the toy records with the default settings
# and show the first rows of the resulting feature table.
features = incident_posterior(records)

features.head()

100%|██████████| 6/6 [00:00<00:00, 114.46it/s]

2019-12-03 12:01:00
                       R1    R2   R3    R4
time                                      
2019-12-03 12:00:00  0.06  0.08  0.0  0.04
2019-12-03 12:05:00  1.00  1.00  1.0  1.00
2019-12-03 12:10:00  0.30  0.10  0.2  0.50

p(I=1) = 0.13
P(I=1|w)=0.5824; P(I=0|w)=0.41759999999999997
Posterior (Total): 
R1    0.075965
R2    0.303861
R3    0.000000
R4    0.202574
Name: 2019-12-03 12:00:00, dtype: float64

____________________________________________________________

Priors:
R1    0.075965
R2    0.303861
R3    0.000000
R4    0.202574
Name: 2019-12-03 12:00:00, dtype: float64

p(I=1) = 0.5824
P(I=1|w)=0.5824; P(I=0|w)=0.41759999999999997

Posterior (Total): 
R1    0.075965
R2    0.303861
R3    0.000000
R4    0.202574
Name: 2019-12-03 12:05:00, dtype: float64

____________________________________________________________

Priors:
R1    0.075965
R2    0.303861
R3    0.000000
R4    0.202574
Name: 2019-12-03 12:05:00, dtype: float64

p(I=1) = 0.5824
P(I=1|w)=0.926208651399491; P(I=0




Unnamed: 0,incident_id,start_time,end_time,time,region,posterior_proba
0,0,2019-12-03 12:01:00,2019-12-03 12:26:00,2019-12-03 12:00:00,R1,0.075965
1,0,2019-12-03 12:01:00,2019-12-03 12:26:00,2019-12-03 12:05:00,R1,0.075965
2,0,2019-12-03 12:01:00,2019-12-03 12:26:00,2019-12-03 12:10:00,R1,0.136654
