In [None]:
import time
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Global matplotlib text settings; they apply to every figure drawn afterwards.
plt.rcParams.update({
    "font.family": "Linux Libertine O",
    "font.size": 20,
})

In [None]:
# `files` picks the input CSV: an int selects a "merged" file, any other
# value is interpolated into the plain per-capture file name.
files = 5
csv_path = (
    f'../share/tpms-merged-{files}.csv'
    if isinstance(files, int)
    else f'../share/tpms-{files}.csv'
)
filtered = pd.read_csv(csv_path)

# Index the rows by the parsed 'time' column so time-based grouping works.
filtered.index = pd.to_datetime(filtered['time'])

In [None]:
cars = pd.read_csv('../data/cars.csv')

# Flatten the id1..id8 columns into a single 1-D array of strings.
id_columns = ['id1','id2','id3','id4','id5','id6','id7','id8']
cars_np = cars[id_columns].to_numpy(dtype=str).flatten()

# Missing cells are stringified as 'nan' by the cast above; drop them.
cars_np = cars_np[cars_np != 'nan']

In [None]:
def jaccard(
    id1: str, id2: str, dataset: pd.DataFrame, agg: str = "1min"
) -> Tuple[int, int, int, float]:
    """
    Calculate the Jaccard index for a given pair of IDs.

    Time is cut into consecutive windows of width `agg`. An ID "occupies" a
    window when at least one row carrying that ID falls inside it. The index
    is the number of windows occupied by both IDs divided by the number of
    windows occupied by at least one of them.

    ## Parameters:
    id1: First id
    id2: Second id
    dataset: Frame with an "id" column. It must either be indexed by a
        DatetimeIndex or have a "time" column that `pd.to_datetime` can
        parse. The frame passed in is never modified.
    agg: pandas offset alias giving the window width (e.g. "30s", "1min")
    ## Returns
    A 4-tuple `(p_A, p_B, p_AiB, jaccard)`:
    windows occupied by `id1`, windows occupied by `id2`, windows occupied by
    both, and the resulting index. When `id1` is absent the result is
    `(0, 0, 0, 0.0)`; when only `id2` is absent it is `(p_A, 0, 0, 0.0)`.
    """

    # Build a time-indexed view without touching the caller's frame.
    if not isinstance(dataset.index, pd.DatetimeIndex):
        dataset = dataset.assign(time=pd.to_datetime(dataset["time"])).set_index("time")

    # Rows: time windows, columns: IDs, values: number of rows in that cell.
    grouped = dataset.groupby([pd.Grouper(freq=agg), "id"]).size().unstack(fill_value=0)

    # An ID that never appears has no column; report it with the same
    # 4-tuple shape as the normal path so callers can always index [3].
    if id1 not in grouped.columns:
        return (0, 0, 0, 0.0)
    t1_window = grouped[id1] > 0
    p_A = int(t1_window.sum())

    if id2 not in grouped.columns:
        return (p_A, 0, 0, 0.0)
    t2_window = grouped[id2] > 0
    p_B = int(t2_window.sum())

    p_AiB = int((t1_window & t2_window).sum())

    # |A ∪ B| = |A| + |B| - |A ∩ B|; guard the degenerate empty union.
    union = p_A + p_B - p_AiB
    if union == 0:
        return (p_A, p_B, p_AiB, 0.0)

    return (p_A, p_B, p_AiB, p_AiB / union)

In [None]:
def find_id_in_window(target_id, target_times, df, window='30s'):
    """
    Count how many reference instants have `target_id` nearby in time.

    ## Parameters:
    target_id: Column of `df` to look for
    target_times: Iterable of timestamps to centre the search on
    df: Frame indexed by time whose `target_id` column is truthy wherever the
        ID was observed
    window: Half-width of the search interval, as a `pd.Timedelta` string
    ## Returns
    Number of timestamps in `target_times` for which at least one row of `df`
    strictly inside `(t - window, t + window)` has a truthy `target_id` value.
    """
    half_width = pd.Timedelta(window)

    count = 0
    for moment in target_times:
        # Both bounds are exclusive: rows exactly `window` away do not count.
        nearby = (df.index > moment - half_width) & (df.index < moment + half_width)
        count += int(df.loc[nearby, target_id].any())

    return count

In [None]:
# Count rows per (one-second bin, id): rows are seconds, columns are IDs.
# Lower-case 's' is used because the upper-case 'S' alias is deprecated.
grouped = filtered.groupby([pd.Grouper(freq='1s'), "id"]).size().unstack(fill_value=0)

# Seconds in which the ID at cars_np[1] was seen at least once.
t1 = grouped[grouped[cars_np[1]] > 0].index

# How many of those seconds have the ID at cars_np[0] within 30 s either side.
find_id_in_window(cars_np[0], t1, grouped, window='30s')

In [None]:
def jaccard2(
    id1: str, id2: str, dataset: pd.DataFrame, agg: str = "30s"
) -> Tuple[int, int, int, float]:
    """
    Calculate a sliding-window Jaccard index for a given pair of IDs.

    Rows are first binned per second. The overlap is the number of seconds in
    which `id1` was seen and `id2` was also seen within `agg` on either side,
    as counted by `find_id_in_window`.

    ## Parameters:
    id1: First id
    id2: Second id
    dataset: Frame with an "id" column. It must either be indexed by a
        DatetimeIndex or have a "time" column that `pd.to_datetime` can
        parse. The frame passed in is never modified.
    agg: Half-width of the search window, as a `pd.Timedelta` string
    ## Returns
    A 4-tuple `(p_A, p_B, p_AiB, jaccard)`:
    seconds with `id1`, seconds with `id2`, the overlap count described above,
    and `p_AiB / (p_A + p_B - p_AiB)`. When `id1` is absent the result is
    `(0, 0, 0, 0.0)`; when only `id2` is absent it is `(p_A, 0, 0, 0.0)`.
    """

    # Build a time-indexed view without touching the caller's frame.
    if not isinstance(dataset.index, pd.DatetimeIndex):
        dataset = dataset.assign(time=pd.to_datetime(dataset["time"])).set_index("time")

    # Rows: one-second bins, columns: IDs, values: number of rows in that cell.
    grouped = dataset.groupby([pd.Grouper(freq='1s'), "id"]).size().unstack(fill_value=0)

    # An ID that never appears has no column; report it with the same
    # 4-tuple shape as the normal path so callers can always index [3].
    if id1 not in grouped.columns:
        return (0, 0, 0, 0.0)
    t1_window = grouped[id1] > 0
    t1 = grouped[t1_window].index
    p_A = int(t1_window.sum())

    if id2 not in grouped.columns:
        return (p_A, 0, 0, 0.0)
    t2_window = grouped[id2] > 0
    p_B = int(t2_window.sum())

    p_AiB = int(find_id_in_window(id2, t1, grouped, window=agg))

    # Guard the degenerate case so the ratio never divides by zero.
    union = p_A + p_B - p_AiB
    if union == 0:
        return (p_A, p_B, p_AiB, 0.0)

    return (p_A, p_B, p_AiB, p_AiB / union)

In [None]:
# Spot check of the sliding-window variant on one pair of IDs
# (lower-case 's' because the upper-case 'S' unit is deprecated in pandas).
jaccard2(cars_np[8], cars_np[10], filtered, agg='30s')

In [None]:
# Candidate aggregation windows as [pandas offset alias, human-readable label].
# The 's' / 'min' spellings replace the deprecated 'S' / 'T' aliases.
grouping_analysis = [
    ['5s', '5 sec.'],
    ['10s', '10 sec.'],
    ['30s', '30 sec.'],
    ['1min', '1 min.'],
    ['2min', '2 min.'],
]

In [None]:
selected = 3

# One square matrix per value returned by jaccard(); only the upper triangle
# (diagonal included) is filled in this cell.
scores = np.zeros((len(cars_np),len(cars_np)))
countm = np.zeros((len(cars_np),len(cars_np)))
countn = np.zeros((len(cars_np),len(cars_np)))
overlp = np.zeros((len(cars_np),len(cars_np)))

window_alias = grouping_analysis[selected][0]
ids_upper = [sensor.upper() for sensor in cars_np]

# np.triu_indices walks the upper triangle row by row, i.e. every (i, j) with j >= i.
for i, j in zip(*np.triu_indices(len(cars_np))):
    jac = jaccard(ids_upper[i], ids_upper[j], filtered, agg=window_alias)
    # Result positions 0..3 go to countm, countn, overlp and scores, in that order.
    for slot, matrix in enumerate((countm, countn, overlp, scores)):
        matrix[i,j] = jac[slot]

In [None]:
# Scores in decibels; the 1e-6 offset keeps log10 finite for zero scores,
# and anything below -30 dB is raised to that floor.
scores_db = np.maximum(10*np.log10(scores+1e-6), -30)

# Mirror the upper triangle into the strictly-lower one so every matrix is symmetric.
i_lower = np.tril_indices(scores_db.shape[0], -1)
for matrix in (scores, scores_db, countm, countn, overlp):
    matrix[i_lower] = matrix.T[i_lower]

In [None]:
# Heatmap of the dB-scaled score matrix.
f, ax = plt.subplots(figsize=(15, 10))
im = ax.imshow(scores_db, cmap='inferno')

# Both axes are left without tick marks.
ax.set(xticks=[], yticks=[])
# ax.set_title(f'Aggregation: {grouping_analysis[selected][1]}')

cbar = f.colorbar(im, ax=ax)
cbar.set_label('Log Probability')

# plt.savefig(f'../pics/correlation/jaccard/jaccard_{grouping_analysis[selected][0]}_n{files}.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Wheel label keyed by 100 * (position / 4 - position // 4), i.e. the
# fractional part of position / 4 expressed in hundredths.
wheel_loc = {
    0: 'FL',
    25: 'FR',
    50: 'RR',
    75: 'RL',
}

In [None]:
index = 3*4 + 0


def _wheel_label(position):
    """Format a flat position as 'C<n>-<wheel>': position // 4 + 1 and the wheel_loc entry for position % 4."""
    car, wheel = divmod(int(position), 4)
    return f'C{car + 1}-{wheel_loc[wheel * 25]}'


# Positions of the five highest scores in row `index`, best first.
sorted_indices = np.argsort(scores[index,:])
top5 = np.flip(sorted_indices)[0:5]

# One row per metric; the positions themselves are stored as the last row
# (and therefore become floats in the stacked array).
jaccs = np.array([
    scores[index, top5],
    countm[index, top5],
    countn[index, top5],
    overlp[index, top5],
    top5,
])

table = pd.DataFrame(jaccs.T, columns=['Jacc', 'M', 'N', 'Overlap', 'CarF'])
table['Target'] = _wheel_label(index)
table['Other'] = table['CarF'].apply(_wheel_label)

# Row 0 is skipped in the displayed view.
table.loc[1:, ['Target','Other','M','N','Overlap','Jacc']]

In [None]:
# Copy every row except the first to the system clipboard.
table.loc[1:].to_clipboard()

## Timing Analysis

In [None]:
# Candidate aggregation windows as [pandas offset alias, human-readable label].
# The 's' / 'min' spellings replace the deprecated 'S' / 'T' aliases.
grouping_analysis = [
    ['5s', '5 sec.'],
    ['10s', '10 sec.'],
    ['30s', '30 sec.'],
    ['1min', '1 min.'],
    ['2min', '2 min.'],
]

In [None]:
selected = 0
n_runs = 100  # number of timed repetitions of the full pairwise computation
t = []

for _ in range(n_runs):
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump with system clock changes.
    start = time.perf_counter()
    scores = np.zeros((len(cars_np),len(cars_np)))
    countm = np.zeros((len(cars_np),len(cars_np)))
    countn = np.zeros((len(cars_np),len(cars_np)))
    overlp = np.zeros((len(cars_np),len(cars_np)))
    # The repetition counter is deliberately not named `i`, so the pair
    # indices below no longer clobber it.
    for i in range(len(cars_np)):
        for j in range(i,len(cars_np)):
            jac = jaccard(cars_np[i].upper(), cars_np[j].upper(), filtered, agg=grouping_analysis[selected][0])
            countm[i,j] = jac[0]
            countn[i,j] = jac[1]
            overlp[i,j] = jac[2]
            scores[i,j] = jac[3]
    ending = time.perf_counter()
    t.append(ending - start)

# Post-processing is outside the timed region, so it is done once on the
# matrices left by the last repetition instead of on every repetition.
scores_db = 10*np.log10(scores+1e-6)
scores_db[scores_db < -30] = -30

# Mirror the upper triangle into the strictly-lower one.
i_lower = np.tril_indices(scores_db.shape[0], -1)
scores[i_lower] = scores.T[i_lower]
scores_db[i_lower] = scores_db.T[i_lower]
countm[i_lower] = countm.T[i_lower]
countn[i_lower] = countn.T[i_lower]
overlp[i_lower] = overlp.T[i_lower]

In [None]:
# Average wall-clock seconds per repetition of the pairwise computation.
np.asarray(t).mean()