In [None]:
import time
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Global matplotlib text settings; they apply to figures created after this cell
plt.rcParams.update({"font.family": "Linux Libertine O", "font.size": 20})

In [None]:
# Load the merged CSV and index it by its parsed 'time' column
# (the 'time' column itself stays in the frame)
filtered = pd.read_csv('../share/tpms-merged-5.csv')
filtered = filtered.set_index(pd.to_datetime(filtered['time']))

In [None]:
# Fixed seed so that positional picks such as cars_np[5] refer to the same ID
# on every Restart & Run All
CARS_SHUFFLE_SEED = 0

cars = pd.read_csv('../data/cars.csv')
# Flatten the eight per-car ID columns into a single 1-D array of strings
id_columns = [f'id{k}' for k in range(1, 9)]
cars_np = cars[id_columns].to_numpy(dtype=str).flatten()
# Missing cells turn into the literal string 'nan' after the str conversion; drop them
cars_np = cars_np[cars_np != 'nan']
# Shuffle the IDs reproducibly
cars_np = np.random.default_rng(CARS_SHUFFLE_SEED).permutation(cars_np)

In [None]:
def jaccard(
    id1: str, id2: str, dataset: pd.DataFrame, agg: str = "1T"
) -> Tuple[int, int, int, float]:
    """
    Calculate the Jaccard index for a given pair of IDs

    The dataset is bucketed into time windows of width `agg`; an ID counts as
    present in a window when it has at least one row there.

    ## Parameters:
    id1: First id
    id2: Second id
    dataset: Dataset with an "id" column and either a DatetimeIndex or a
        "time" column that can be parsed as datetimes. It is not modified.
    agg: pandas frequency string giving the width of the time window
    ## Returns
    Tuple `(p_A, p_B, p_AiB, jaccard)`:
    number of windows containing id1, number of windows containing id2,
    number of windows containing both, and `p_AiB / (p_A + p_B - p_AiB)`.
    If id1 is not in the dataset the result is `(0, 0, 0, 0.0)`; if only id2
    is missing it is `(p_A, 0, 0, 0.0)`. The tuple always has four items so
    callers can index positions 0..3 unconditionally.
    """

    # Index by time on a derived frame so the caller's DataFrame is never mutated
    if not isinstance(dataset.index, pd.DatetimeIndex):
        dataset = dataset.assign(time=pd.to_datetime(dataset["time"])).set_index("time")

    # Rows: time windows, columns: ids, values: number of rows in that window
    grouped = dataset.groupby([pd.Grouper(freq=agg), "id"]).size().unstack(fill_value=0)

    # An explicit membership test replaces the former bare `except`, so that
    # unrelated errors are no longer swallowed
    if id1 not in grouped.columns:
        return (0, 0, 0, 0.0)
    t1_window = grouped[id1] > 0
    p_A = int(t1_window.sum())

    if id2 not in grouped.columns:
        return (p_A, 0, 0, 0.0)
    t2_window = grouped[id2] > 0
    p_B = int(t2_window.sum())

    # Windows in which both ids were seen
    p_AiB = int((t1_window & t2_window).sum())

    union = p_A + p_B - p_AiB
    return (p_A, p_B, p_AiB, p_AiB / union if union else 0.0)

In [None]:
def find_id_in_window(target_id, target_times, df, window='30S'):
    """
    Count how many of `target_times` have `target_id` present close by.

    ## Parameters:
    target_id: Column of `df` to inspect
    target_times: Iterable of timestamps to centre the search on
    df: Frame indexed by time with one column per id
    window: Half-width of the search interval, parsed with `pd.Timedelta`
    ## Returns
    Number of timestamps `t` for which column `target_id` holds at least one
    truthy value strictly inside `(t - window, t + window)`.
    """
    half_width = pd.Timedelta(window)
    stamps = df.index

    hits = 0
    for moment in target_times:
        # Open interval: rows exactly on either boundary are excluded
        nearby = (stamps > moment - half_width) & (stamps < moment + half_width)
        hits += df.loc[nearby, target_id].any()

    return hits

In [None]:
# Per-second row counts: one row per second, one column per id, restricted to
# (and ordered like) the ids listed in cars_np
grouped = (
    filtered
    .groupby([pd.Grouper(freq='1S'), "id"])
    .size()
    .unstack(fill_value=0)
    .loc[:, cars_np]
)
# Seconds in which the sixth shuffled id was seen
t1 = grouped[grouped[cars_np[5]] > 0].index
# Sanity check: look the same id up around its own timestamps
find_id_in_window(cars_np[5], t1, grouped, window='30S')

In [None]:
def jaccard2(
    id1: str, id2: str, dataset: pd.DataFrame, agg: str = "30S"
) -> Tuple[int, int, int, float]:
    """
    Calculate a windowed Jaccard-style index for a given pair of IDs

    Presence is evaluated per second. The overlap is the number of seconds in
    which id1 was seen and id2 was seen within +/- `agg` of that second
    (see `find_id_in_window`).

    ## Parameters:
    id1: First id
    id2: Second id
    dataset: Dataset with an "id" column, indexed by a DatetimeIndex
    agg: Half-width of the matching window, parsed with `pd.Timedelta`
    ## Returns
    Tuple `(p_A, p_B, p_AiB, score)`:
    seconds containing id1, seconds containing id2, the windowed overlap
    described above, and `p_AiB / (p_A + p_B - p_AiB)`.
    If id1 is not in the dataset the result is `(0, 0, 0, 0.0)`; if only id2
    is missing it is `(p_A, 0, 0, 0.0)`. The tuple always has four items.
    """

    # Per-second counts; "1s" is the non-deprecated spelling of the former "1S" alias
    grouped = dataset.groupby([pd.Grouper(freq="1s"), "id"]).size().unstack(fill_value=0)

    # An explicit membership test replaces the former bare `except`, so that
    # unrelated errors are no longer swallowed
    if id1 not in grouped.columns:
        return (0, 0, 0, 0.0)
    t1_window = grouped[id1] > 0
    t1 = grouped[t1_window].index  # seconds in which id1 was seen
    p_A = int(t1_window.sum())

    if id2 not in grouped.columns:
        return (p_A, 0, 0, 0.0)
    p_B = int((grouped[id2] > 0).sum())

    # Seconds of id1 that have id2 within +/- agg
    p_AiB = int(find_id_in_window(id2, t1, grouped, window=agg))

    union = p_A + p_B - p_AiB
    return (p_A, p_B, p_AiB, p_AiB / union if union else 0.0)

In [None]:
# Spot check of the windowed index on two of the shuffled ids
jaccard2(id1=cars_np[8], id2=cars_np[10], dataset=filtered, agg='30S')

## Robust

In [None]:
# [pandas frequency string, human-readable label] for each window width under study
grouping_analysis = [
    ['5S', '5 sec.'],
    ['10S', '10 sec.'],
    ['30S', '30 sec.'],
    ['1T', '1 min.'],
    ['2T', '2 min.'],
    ['5T', '5 min.'],
]

In [None]:
selected = 0

# Window width for this run, taken from grouping_analysis
agg_window = grouping_analysis[selected][0]
n_ids = len(cars_np)

# Square result matrices; only the upper triangle (j >= i) is filled here
scores = np.zeros((n_ids, n_ids))
countm = np.zeros((n_ids, n_ids))
countn = np.zeros((n_ids, n_ids))
overlp = np.zeros((n_ids, n_ids))

for i in range(n_ids):
    id_i = cars_np[i].upper()
    for j in range(i, n_ids):
        jac = jaccard(id_i, cars_np[j].upper(), filtered, agg=agg_window)
        countm[i,j] = jac[0]  # windows containing the first id
        countn[i,j] = jac[1]  # windows containing the second id
        overlp[i,j] = jac[2]  # windows containing both
        scores[i,j] = jac[3]  # Jaccard index

In [None]:
# Express the scores in dB (with a small offset to avoid log(0)) and floor them at -30
scores_db = np.maximum(10*np.log10(scores+1e-6), -30)

# Copy each matrix's upper triangle into the entries below the diagonal
i_lower = np.tril_indices(scores_db.shape[0], -1)
for _matrix in (scores, scores_db, countm, countn, overlp):
    _matrix[i_lower] = _matrix.T[i_lower]

# Working copy of the `grouped` table built in an earlier cell. Its columns are
# dropped in step with the rows/columns deleted from scores_db below.
# NOTE(review): assumes the columns of `grouped` are in the same order as the
# rows of scores_db — confirm.
pivot_aux = grouped.copy()

# Starting cut-off for scores_db values; lowered by 2.5 after every full pass
corr_threshold = -10.0
# Let's get all cars
# cars_ids collects incomplete groups (padded with ''); cars_dic maps a running
# car number to each group of four ids that has been accepted.
cars_ids = np.array(['','','',''], dtype=str)
cars_dic = {}

car_num = 0
# Keep scanning with a progressively lower threshold until it reaches -30 or
# at most one row of scores_db is left
while (corr_threshold > -30.0) and (scores_db.shape[0] > 1):
    i = 0
    while i < scores_db.shape[0]:
        row = scores_db[i,:]
        best_ids = row[row > corr_threshold]
        if len(best_ids) > 1:
            # Positions of the highest values in this row, best first, capped at four
            actual_ids = np.argsort(row)[::-1][0:min([4,len(best_ids)])]
            id_values = pivot_aux.columns[actual_ids]
            # Flat mask over cars_ids: which stored entries are among id_values.
            # NOTE(review): np.in1d is deprecated in NumPy 2.0 in favour of np.isin.
            intersection = np.in1d(cars_ids, id_values)

            # True when at least one stored entry is NOT among id_values.
            # NOTE(review): cars_ids appears to always contain '' placeholders, so
            # this looks always true and the `else` branch below looks unreachable —
            # confirm whether `~intersection.any()` was intended.
            if (~intersection).any():
                if len(id_values) < 4:
                    # Pad short groups with '' up to four entries
                    filler = [''] * (4-len(id_values))
                    id_values = np.append(id_values,filler)

                if ~(id_values == '').any():
                    # No padding present: accept the four ids as one car and
                    # remove them from both the table and the score matrix
                    pivot_aux = pivot_aux.drop(id_values,axis=1)
                    scores_db = np.delete(scores_db, actual_ids, axis=0)
                    scores_db = np.delete(scores_db, actual_ids, axis=1)
                    cars_dic[car_num] = id_values.to_numpy()
                    car_num+=1
                else:
                    # Incomplete group: remember it as a new row of cars_ids
                    cars_ids = np.vstack((cars_ids,id_values))
            else:
                intersection_opposite = np.in1d(id_values, cars_ids)
                if (~intersection_opposite).any():
                    # NOTE(review): `.any()` is applied before `~`, so this indexes with a
                    # single boolean rather than a mask — `id_values[~intersection_opposite]`
                    # was probably intended; confirm.
                    missing_ids = id_values[~intersection_opposite.any()].to_numpy()
                    remaining = 4-len(id_values)

                    # Row of cars_ids sharing the most entries with id_values
                    data_row = np.argmax(intersection.reshape(-1,4).sum(axis=1))
                    cars_ids[data_row,remaining:] = missing_ids[:remaining]

                    if ~(cars_ids[data_row,:] == '').any():
                        found_car = cars_ids[data_row,:]
                        pivot_aux = pivot_aux.drop(id_values,axis=1)
                        scores_db = np.delete(scores_db, actual_ids, axis=0)
                        scores_db = np.delete(scores_db, actual_ids, axis=1)

                        cars_dic[car_num] = cars_ids[data_row,:]
                        car_num += 1

                        cars_ids = np.delete(cars_ids, data_row, axis=0)
                        # NOTE(review): cars_ids is a NumPy array, which has no `.isin`
                        # method; np.isin(cars_ids, found_car) was probably intended — confirm.
                        cars_ids[cars_ids.isin(found_car)] = ''
        # NOTE(review): i advances even when rows were deleted above, so a row that
        # shifted into position i may not be examined during this pass.
        i += 1
    
    corr_threshold -= 2.5

## Naive

In [None]:
# [pandas frequency string, human-readable label] for each window width under study
grouping_analysis = [
    ['5S', '5 sec.'],
    ['10S', '10 sec.'],
    ['30S', '30 sec.'],
    ['1T', '1 min.'],
    ['2T', '2 min.'],
    ['5T', '5 min.'],
]

In [None]:
selected = 4

# Window width for this run, taken from grouping_analysis
agg_window = grouping_analysis[selected][0]
n_ids = len(cars_np)

# Square result matrices; only the upper triangle (j >= i) is filled here
scores = np.zeros((n_ids, n_ids))
countm = np.zeros((n_ids, n_ids))
countn = np.zeros((n_ids, n_ids))
overlp = np.zeros((n_ids, n_ids))

for i in range(n_ids):
    id_i = cars_np[i].upper()
    for j in range(i, n_ids):
        jac = jaccard(id_i, cars_np[j].upper(), filtered, agg=agg_window)
        countm[i,j] = jac[0]  # windows containing the first id
        countn[i,j] = jac[1]  # windows containing the second id
        overlp[i,j] = jac[2]  # windows containing both
        scores[i,j] = jac[3]  # Jaccard index

In [None]:
# Express the scores in dB (with a small offset to avoid log(0)) and floor them at -30
scores_db = np.maximum(10*np.log10(scores+1e-6), -30)

# Copy each matrix's upper triangle into the entries below the diagonal
i_lower = np.tril_indices(scores_db.shape[0], -1)
for _matrix in (scores, scores_db, countm, countn, overlp):
    _matrix[i_lower] = _matrix.T[i_lower]

# Working copy of the `grouped` table built in an earlier cell. Its columns are
# dropped in step with the rows/columns deleted from scores_db below.
# NOTE(review): assumes the columns of `grouped` are in the same order as the
# rows of scores_db — confirm.
pivot_aux = grouped.copy()

# Single fixed cut-off for scores_db values (one pass only, no threshold sweep)
corr_threshold = -20.0
# Let's get all cars
# cars_ids collects incomplete groups (padded with ''); cars_dic maps a running
# car number to each group of four ids that has been accepted.
cars_ids = np.array(['','','',''], dtype=str)
cars_dic = {}

car_num = 0
i = 0
while i < scores_db.shape[0]:
    row = scores_db[i,:]
    best_ids = row[row > corr_threshold]
    if len(best_ids) > 1:
        # Positions of the highest values in this row, best first, capped at four
        actual_ids = np.argsort(row)[::-1][0:min([4,len(best_ids)])]
        id_values = pivot_aux.columns[actual_ids]
        # Flat mask over cars_ids: which stored entries are among id_values.
        # NOTE(review): np.in1d is deprecated in NumPy 2.0 in favour of np.isin.
        intersection = np.in1d(cars_ids, id_values)

        # True when at least one stored entry is NOT among id_values.
        # NOTE(review): cars_ids appears to always contain '' placeholders, so
        # this looks always true and the `else` branch below looks unreachable —
        # confirm whether `~intersection.any()` was intended.
        if (~intersection).any():
            if len(id_values) < 4:
                # Pad short groups with '' up to four entries
                filler = [''] * (4-len(id_values))
                id_values = np.append(id_values,filler)

            if ~(id_values == '').any():
                # No padding present: accept the four ids as one car and
                # remove them from both the table and the score matrix
                pivot_aux = pivot_aux.drop(id_values,axis=1)
                scores_db = np.delete(scores_db, actual_ids, axis=0)
                scores_db = np.delete(scores_db, actual_ids, axis=1)
                cars_dic[car_num] = id_values.to_numpy()
                car_num+=1
            else:
                # Incomplete group: remember it as a new row of cars_ids
                cars_ids = np.vstack((cars_ids,id_values))
        else:
            intersection_opposite = np.in1d(id_values, cars_ids)
            if (~intersection_opposite).any():
                # NOTE(review): `.any()` is applied before `~`, so this indexes with a
                # single boolean rather than a mask — `id_values[~intersection_opposite]`
                # was probably intended; confirm.
                missing_ids = id_values[~intersection_opposite.any()].to_numpy()
                remaining = 4-len(id_values)

                # Row of cars_ids sharing the most entries with id_values
                data_row = np.argmax(intersection.reshape(-1,4).sum(axis=1))
                cars_ids[data_row,remaining:] = missing_ids[:remaining]

                if ~(cars_ids[data_row,:] == '').any():
                    found_car = cars_ids[data_row,:]
                    pivot_aux = pivot_aux.drop(id_values,axis=1)
                    scores_db = np.delete(scores_db, actual_ids, axis=0)
                    scores_db = np.delete(scores_db, actual_ids, axis=1)

                    cars_dic[car_num] = cars_ids[data_row,:]
                    car_num += 1

                    cars_ids = np.delete(cars_ids, data_row, axis=0)
                    # NOTE(review): cars_ids is a NumPy array, which has no `.isin`
                    # method; np.isin(cars_ids, found_car) was probably intended — confirm.
                    cars_ids[cars_ids.isin(found_car)] = ''
    # NOTE(review): i advances even when rows were deleted above, so a row that
    # shifted into position i may not be examined.
    i += 1