The dataset contains 4 folders, each containing an .svg file of the map and a .txt file with the trajectory data. Each record consists of the tag_id in alphanumeric format, the date and time in the format "YYYY-MM-DD HH:MM:SS", the x coordinate as a decimal value, the y coordinate given up to the second decimal place, and a description of the tag.

Path to the dataset is set here

In [None]:
# Raw trajectory file analysed in this notebook (read below with sep=";").
path = "data/german_1/german_1.txt"
# Directory that receives the per-tag files generated further down.
output_dir = "output/"
# File written by trajectory_data_generation.
path_new = "output/german_1_new.txt"
# File written by trajectory_data_generation_without_resting.
path_new_without_rest = "output/german_1_new_without_rest.txt"

In [None]:
# Importing libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns
from tqdm.notebook import tqdm
from matplotlib.pyplot import figure
figure(figsize=(18, 12), dpi=100)
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from datetime import date
# reading data
data = pd.read_csv(path, sep=";")

In [None]:
data.shape

In [None]:
data.tag_id = data.tag_id.astype('category') #changing to categorical data
data.time = pd.to_datetime(data.time)

In [None]:
data.info()

Removing Duplicate Entries

total duplicate rows

In [None]:
data.duplicated().sum()

In [None]:
data = data.drop_duplicates()

Removing duplicate entries based on time.

In [None]:
data.duplicated(subset=['time',"tag_id"]).sum()

In [None]:
print(data[data.duplicated(subset=['time',"tag_id"])]) #delete duplicate enteries with the same tag_id and timestamp

In [None]:
data = data.drop_duplicates(subset=['time',"tag_id"], keep='first')

In [None]:
def plotDataGermanStore1(ax, plotFunc = None):
    """Draw the store floor plan on ``ax`` as the background of a plot.

    The image is stretched over x in [0, 45.6] and y in [0, 26.7], the x ticks
    are moved to the top edge, and the y limits are given from high to low so
    the y axis points downwards.

    ``plotFunc`` is accepted but not used by this function.
    """
    floor_plan = mpimg.imread('data/german_1/storePlan_fix.jpg')

    ax.xaxis.tick_top()
    ax.imshow(
        floor_plan,
        extent=[0, 45.6, 0, 26.7],
        origin="lower",
        aspect="equal",
    )

    # View limits are set after the image is drawn.
    ax.set(xlim=(-1, 44.56), ylim=(26.40, -1))

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

plotDataGermanStore1(ax) #ploting raw data
ax.scatter(data.x,data.y, s = 0.00005)
plt.show()

Describing the data

In [None]:
print("Description")
print(data.describe())

Describing data with non-negative values.

In [None]:
non_neg_data = data[(data['x'] >= 0) & (data['y'] >= 0)]

In [None]:
print("Description")
print(non_neg_data.describe())

Printing the unique tag_id present in the dataset

In [None]:
print(data['tag_id'].unique())

In [None]:
print("Unique tag_ids in this dataset are: ",data['tag_id'].unique().shape[0])

Histograms for the coordinate values

In [None]:
plt.figure(figsize=(12,6))
sns.histplot(data['x'])
plt.show()

In [None]:
negative_x = data[data['x'] < 0]

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(negative_x['x'], ax=axs[0])
axs[0].set_title("Histogram of x coordinate for x < 0")
sns.histplot(negative_x['y'], ax=axs[1])
axs[1].set_title("Histogram of y coordinate for x < 0")

plt.show()


In [None]:
plt.figure(figsize=(12, 6))
sns.histplot(data['y'])
plt.show()

In [None]:
negative_y = data[data['y'] < 0]

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(negative_y['x'], ax=axs[0])
axs[0].set_title("Histogram of x coordinate for y < 0")
sns.histplot(negative_y['y'], ax=axs[1])
axs[1].set_title("Histogram of y coordinate for y < 0")

plt.show()


Scatter plots for understanding the distribution

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

plotDataGermanStore1(ax) #ploting raw data
ax.scatter(data.x,data.y, s = 0.00005)
plt.show()

Scatter plot for x and y with different hue for tag_ids

Plotting separate for each tag_id

In [None]:
tag_id_num = 6 #number of tag ids to make the graphs

# Sample without replacement so the same tag is never plotted twice, and never
# request more tags than the dataset actually contains.
available_tag_ids = np.asarray(data['tag_id'].unique())
tag_id_num = min(tag_id_num, len(available_tag_ids))
selected_tag_ids = np.random.choice(available_tag_ids, tag_id_num, replace=False)

# Two panels per row; the figure height shrinks as more tags are requested.
n_rows = int(np.ceil(len(selected_tag_ids) / 2))
fig, ax = plt.subplots(nrows=n_rows, ncols=2, figsize=(20, 100/len(selected_tag_ids)))
ax = ax.ravel()
for i, tag_id in enumerate(selected_tag_ids):
    plotDataGermanStore1(ax[i]) #ploting raw data
    tag_data = data[data['tag_id'] == tag_id]
    ax[i].scatter(x=tag_data.x, y=tag_data.y, s = 0.03, label=tag_id)
    ax[i].set_title(f"Tag ID: {tag_id}")

# With an odd number of tags the last grid cell has nothing to show.
for unused_ax in ax[len(selected_tag_ids):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

Generating an independent file for each tag id

In [None]:
import os

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

tag_ids = data['tag_id'].unique()
# Iterating over all tag_ids to generate separate, chronologically sorted files
for tag_id in tag_ids:
    tag_data = data[data['tag_id'] == tag_id]
    tag_data = tag_data.sort_values(by="time")
    # os.path.join avoids the doubled separator that plain concatenation
    # produces when output_dir already ends with "/".
    tag_data.to_csv(os.path.join(output_dir, f'{tag_id}.txt'), index=False)

Function to plot x, y coordinates for any tag_id on a particular date

In [None]:
def plot_trajectory(tag_id, date, data):
    """Scatter the positions of one tag on one calendar day over the floor plan.

    Args:
        tag_id: Value matched against the ``tag_id`` column.
        date: ``datetime.date`` compared with the date part of ``time``.
        data: DataFrame with ``tag_id``, ``time`` (datetime), ``x`` and ``y``.
    """
    same_tag = data['tag_id'] == tag_id
    same_day = data['time'].dt.date == date
    day_points = data.loc[same_tag & same_day]

    fig, ax = plt.subplots(figsize=(12,6))
    plotDataGermanStore1(ax)
    ax.scatter(day_points['x'], day_points['y'], s = 0.5)
    ax.set_title(f'Trajectory for tag {tag_id} on {date}')
    plt.show()

In [None]:
# setting variable for random display of data
tag_id = "0x00206001289E"

from datetime import date
plot_trajectory(tag_id, date(2019,8,7), data)

In [None]:
def getTrajectory(traj_data):
    """Draw one trajectory as a line over the floor plan.

    The points are connected in chronological order; the first point is marked
    with a green circle and the last one with a yellow circle. The title is
    built from the first (earliest) row of ``traj_data`` itself, so it no
    longer depends on a notebook-global ``tag_id``.

    Args:
        traj_data: DataFrame with ``tag_id``, ``time`` (datetime), ``x`` and
            ``y`` columns. If several tags are present, the title shows the
            tag of the earliest row.

    Raises:
        ValueError: If ``traj_data`` has no rows.
    """
    if traj_data.empty:
        raise ValueError("traj_data contains no points to plot")

    traj_data = traj_data.sort_values(by="time")
    start = traj_data.iloc[0]
    end = traj_data.iloc[-1]

    fig, ax = plt.subplots(figsize=(12,6))
    plotDataGermanStore1(ax)

    ax.plot(traj_data['x'], traj_data['y'])

    # Start and end markers (radius 0.5 in data units).
    ax.add_patch(plt.Circle((start["x"], start["y"]), 0.5, color="green"))
    ax.add_patch(plt.Circle((end["x"], end["y"]), 0.5, color="yellow"))

    ax.set_title(f'Trajectory for tag {start["tag_id"]} on {start["time"].date()}')
    plt.show()

In [None]:
getTrajectory(data[(data['tag_id'] == tag_id) & (data['time'].dt.date == date(2019,8,7))])

Converting SVG file to PNG

In [None]:
!pip install svg.path

In [None]:
pip install cairosvg

In [None]:
from svg.path import parse_path
import cairosvg

def read_svg(svg_file):
    """Return the full text of the SVG file at ``svg_file``.

    The file is decoded as UTF-8 (the XML default) instead of the platform's
    default encoding, so non-ASCII labels read the same on every machine.
    """
    with open(svg_file, 'r', encoding='utf-8') as f:
        return f.read()

def convert_svg_to_image(svg_file, image_file):
    """Render the SVG stored at ``svg_file`` to a PNG written to ``image_file``."""
    cairosvg.svg2png(bytestring=read_svg(svg_file), write_to=image_file)

svg_file = "/kaggle/input/indoor-store-trajectory-dataset/german_4/german_4.svg"
image_file = "/kaggle/working/german_4.png"
convert_svg_to_image(svg_file, image_file)

Trajectory Analysis for each tag_id for plotting the negative values and their neighborhood values to understand the regional distribution

In [None]:
def _plot_neighbourhood_panel(ax, coords, neighbours, negative_coords, title):
    """Draw one panel: all points, highlighted neighbours and negative points."""
    ax.scatter(coords[:, 0], coords[:, 1], c='blue', s=0.5, label='All the points')
    plotDataGermanStore1(ax)
    ax.scatter(neighbours[:, 0], neighbours[:, 1], c='red', label='Neighbourhood points')
    ax.scatter(negative_coords[:, 0], negative_coords[:, 1], c='yellow', label='All negative points')
    ax.set_title(title)
    ax.grid()


def trajectory_analysis(data, window_size):
    """Plot, per tag, the points recorded just before and just after negative points.

    A point is "negative" when its x or its y is below zero. For every tag one
    figure with two panels is shown:

    * left  - the up to ``window_size`` points recorded immediately BEFORE each
      negative point,
    * right - the up to ``window_size`` points recorded immediately AFTER each
      negative point.

    The data is sorted by time inside the function, so callers do not need to
    pre-sort it. A point that neighbours several negative points is collected
    once per negative point.

    Args:
        data: DataFrame with ``tag_id``, ``time``, ``x`` and ``y`` columns.
        window_size: Number of neighbouring points taken on each side.
    """
    data = data.sort_values(by="time")

    for tag_id in data['tag_id'].unique():
        tag_data = data[data["tag_id"] == tag_id]
        coords = tag_data[["x", "y"]].to_numpy()
        n_points = len(coords)

        negative_idx = np.flatnonzero((coords[:, 0] < 0) | (coords[:, 1] < 0))
        negative_coords = coords[negative_idx]

        # Rows are in chronological order, so lower positions are earlier in
        # time. Windows are clipped at the start and end of the track.
        prev_idx = np.array(
            [j for i in negative_idx for j in range(max(0, i - window_size), i)],
            dtype=int,
        )
        next_idx = np.array(
            [j for i in negative_idx for j in range(i + 1, min(i + window_size + 1, n_points))],
            dtype=int,
        )

        counts = (n_points, len(negative_coords))
        fig, ax = plt.subplots(1, 2, figsize= (30, 15))
        # points previous to current point becoming negative
        _plot_neighbourhood_panel(
            ax[0], coords, coords[prev_idx], negative_coords,
            F'Previous neighbourhood points {tag_id} total points / negative points {counts}',
        )
        # points after to current point becoming negative
        _plot_neighbourhood_panel(
            ax[1], coords, coords[next_idx], negative_coords,
            F'Next neighbourhood points {tag_id} total points / negative points {counts}',
        )

        plt.show()


In [None]:
%time trajectory_analysis(data, 2)

In [None]:
trajectory_analysis(data, 1)

Average consecutive area time spent in the negative area

What is the distribution of the intervals between the times when the trajectories come to an end point?

The typical areas of end points / start points lie within (x > 12 and y < 2.5) and (x < 2 and 2 < y < 7 or 12 < y < 18).

A velocity column helps us figure out if the cart/basket is standing or moving by setting the threshold (considering the noise) which is an experimental value.

In [None]:
from datetime import timedelta
def find_resting_time(data, tag_id, velocity_threshold, resting_threshold):
    """Return the periods during which one tag was (almost) standing still.

    The velocity of a row is the straight-line distance to the previous row of
    the same tag divided by the elapsed seconds. A resting period starts at the
    first row whose velocity is below ``velocity_threshold`` and ends at the
    last such row before the tag moves again, the calendar date changes, or the
    data ends. Only periods lasting strictly longer than ``resting_threshold``
    seconds are returned; this check is applied to every way a period can
    end, including a date change and the end of the data.

    Args:
        data: DataFrame with ``tag_id``, ``time``, ``x`` and ``y`` columns.
            ``time`` may hold datetimes or "YYYY-MM-DD HH:MM:SS" strings.
        tag_id: Tag whose rows are analysed.
        velocity_threshold: Velocities below this value count as resting.
        resting_threshold: Minimum duration in seconds (exclusive).

    Returns:
        List of ``(start_time, end_time)`` timestamp tuples in chronological
        order.
    """
    # .copy() avoids pandas' SettingWithCopyWarning when columns are assigned.
    tag_data = data[data['tag_id'] == tag_id].copy()
    tag_data['time'] = pd.to_datetime(tag_data['time'], format='%Y-%m-%d %H:%M:%S')
    tag_data = tag_data.sort_values(by='time', ascending=True)

    # Velocity between each row and its predecessor; the first row is NaN and
    # therefore never counts as resting.
    elapsed_seconds = tag_data['time'].diff().dt.total_seconds()
    distance = (tag_data['x'].diff() ** 2 + tag_data['y'].diff() ** 2) ** 0.5
    tag_data['velocity'] = distance / elapsed_seconds

    times = list(tag_data['time'])
    velocities = tag_data['velocity'].tolist()

    resting_periods = []

    def close_period(start, end):
        # Keep the period only if it lasted longer than the minimum duration.
        if (end - start).total_seconds() > resting_threshold:
            resting_periods.append((start, end))

    start_time = None
    previous_time = None
    for current_time, velocity in zip(times, velocities):
        # A change of date ends any resting period that is still open.
        if start_time is not None and current_time.date() != start_time.date():
            close_period(start_time, previous_time)
            start_time = None

        if velocity < velocity_threshold:
            if start_time is None:
                start_time = current_time
        elif start_time is not None:
            # The tag moved again: the previous row was the last resting one.
            close_period(start_time, previous_time)
            start_time = None

        previous_time = current_time

    # A period still open at the end of the data ends with the last row.
    if start_time is not None:
        close_period(start_time, times[-1])
    return resting_periods

In [None]:
print(find_resting_time(data, tag_id, 0.2, 3))

Considering the neighbourhood points of the negative points as potential end points for trajectories.

In [None]:
import pandas as pd

def trajectory_data_generation(data, output_path=None):
    """Split every tag's track into trajectories and write them to a file.

    Rows are processed in chronological order per tag. The first row of a tag
    opens ``trajectory_1``. While a trajectory is open, each row is stored as
    an "intermediate" point until a row with a negative x or y appears; the
    row BEFORE that one is then stored again as the "end" point and the
    trajectory counter is increased. The next row with x > 0 and y > 0 opens
    the following trajectory as its "start" point. Rows seen while no
    trajectory is open are dropped.

    The "end" record is built entirely from the previous row, so its
    description matches the row its coordinates and time come from.

    Args:
        data: DataFrame with ``tag_id``, ``time``, ``x``, ``y`` and
            ``description`` columns.
        output_path: Destination file (";"-separated). Defaults to the
            notebook-level ``path_new``.
    """
    if output_path is None:
        output_path = path_new

    # Sort the dataset by time
    data = data.sort_values(by='time', ascending=True)

    def make_record(tag_id, row, trajectory_number, point_type):
        time, x, y, description = row
        return {'tag_id': tag_id, 'time': time, 'x': x, 'y': y,
                "description": description,
                'trajectory_name': f"trajectory_{trajectory_number}",
                "point_type": point_type}

    records = []
    for tag_id in tqdm(data['tag_id'].unique()):
        tag_data = data[data['tag_id'] == tag_id]
        # Materialise the needed columns once instead of calling iloc per row.
        rows = list(zip(tag_data['time'], tag_data['x'], tag_data['y'],
                        tag_data['description']))

        # The first point always opens the first trajectory.
        trajectory_number = 1
        in_trajectory = True
        records.append(make_record(tag_id, rows[0], trajectory_number, "start"))

        for previous, current in tqdm(zip(rows, rows[1:]), total=len(rows) - 1):
            _, x, y, _ = current
            if in_trajectory:
                if x < 0 or y < 0:
                    # Leaving the valid area: the previous point closes the trajectory.
                    records.append(make_record(tag_id, previous, trajectory_number, "end"))
                    trajectory_number += 1
                    in_trajectory = False
                else:
                    records.append(make_record(tag_id, current, trajectory_number, "intermediate"))
            elif x > 0 and y > 0:
                # First strictly positive point after a gap opens a new trajectory.
                in_trajectory = True
                records.append(make_record(tag_id, current, trajectory_number, "start"))

    # Write the new dataset to a .txt file
    new_data = pd.DataFrame(records)
    new_data.to_csv(output_path, sep=";", index=False)


In [None]:
trajectory_data_generation(data)

In [None]:
import pandas as pd

def trajectory_data_generation_without_resting(data, velocity_threshold=0.5, resting_threshold=3, output_path=None):
    """Split every tag's track into trajectories, cutting at resting periods.

    Rows are processed in chronological order per tag. The first row of a tag
    opens ``trajectory_1``. An open trajectory is closed, with the PREVIOUS row
    stored again as its "end" point, as soon as the current row either lies
    inside a resting period returned by ``find_resting_time`` or has a
    negative x or y. A new trajectory starts at the next row that is outside
    every resting period and has x > 0 and y > 0. Rows seen while no
    trajectory is open are dropped.

    The "end" record is built entirely from the previous row, so its
    description matches the row its coordinates and time come from.

    Args:
        data: DataFrame with ``tag_id``, ``time``, ``x``, ``y`` and
            ``description`` columns.
        velocity_threshold: Passed to ``find_resting_time``; velocities below
            it count as resting.
        resting_threshold: Passed to ``find_resting_time``; minimum resting
            duration in seconds.
        output_path: Destination file (";"-separated). Defaults to the
            notebook-level ``path_new_without_rest``.
    """
    if output_path is None:
        output_path = path_new_without_rest

    # Sort the dataset by time
    data = data.sort_values(by='time')

    def make_record(tag_id, row, trajectory_number, point_type):
        time, x, y, description = row
        return {'tag_id': tag_id, 'time': time, 'x': x, 'y': y,
                "description": description,
                'trajectory_name': f"trajectory_{trajectory_number}",
                "point_type": point_type}

    records = []
    for tag_id in tqdm(data['tag_id'].unique()):
        # Resting periods are chronological (start, end) timestamp pairs.
        resting_periods = find_resting_time(data, tag_id, velocity_threshold, resting_threshold)

        tag_data = data[data['tag_id'] == tag_id]
        rows = list(zip(tag_data['time'], tag_data['x'], tag_data['y'],
                        tag_data['description']))
        # Parse the timestamps once instead of on every comparison.
        timestamps = list(pd.to_datetime(tag_data['time']))

        # The first point always opens the first trajectory.
        trajectory_number = 1
        in_trajectory = True
        records.append(make_record(tag_id, rows[0], trajectory_number, "start"))

        # Index of the first resting period that has not ended yet; rows and
        # periods are both sorted, so it only ever moves forward.
        period_idx = 0
        for i in tqdm(range(1, len(rows))):
            now = timestamps[i]
            while period_idx < len(resting_periods) and now > resting_periods[period_idx][1]:
                period_idx += 1
            resting = (period_idx < len(resting_periods)
                       and resting_periods[period_idx][0] <= now)

            _, x, y, _ = rows[i]
            if in_trajectory:
                if resting or x < 0 or y < 0:
                    # Resting or leaving the valid area: close at the previous point.
                    records.append(make_record(tag_id, rows[i - 1], trajectory_number, "end"))
                    trajectory_number += 1
                    in_trajectory = False
                else:
                    records.append(make_record(tag_id, rows[i], trajectory_number, "intermediate"))
            elif not resting and x > 0 and y > 0:
                in_trajectory = True
                records.append(make_record(tag_id, rows[i], trajectory_number, "start"))

    # Writing the new dataset
    new_data = pd.DataFrame(records)
    new_data.to_csv(output_path, sep=";", index=False)


In [None]:
data = pd.read_csv(path, sep=';')
trajectory_data_generation_without_resting(data)

Finding if there are loops in the trajectories

In [None]:
import pandas as pd
import numpy as np

import os

# Load the trajectory-labelled data once; it is filtered per tag below.
data = pd.read_csv(path_new, sep=';')

# Two points of the same trajectory closer than this distance count as a loop.
threshold = 0.2

# Logs are written next to the other generated files instead of a
# machine-specific absolute path.
os.makedirs(output_dir, exist_ok=True)

# One log file per tag, one line per detected loop.
for tag_id in data["tag_id"].unique():
    tag_rows = data[data["tag_id"] == tag_id]
    with open(os.path.join(output_dir, f"{tag_id}_log.txt"), "w") as log_file:
        log_file.write("trajectory_name, tag_id, start_point, end_point, intermediate_point, total_interval\n")

        for name, group in tqdm(tag_rows.groupby('trajectory_name')):
            x_col, y_col = group['x'], group['y']
            xs, ys = x_col.to_numpy(), y_col.to_numpy()
            x_vals, y_vals = x_col.tolist(), y_col.tolist()
            times = group['time'].tolist()

            for i in range(len(group) - 1):
                # Distances from point i to every later point of the trajectory.
                dists = np.sqrt((xs[i] - xs[i + 1:]) ** 2 + (ys[i] - ys[i + 1:]) ** 2)
                hits = np.flatnonzero(dists < threshold) + i + 1
                if hits.size == 0:
                    continue

                for j in hits:
                    start_point = f"({x_vals[i]}, {y_vals[i]})"
                    end_point = f"({x_vals[j]}, {y_vals[j]})"
                    # Mean position of the points from i up to (excluding) j.
                    intermediate_point = f"({x_col.iloc[i:j].mean()}, {y_col.iloc[i:j].mean()})"
                    total_interval = (pd.to_datetime(times[j], format='%Y-%m-%d %H:%M:%S') - pd.to_datetime(times[i], format='%Y-%m-%d %H:%M:%S')).total_seconds()
                    log_file.write(f"{name}, {tag_id}, {start_point}, {end_point}, {intermediate_point}, {total_interval}\n")
                # Only the first point that closes a loop is reported per trajectory.
                break



Plotting the animations for the obtained trajectories

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import animation
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython import display

def create_gif(tag_id, trajectory_name, path):
    """Animate one trajectory and save it as ``{tag_id}_{trajectory_name}.gif``.

    Each frame extends the grey path by one point and moves the black dot to
    the newest point. The GIF is written to the current working directory.

    Args:
        tag_id: Value matched against the ``tag_id`` column.
        trajectory_name: Value matched against the ``trajectory_name`` column.
        path: ";"-separated file holding ``tag_id``, ``trajectory_name``,
            ``x`` and ``y`` columns.

    Raises:
        ValueError: If the file has no rows for this tag and trajectory.
    """
    data = pd.read_csv(path, sep=';')
    selection = data[(data['tag_id'] == tag_id) & (data['trajectory_name'] == trajectory_name)]
    if selection.empty:
        raise ValueError(f"no points found for tag {tag_id!r} and trajectory {trajectory_name!r} in {path!r}")

    x = selection['x'].tolist()
    y = selection['y'].tolist()

    fig, ax = plt.subplots(figsize=(6, 6))
    line, = ax.plot(x[0:1], y[0:1], color = 'grey' )
    dot, = ax.plot(x[0:1], y[0:1], color = 'black', marker = 'o' )

    ax.set_xlim([0,45])
    ax.set_xlabel('x', fontsize = 14)
    ax.set_ylim([0,45])
    ax.set_ylabel('y', fontsize = 14)
    ax.set_title(f'Relationship between x and y at step 0', fontsize=14)

    def update_frame(t):
        line.set_data(x[0:t+1], y[0:t+1])
        # set_data needs sequences; one-element slices keep the dot a single point.
        dot.set_data(x[t:t+1], y[t:t+1])
        ax.set_title(f'Relationship between x and y at step {t}', fontsize=14)
        return line, dot

    frame_indices = np.arange(len(x))
    anim = animation.FuncAnimation(fig, update_frame, frames=frame_indices, interval=500)
    anim.save(f'{tag_id}_{trajectory_name}.gif', writer='pillow')
    plt.show()


In [None]:
tag_id = "0x002060002016"
trajectory_name = "trajectory_1"
create_gif(tag_id, trajectory_name, path_new)

In [None]:
with open(f'{tag_id}_{trajectory_name}.gif','rb') as f:
    gif_bytes = f.read()
# An expression nested inside a ``with`` block is not echoed by the notebook,
# so the image is displayed explicitly, and labelled as the GIF it is.
display.display(display.Image(data=gif_bytes, format='gif'))

In [None]:
tag_id = "0x002060002016"
trajectory_name = "trajectory_1"
create_gif(tag_id, trajectory_name, path_new_without_rest)

In [None]:
with open(f'{tag_id}_{trajectory_name}.gif','rb') as f:
    gif_bytes = f.read()
# An expression nested inside a ``with`` block is not echoed by the notebook,
# so the image is displayed explicitly, and labelled as the GIF it is.
display.display(display.Image(data=gif_bytes, format='gif'))

Conclusion: The trajectories that exclude the resting time are shorter than the trajectories picked from the neighbourhood of the negative points, although the threshold values in "trajectory_data_generation_without_resting" can be changed to make the trajectories longer.