# Generating a training set based on extracted ant data

Load pickled data files for ant positions, trajectories, distances, etc.

At this moment, I'm uncertain what amount of time each sequence should be set to -- should it be fixed across all sequences in the training set? Or should these be different? It will help the network to learn how to generate dynamics but I need to read up on this part.


In [82]:
# Imports
import numpy as np
import lzma, pickle
import os

from collections import namedtuple

# Plotting/output
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_theme()

sns.set_style(style='white')

# ---------------------------

# Progress bar
from IPython.display import HTML, display
import time

def ProgressIter(iter_fun, iter_len=0):
  if not iter_len:
    iter_len = len(iter_fun)
  out = display(progress(0, iter_len), display_id=True)
  for i, it in enumerate(iter_fun):
    yield it
    out.update(progress(i + 1, iter_len))

def progress(value, max):
    return HTML("""
        <progress
            value='{value}'
            max='{max}',
            style='width: 45%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

## Converting the ant data format



In [83]:
# Read in the pickled data produced from the generate_data.ipynb.
import lzma, pickle

source_dir = '../../data/2023_2/'
input_file = 'KA050_10cm_5h_20230614_1h-2h.pkl'
with open(os.path.join(source_dir, input_file), 'rb') as file:
    ant_data = pd.DataFrame(pickle.load(file))

print(ant_data.head())

output_filename = '.'.join(input_file.split('.')[0:-1]) + '.pkl.xz'
print(output_filename)
if not os.path.exists(os.path.join(source_dir, output_filename)):
    ant_data.to_pickle(os.path.join(source_dir, output_filename), compression='xz')
    

# 216000 entries, 60 fps * 60 seconds * 60 minutes = 1 hour.
# for i in range(0, len(ant_data.T), 2):
#     for j, k in enumerate(zip(ant_data[i].x, ant_data[i].y)):
#         print(i, j, k)
#     break

      0             1             2             3              4          ...  \
       x      y      x      y      x      y      x       y      x      y  ...   
0  272.0  332.0  490.0  835.0  472.0  867.0  447.0  1053.0  428.0  240.0  ...   
1  272.0  332.0  491.0  836.0  471.0  867.0  447.0  1053.0  428.0  240.0  ...   
2  271.0  332.0  491.0  837.0  471.0  867.0  448.0  1052.0  428.0  240.0  ...   
3  271.0  331.0  491.0  837.0  470.0  867.0  447.0  1052.0  428.0  240.0  ...   
4  271.0  331.0  492.0  837.0  470.0  867.0  447.0  1052.0  428.0  240.0  ...   

      49          50      51      52      53      
       x       y   x   y   x   y   x   y   x   y  
0  632.0  1216.0 NaN NaN NaN NaN NaN NaN NaN NaN  
1  631.0  1216.0 NaN NaN NaN NaN NaN NaN NaN NaN  
2  632.0  1216.0 NaN NaN NaN NaN NaN NaN NaN NaN  
3  630.0  1216.0 NaN NaN NaN NaN NaN NaN NaN NaN  
4  630.0  1216.0 NaN NaN NaN NaN NaN NaN NaN NaN  

[5 rows x 108 columns]
KA050_10cm_5h_20230614_1h-2h.pkl.xz


## Importing the ant data

In [84]:
import lzma, pickle

source_dir = '../../data/2023_2/'
input_files = []

for file in os.listdir(source_dir):
    if file.endswith('.pkl.xz'):
        input_files.append(file)

print(input_files)
ant_data = []

for input_file in input_files:
    with lzma.open(os.path.join(source_dir, input_file)) as file:
        ant_data.append(pd.read_pickle(file))

ant_data = pd.concat(ant_data, ignore_index=True)


['KA050_10cm_5h_20230614_1h-2h.pkl.xz', 'KA050_10cm_5h_20230614_4h-5h.pkl.xz', 'KA050_10cm_5h_20230614_3h-4h.pkl.xz', 'KA050_10cm_5h_20230614_2h-3h.pkl.xz']


In [1]:
def euclidean_distances(data):
    a = np.array(data)
    b = a.reshape(a.shape[0], 1, a.shape[1])
    distances = np.sqrt(np.einsum('ijk, ijk->ij', a-b, a-b))
    np.fill_diagonal(distances, np.NaN)

    return distances

# print(ant_data.T[0])
# print(len(ant_data.T[0].values))
# print(ant_data[[col for col in ant_data.columns if 'x' in col]].values[0])
# print(ant_data[[col for col in ant_data.columns if 'x' in col]][0:100].values[0])

distance_data = []
for i in range(0, len(ant_data), 10000):
    distance_data.append(euclidean_distances(
        list(zip(
            ant_data[[col for col in ant_data.columns if 'x' in col]].values[i],
            ant_data[[col for col in ant_data.columns if 'y' in col]].values[i]
        ))
    ))
print(np.array(distance_data[0]))

print(distance_data.shape())

NameError: name 'ant_data' is not defined

In [86]:
# Finding columns with at least one NaN value in either 'x' or 'y' subcolumns
columns_with_nan = [col for col in ant_data.columns.levels[0] if ant_data[col].isna().any().any()]

print(len(ant_data.columns)/2)
print(len(columns_with_nan))

57.0
50


## Find the (x, y) coordinate bounds for the ants within the video space

Find where the ants are able to explore within pixel space, finding
(x-min, y-min, x-max, y-max)

In [87]:
ant_data.head()

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,...,52,52,53,53,54,54,55,55,56,56
Unnamed: 0_level_1,x,y,x,y,x,y,x,y,x,y,...,x,y,x,y,x,y,x,y,x,y
0,272.0,332.0,490.0,835.0,472.0,867.0,447.0,1053.0,428.0,240.0,...,,,,,,,,,,
1,272.0,332.0,491.0,836.0,471.0,867.0,447.0,1053.0,428.0,240.0,...,,,,,,,,,,
2,271.0,332.0,491.0,837.0,471.0,867.0,448.0,1052.0,428.0,240.0,...,,,,,,,,,,
3,271.0,331.0,491.0,837.0,470.0,867.0,447.0,1052.0,428.0,240.0,...,,,,,,,,,,
4,271.0,331.0,492.0,837.0,470.0,867.0,447.0,1052.0,428.0,240.0,...,,,,,,,,,,


In [123]:
# Concatenating all x and y values into separate Series
all_x_values = ant_data[[col for col in ant_data.columns if 'x' in col]]
all_y_values = ant_data[[col for col in ant_data.columns if 'y' in col]]
# Calculating the minimum and maximum for x and y values efficiently
min_x = all_x_values.min(axis=None)
max_x = all_x_values.max(axis=None)
min_y = all_y_values.min(axis=None)
max_y = all_y_values.max(axis=None)

min_x, min_y, max_x, max_y

(42.0, 54.0, 1243.0, 1229.0)

Find the circular area contained within the bounding box given by (min(x,y), max(x, y))

In [89]:
def calculate_circle(min_x, min_y, max_x, max_y):
    """
    Calculate the circle that fits perfectly within a bounding box.

    Parameters:
    min_x (float): The minimum x value of the bounding box.
    max_x (float): The maximum x value of the bounding box.
    min_y (float): The minimum y value of the bounding box.
    max_y (float): The maximum y value of the bounding box.

    Returns:
    tuple: A tuple containing the center coordinates (x, y) and the radius of the circle.
    """
    # Calculate the center of the bounding box
    x_centre = (min_x + max_x) / 2
    y_centre = (min_y + max_y) / 2

    # Calculate the radius of the circle
    radius = min(max_x - min_x, max_y - min_y) / 2

    return ((x_centre, y_centre), radius)

centre, radius = calculate_circle(min_x, min_y, max_x, max_y)
print(centre, radius)

(642.5, 641.5) 587.5


In [90]:
def circle_transformation(circle_a, circle_b):
    """
    Calculate the transformation from one circle to another.

    Parameters:
    circle_a (tuple): A tuple (x_a, y_a, r_a) representing Circle A's center and radius.
    circle_b (tuple): A tuple (x_b, y_b, r_b) representing Circle B's center and radius.

    Returns:
    tuple: A tuple containing the translation vector (dx, dy) and the scaling factor.
    """
    (x_a, y_a), r_a = circle_a
    (x_b, y_b), r_b = circle_b

    # Translation vector
    dx = x_b - x_a
    dy = y_b - y_a

    # Scaling factor
    scale = r_b / r_a

    return (dx, dy), scale

circle_a = ((642.5, 641.5), 587.5)
circle_b = ((450.0, 450.0), 432.0)

translation, scaling = circle_transformation(circle_a, circle_b)
print(f"Translation: {translation}, Scaling: {scaling}")

circle_c = ((circle_a[0][0] + translation[0], circle_a[0][1] + translation[1]), circle_a[1] * scaling)
print(circle_c)


Translation: (-192.5, -191.5), Scaling: 0.7353191489361702
((450.0, 450.0), 432.0)


## Create the data set

What is the best way to do this?
- Chunk it up ahead of time?
- What is the suitable length of time for each trajectory?
- Should we categorise/distribute the trajectories based on motion?
  - Total distance from start
  - Total distance travelled
  - Time spent moving


In [91]:
print(ant_data[2])
print(len(ant_data.T)/2)

            x      y
0       472.0  867.0
1       471.0  867.0
2       471.0  867.0
3       470.0  867.0
4       470.0  867.0
...       ...    ...
863998    NaN    NaN
863999    NaN    NaN
864000    NaN    NaN
864001    NaN    NaN
864002    NaN    NaN

[864003 rows x 2 columns]
57.0


In [92]:
num_trails = 10_000
trail_length = 60 * 60
s = np.zeros((num_trails, trail_length, 2), dtype=float)

for i in range(num_trails):
    start = np.random.randint(0, len(ant_data[0]) - trail_length)
    ant_index = np.random.randint(0, len(ant_data.columns.levels[0]))
    not_null = False
    while not not_null:
        ant_index = np.random.randint(0, len(ant_data.columns.levels[0]))
        if np.isnan(np.array(ant_data[ant_index][start:start + trail_length])).any():
            continue
        else:
            s[i][0:trail_length] = ant_data[ant_index][start:start + trail_length]
            not_null = True
