In [None]:
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# later cells can read from and write to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the dataset archive from the mounted Drive into the working directory.
!cp /content/drive/MyDrive/stx17.tar.gz stx17.tar.gz
# Create the extraction target directory (-p: no error if it already exists).
!mkdir -p stx
# Unpack the gzip-compressed tarball into stx/.
!tar -xzf stx17.tar.gz -C stx/

In [None]:
# Find all npz files in the extracted stx folder.
# glob.glob returns paths in arbitrary (filesystem-dependent) order, so the
# list is sorted to make chunk membership and the catalogue row order
# reproducible from one run to the next.
import glob
trajectory_files = sorted(glob.glob('stx/stx/*.npz'))
print('Found %d trajectory files' % len(trajectory_files))

Found 3696 trajectory files


In [None]:
from tqdm import tqdm

In [None]:
def split_into_chunks(array, n=4):
    """Split a sliceable sequence into ``n`` contiguous chunks without losing items.

    The first ``len(array) % n`` chunks receive one extra element, so every
    element of ``array`` ends up in exactly one chunk and chunk sizes differ
    by at most one. When ``len(array)`` is divisible by ``n`` all chunks have
    the same size.

    Args:
        array: Any sequence supporting ``len()`` and slicing (list, ndarray, ...).
        n: Number of chunks to produce.

    Returns:
        A list of ``n`` slices of ``array`` in original order; some may be
        empty when ``len(array) < n``.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    base_size, remainder = divmod(len(array), n)
    chunks = []
    start = 0
    for chunk_index in range(n):
        # Spread the leftover items over the leading chunks instead of
        # dropping them, which plain floor division would do.
        stop = start + base_size + (1 if chunk_index < remainder else 0)
        chunks.append(array[start:stop])
        start = stop
    return chunks

In [None]:
import numpy as np

def fill_nan_with_avg(arr_ds):
    """Fill NaN entries along the last axis from their nearest valid neighbours.

    For every NaN sample the nearest originally-non-NaN value to its left and
    to its right (along the last axis) are looked up:

    * both exist  -> the NaN is replaced by their arithmetic mean;
    * only one    -> the NaN is replaced by that value;
    * neither     -> the entry is left as NaN (the whole series is NaN).

    Neighbours are always taken from the data as it was on entry, so values
    written by this function never feed into other fills.

    Args:
        arr_ds: Floating-point ndarray; gaps are filled along its last axis.
            The array is modified in place.

    Returns:
        The same ``arr_ds`` object, after filling.
    """
    nan_mask = np.isnan(arr_ds)
    if not nan_mask.any():
        # Nothing to fill (this also covers empty arrays).
        return arr_ds

    length = arr_ds.shape[-1]
    positions = np.arange(length)

    # Index of the closest valid sample at or before each position; -1 when
    # there is none. A running maximum carries the last valid index forward.
    left_idx = np.maximum.accumulate(np.where(nan_mask, -1, positions), axis=-1)
    # Mirror image: closest valid sample at or after each position; `length`
    # when there is none. The running minimum is taken over the reversed axis.
    right_idx = np.minimum.accumulate(
        np.where(nan_mask, length, positions)[..., ::-1], axis=-1
    )[..., ::-1]

    has_left = left_idx >= 0
    has_right = right_idx < length

    # Gather neighbour values up front (these are copies), so the in-place
    # writes below cannot influence each other. Out-of-range sentinels are
    # clamped only to keep the gather valid; they are masked out afterwards.
    left_val = np.take_along_axis(arr_ds, np.maximum(left_idx, 0), axis=-1)
    right_val = np.take_along_axis(arr_ds, np.minimum(right_idx, length - 1), axis=-1)

    both = nan_mask & has_left & has_right
    arr_ds[both] = (left_val[both] + right_val[both]) / 2

    only_left = nan_mask & has_left & ~has_right
    arr_ds[only_left] = left_val[only_left]

    only_right = nan_mask & has_right & ~has_left
    arr_ds[only_right] = right_val[only_right]

    return arr_ds


In [None]:
# Load every trajectory file, grouped into N_CHUNKS chunks, and record in
# mds_catalogue.csv how many trajectories (rows) each file contributed, in
# the same order in which they are stacked.
N_CHUNKS = 20
trajectory_chunks = split_into_chunks(trajectory_files, N_CHUNKS)
# Zero-row placeholders: they fix the expected per-trajectory shape (2, 2000)
# and guarantee that an empty chunk still yields a well-formed array.
trajectory = [np.empty((0, 2, 2000)) for _ in range(N_CHUNKS)]

with open('mds_catalogue.csv', 'w') as catalogue_file:
    for chunk_id, chunk in enumerate(trajectory_chunks):
        print('Chunk ', chunk_id)
        # Collect the per-file arrays and stack them once at the end;
        # stacking inside the loop would re-copy all accumulated data on
        # every iteration.
        chunk_arrays = [trajectory[chunk_id]]
        for file_name in tqdm(chunk):
            # NOTE(security): allow_pickle=True can execute arbitrary code
            # when loading untrusted files; keep it only while the archive
            # source is trusted.
            # The context manager closes the underlying .npz file handle.
            with np.load(file_name, allow_pickle=True) as archive:
                file_trajectories = archive['arr_0']
            catalogue_file.write(file_name + ',' + str(file_trajectories.shape[0]) + '\n')
            chunk_arrays.append(file_trajectories)
        print('Casting...')
        stacked = np.vstack(chunk_arrays).astype(np.float32)
        trajectory[chunk_id] = fill_nan_with_avg(stacked)

Chunk  0


100%|██████████| 184/184 [00:06<00:00, 26.83it/s]


Casting...
Chunk  1


100%|██████████| 184/184 [00:04<00:00, 40.34it/s]


Casting...
Chunk  2


100%|██████████| 184/184 [00:04<00:00, 42.73it/s]


Casting...
Chunk  3


100%|██████████| 184/184 [00:04<00:00, 43.43it/s]


Casting...
Chunk  4


100%|██████████| 184/184 [00:03<00:00, 46.84it/s]


Casting...
Chunk  5


100%|██████████| 184/184 [00:04<00:00, 39.31it/s]


Casting...
Chunk  6


100%|██████████| 184/184 [00:04<00:00, 38.96it/s]


Casting...
Chunk  7


100%|██████████| 184/184 [00:04<00:00, 38.22it/s]


Casting...
Chunk  8


100%|██████████| 184/184 [00:04<00:00, 40.49it/s]


Casting...
Chunk  9


100%|██████████| 184/184 [00:05<00:00, 35.82it/s]


Casting...
Chunk  10


100%|██████████| 184/184 [00:05<00:00, 35.51it/s]


Casting...
Chunk  11


100%|██████████| 184/184 [00:04<00:00, 41.94it/s]


Casting...
Chunk  12


100%|██████████| 184/184 [00:05<00:00, 32.16it/s]


Casting...
Chunk  13


100%|██████████| 184/184 [00:05<00:00, 31.91it/s]


Casting...
Chunk  14


100%|██████████| 184/184 [00:04<00:00, 37.83it/s]


Casting...
Chunk  15


100%|██████████| 184/184 [00:05<00:00, 36.69it/s]


Casting...
Chunk  16


100%|██████████| 184/184 [00:05<00:00, 35.20it/s]


Casting...
Chunk  17


100%|██████████| 184/184 [00:04<00:00, 42.96it/s]


Casting...
Chunk  18


100%|██████████| 184/184 [00:04<00:00, 40.84it/s]


Casting...
Chunk  19


100%|██████████| 184/184 [00:05<00:00, 35.02it/s]


Casting...


In [None]:
# Total number of trajectories: add up the row counts of the per-chunk arrays.
N_trajectories = np.array([chunk.shape[0] for chunk in trajectory]).sum()
print('There are {} trajectories'.format(N_trajectories))

There are 19113 trajectories


In [None]:
# Join the per-chunk arrays along the first axis into one array, preserving
# chunk order (and therefore the row order recorded in mds_catalogue.csv).
big_trajectory = np.concatenate(trajectory)

In [None]:
# Sanity check: display the shape of the combined array.
big_trajectory.shape

(19113, 2, 2000)

In [None]:
# Compute the pairwise Euclidean distance matrix. Each trajectory is first
# flattened into a single row vector so that one row corresponds to one sample.
dist = euclidean_distances(big_trajectory.reshape(len(big_trajectory), -1))

In [None]:
# Compute a 2-D MDS embedding from the precomputed distance matrix.
# MDS starts from a random initial configuration, so random_state is fixed to
# make the saved embedding reproducible across runs.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
pos = mds.fit_transform(dist)



In [None]:
# Save the embedding to a compressed archive 'mds.npz' (NumPy appends the
# extension). Passed positionally, the array is stored under the key 'arr_0'.
np.savez_compressed('mds', pos)

In [None]:
# Persist the outputs to Google Drive: the file/row-count catalogue and the
# embedding archive (stored on Drive under the name mds_pos.npz).
!cp mds_catalogue.csv /content/drive/MyDrive/mds_catalogue.csv
!cp mds.npz /content/drive/MyDrive/mds_pos.npz