In [1]:
import os
import tensorflow as tf
import numpy as np

import constants
from datasets import DependencyDataset
from network import DistanceProbe, DepthProbe

In [129]:
def orthogonality_check(matrix):
    """Measure how far a 2-D matrix is from being orthogonal.

    The score is the Frobenius norm of ``M @ M.T - I``. When the matrix is
    not square, the Frobenius norm of ``M.T @ M - I`` is computed as well
    and the two errors are averaged.

    Args:
        matrix: 2-D NumPy array.

    Returns:
        Non-negative scalar; 0 for an exactly orthogonal square matrix.

    Raises:
        AssertionError: if ``matrix`` is not 2-dimensional.
    """
    assert len(matrix.shape) == 2, "Only 2D matrices suported"
    n_rows, n_cols = matrix.shape
    transposed = matrix.transpose()

    # Deviation of the row Gram matrix from the identity.
    row_gram_error = np.linalg.norm(matrix @ transposed - np.eye(n_rows), ord='fro')
    if n_rows == n_cols:
        return row_gram_error

    # Rectangular case: also score the column Gram matrix and average both.
    col_gram_error = np.linalg.norm(transposed @ matrix - np.eye(n_cols), ord='fro')
    return (row_gram_error + col_gram_error) / 2.


def matrix_cos_distance(matrixA, matrixB):
    """Frobenius norm of ``A - B`` divided by the product of both Frobenius norms.

    Note: despite the name, this is not ``1 - cos(A, B)``; it is the norm of
    the difference scaled by ``||A||_F * ||B||_F``.

    Args:
        matrixA: 2-D NumPy array.
        matrixB: 2-D NumPy array that can be subtracted from ``matrixA``.

    Returns:
        Non-negative scalar; 0 when the matrices are identical.

    Raises:
        AssertionError: if either matrix has a norm not greater than 1e-4.
    """
    EPSILON = 1e-4
    # Guard against dividing by a (near-)zero norm.
    for operand in (matrixA, matrixB):
        assert np.linalg.norm(operand) > EPSILON

    difference_norm = np.linalg.norm(matrixA - matrixB, ord='fro')
    scale = np.linalg.norm(matrixA, ord='fro') * np.linalg.norm(matrixB, ord='fro')
    return difference_norm / scale

def matrix_max_distance(matrixA, matrixB):
    """Return the largest absolute element-wise difference between two arrays."""
    gap = matrixA - matrixB
    return np.max(np.absolute(gap))

def matrix_smpl_distance(matrixA, matrixB, num_samples=100, seed=None):
    """Estimate the mean displacement between two linear maps on random unit vectors.

    Rows of a standard-normal sample are normalised to unit length, pushed
    through both matrices (row-vector convention, ``sample @ matrix``), and
    the Euclidean norms of the differences are averaged.

    Args:
        matrixA: 2-D NumPy array of shape ``(d, k)``.
        matrixB: 2-D NumPy array with the same leading dimension ``d``.
        num_samples: number of random unit vectors to draw (default 100,
            the value that used to be hard-coded).
        seed: optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the estimate is
            reproducible; when ``None`` the global NumPy random state is
            used, exactly as before.

    Returns:
        Mean Euclidean distance between the two images of the sample.

    Raises:
        ValueError: if ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be a positive integer")

    # Fall back to the global generator so unseeded calls behave as they always did.
    rng = np.random if seed is None else np.random.RandomState(seed)

    sample = rng.randn(num_samples, matrixA.shape[0])
    # Project every row onto the unit sphere.
    sample = sample / np.linalg.norm(sample, axis=-1, keepdims=True)

    A_smpl = sample @ matrixA
    B_smpl = sample @ matrixB
    return np.mean(np.linalg.norm(A_smpl - B_smpl, axis=-1))
    

# DEPTH MATRICES

In [4]:
model_dim = 768
probe_rank = 768

languages = ['en', 'fr', 'de']
out_dir = 'experiments/train_en_de_fr/task_depth-layer_6-trainl_en_de_fr/'

# Placeholder variables with the shapes the checkpoint is expected to hold;
# their initial values are meant to be replaced by the restore below.
# NOTE(review): these two assignments rebind the `DistanceProbe` / `DepthProbe`
# names imported from `network` in the first cell - confirm that is intended.
DistanceProbe = tf.Variable(
    tf.random_uniform_initializer(minval=-0.05, maxval=0.05, seed=42)((probe_rank, model_dim)),
    trainable=True, name='distance_probe', dtype=tf.float32)

DepthProbe = tf.Variable(
    tf.random_uniform_initializer(minval=-0.05, maxval=0.05, seed=42)((probe_rank, model_dim)),
    trainable=True, name='depth_probe', dtype=tf.float32)

optimizer = tf.optimizers.Adam()

# One (model_dim x model_dim) map per language, initialised to the identity.
LanguageMaps = {
    lang: tf.Variable(tf.initializers.Identity(gain=1.0)((model_dim, model_dim)),
                      trainable=False, name='{}_map'.format(lang))
    for lang in languages
}

# The keyword names below (optimizer, depth_probe, distance_probe, en, fr, de)
# become attributes of `ckpt` that later cells read.
ckpt = tf.train.Checkpoint(optimizer=optimizer, depth_probe=DepthProbe,
                           distance_probe=DistanceProbe, **LanguageMaps)

checkpoint_manger = tf.train.CheckpointManager(ckpt, os.path.join(out_dir, 'params'), max_to_keep=1)

# restore_or_initialize() returns None when no checkpoint is found. Fail loudly
# in that case: otherwise every later cell would silently analyse identity maps
# and randomly initialised probes.
restored_path = checkpoint_manger.restore_or_initialize()
if restored_path is None:
    raise FileNotFoundError(
        'No checkpoint found under {}'.format(os.path.join(out_dir, 'params')))
restored_path

'experiments/train_en_de_fr/task_depth-layer_6-trainl_en_de_fr/params/ckpt-12'

In [5]:
# Snapshot the checkpoint-tracked depth-task variables as plain NumPy arrays.
depth_probe = ckpt.depth_probe.numpy()
en_map_depth, fr_map_depth, de_map_depth = (
    getattr(ckpt, lang_code).numpy() for lang_code in ('en', 'fr', 'de')
)

## Conduct SVD and compare the non-orthogonal parts of the matrices

In [6]:
orthogonality_check(en_map_depth)

15.058353652093482

In [7]:
orthogonality_check(de_map_depth)

15.259980829491433

In [8]:
orthogonality_check(fr_map_depth)

15.843890662679716

The depth probe matrix and the language map matrices are not orthogonal (note: only the language maps are checked in the cells above).

In [145]:
# Singular value decomposition of each language map and of the depth probe.
# np.linalg.svd returns (U, S, Vh): the third output is V^H, so every `*_v`
# below holds the right singular vectors as rows.
(en_u, en_s, en_v), (de_u, de_s, de_v), (fr_u, fr_s, fr_v), (depth_u, depth_s, depth_v) = map(
    np.linalg.svd, (en_map_depth, de_map_depth, fr_map_depth, depth_probe)
)

# en_sv = np.diag(en_s) @ en_v
# de_sv = np.diag(de_s) @ de_v
# fr_sv = np.diag(fr_s) @ fr_v

In [103]:
np.linalg.norm(en_s)

26.680162

In [142]:
np.mean(en_s)

0.93292904

In [139]:
np.linalg.norm(depth_s)

2.699213

Most of the scaling is done in the language-specific matrices, which hinders the multilingual aspect of the probe.
On the other hand, the depth probe is responsible for downscaling; the mean singular value of the language-specific matrix is close to one. I think this is mostly due to initialization.

In [146]:
np.linalg.norm(en_s - de_s)

0.7724513

In [147]:
np.linalg.norm(en_s - fr_s)

0.39260736

In [148]:
np.linalg.norm(de_s - fr_s)

0.6916721

Takeaway: Language-specific matrices have similar singular values

In [107]:
# Mean ratio of the best right-singular-vector match (per en vector, over fr
# vectors) to the best left-singular-vector match (per fr vector, over en vectors).
# NOTE(review): the numerator reduces over axis -1 while the denominator reduces
# over axis 0, and no absolute value is taken although singular vectors are only
# defined up to sign - confirm both are intended.
np.mean(
    (en_v @ fr_v.T).max(axis=-1)
    / (en_u.T @ fr_u).max(axis=0)
)

1.0238862

In [108]:
# Mean ratio of the best right-singular-vector match (per en vector, over de
# vectors) to the best left-singular-vector match (per de vector, over en vectors).
# NOTE(review): the numerator reduces over axis -1 while the denominator reduces
# over axis 0, and no absolute value is taken although singular vectors are only
# defined up to sign - confirm both are intended.
np.mean(
    (en_v @ de_v.T).max(axis=-1)
    / (en_u.T @ de_u).max(axis=0)
)

1.0285408

In [109]:
# Mean ratio of the best right-singular-vector match (per fr vector, over de
# vectors) to the best left-singular-vector match (per de vector, over fr vectors).
# NOTE(review): the numerator reduces over axis -1 while the denominator reduces
# over axis 0, and no absolute value is taken although singular vectors are only
# defined up to sign - confirm both are intended.
np.mean(
    (fr_v @ de_v.T).max(axis=-1)
    / (fr_u.T @ de_u).max(axis=0)
)

1.0191784

?Takeaway: right multiplication by U makes the vector space less language dependent (where M = U @ diag(S) @ V is the SVD of M)

# DISTANCE MATRICES

In [95]:
model_dim = 768
probe_rank = 768

languages = ['en', 'fr', 'de']
out_dir = 'experiments/train_en_de_fr/task_distance-layer_6-trainl_en_de_fr/'

# Placeholder variables with the shapes the checkpoint is expected to hold;
# their initial values are meant to be replaced by the restore below.
# NOTE(review): these two assignments rebind the `DistanceProbe` / `DepthProbe`
# names imported from `network` in the first cell - confirm that is intended.
DistanceProbe = tf.Variable(
    tf.random_uniform_initializer(minval=-0.05, maxval=0.05, seed=42)((probe_rank, model_dim)),
    trainable=True, name='distance_probe', dtype=tf.float32)

DepthProbe = tf.Variable(
    tf.random_uniform_initializer(minval=-0.05, maxval=0.05, seed=42)((probe_rank, model_dim)),
    trainable=True, name='depth_probe', dtype=tf.float32)

optimizer = tf.optimizers.Adam()

# One (model_dim x model_dim) map per language, initialised to the identity.
LanguageMaps = {
    lang: tf.Variable(tf.initializers.Identity(gain=1.0)((model_dim, model_dim)),
                      trainable=False, name='{}_map'.format(lang))
    for lang in languages
}

# The keyword names below (optimizer, depth_probe, distance_probe, en, fr, de)
# become attributes of `ckpt` that later cells read. This rebinds the `ckpt`
# created for the depth task, so depth arrays must be extracted before this cell runs.
ckpt = tf.train.Checkpoint(optimizer=optimizer, depth_probe=DepthProbe,
                           distance_probe=DistanceProbe, **LanguageMaps)

checkpoint_manger = tf.train.CheckpointManager(ckpt, os.path.join(out_dir, 'params'), max_to_keep=1)

# restore_or_initialize() returns None when no checkpoint is found. Fail loudly
# in that case: otherwise every later cell would silently analyse identity maps
# and randomly initialised probes.
restored_path = checkpoint_manger.restore_or_initialize()
if restored_path is None:
    raise FileNotFoundError(
        'No checkpoint found under {}'.format(os.path.join(out_dir, 'params')))
restored_path

'experiments/train_en_de_fr/task_distance-layer_6-trainl_en_de_fr/params/ckpt-15'

In [110]:
# Snapshot the checkpoint-tracked distance-task variables as plain NumPy arrays.
distance_probe = ckpt.distance_probe.numpy()
en_map_dist, fr_map_dist, de_map_dist = (
    getattr(ckpt, lang_code).numpy() for lang_code in ('en', 'fr', 'de')
)

In [111]:
# Singular value decomposition of each language map and of the distance probe.
# np.linalg.svd returns (U, S, Vh): the third output is V^H, so every `*_v`
# below holds the right singular vectors as rows.
# NOTE: the en_*/de_*/fr_* names are the same ones the depth section assigns,
# so running this cell replaces the depth decompositions.
(en_u, en_s, en_v), (de_u, de_s, de_v), (fr_u, fr_s, fr_v), (dist_u, dist_s, dist_v) = map(
    np.linalg.svd, (en_map_dist, de_map_dist, fr_map_dist, distance_probe)
)

In [112]:
orthogonality_check(en_map_dist)

18.396372620066515

In [113]:
np.linalg.norm(en_s)

27.887388

In [138]:
np.linalg.norm(dist_s)

2.6431499

In [136]:
np.linalg.norm(dist_s)

2.6431499

In [120]:
np.linalg.norm(en_s - de_s)

0.5069455

In [119]:
np.linalg.norm(en_s - fr_s)

0.21521693

In [118]:
np.linalg.norm(de_s - fr_s)

0.5700357

Observation: the singular values of the maps across languages are closer in distance probing than in depth probing

In [121]:
# Mean ratio of the best right-singular-vector match (per en vector, over fr
# vectors) to the best left-singular-vector match (per fr vector, over en vectors).
# NOTE(review): the numerator reduces over axis -1 while the denominator reduces
# over axis 0, and no absolute value is taken although singular vectors are only
# defined up to sign - confirm both are intended.
np.mean(
    (en_v @ fr_v.T).max(axis=-1)
    / (en_u.T @ fr_u).max(axis=0)
)

1.0151405

In [122]:
# Mean ratio of the best right-singular-vector match (per en vector, over de
# vectors) to the best left-singular-vector match (per de vector, over en vectors).
# NOTE(review): the numerator reduces over axis -1 while the denominator reduces
# over axis 0, and no absolute value is taken although singular vectors are only
# defined up to sign - confirm both are intended.
np.mean(
    (en_v @ de_v.T).max(axis=-1)
    / (en_u.T @ de_u).max(axis=0)
)

1.0247933

In [123]:
# Mean ratio of the best right-singular-vector match (per fr vector, over de
# vectors) to the best left-singular-vector match (per de vector, over fr vectors).
# NOTE(review): the numerator reduces over axis -1 while the denominator reduces
# over axis 0, and no absolute value is taken although singular vectors are only
# defined up to sign - confirm both are intended.
np.mean(
    (fr_v @ de_v.T).max(axis=-1)
    / (fr_u.T @ de_u).max(axis=0)
)

1.017285

# Comparison of language-specific matrices in Depth and Distance probes

In [124]:
matrix_cos_distance(en_map_depth, en_map_dist)

0.012554613

In [125]:
matrix_cos_distance(fr_map_depth, fr_map_dist)

0.012928756

In [126]:
matrix_cos_distance(de_map_depth, de_map_dist)

0.012441403

In [127]:
matrix_cos_distance(distance_probe, depth_probe)

0.52868587

In [141]:
np.linalg.norm(en_map_depth - en_map_dist, ord='fro')

9.3411

In [140]:
np.linalg.norm(distance_probe - depth_probe, ord='fro')

3.7718587

In [130]:
matrix_smpl_distance(en_map_depth, en_map_dist)

0.3365877558739269

In [134]:
matrix_max_distance(en_map_depth, en_map_dist)

0.096884206

In [135]:
matrix_max_distance(distance_probe, depth_probe)

0.15160161