In [3]:
from process_CSL import * 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 

## Representation density plotting
1. Load the label files
  - Count the glosses in each sentence and compare that count with the number of frames
  - Divide the frames evenly among the glosses and store each group of frames in a dictionary keyed by its gloss

2. Load the embeddings and perform PCA or t-SNE
  - This can be run on a chosen number of sampled videos, and the reduced embeddings are then plotted

In [4]:
'''
Loading the labels
'''

def read_CSL_annotations(CSL_annot_path):
    """
    Load a pickled CSL annotation file and return the unpickled object.

    Parameters:
    CSL_annot_path (str): Path to the ``.pkl`` annotation file.

    Returns:
    The object stored in the file, exactly as ``pickle.load`` produces it.

    Note: unpickling can execute arbitrary code, so only use this on
    annotation files from a trusted source.
    """
    # Binary mode is required by pickle; the context manager closes the file
    # even if unpickling fails.
    with open(CSL_annot_path, 'rb') as annot_file:
        return pickle.load(annot_file)

## Reading in the labelled annotations
# All paths are relative to the notebook's working directory.
# Per-split label files: as the displayed outputs show, each is a dict with an
# 'info' list whose entries carry 'name', 'translation' and 'length'.
train_labels = read_CSL_annotations("../../CSL-Daily/sentence_label/processed/labels_train.pkl") 
dev_labels = read_CSL_annotations("../../CSL-Daily/sentence_label/processed/labels_dev.pkl")
test_labels = read_CSL_annotations("../../CSL-Daily/sentence_label/processed/labels_test.pkl")
# Combined annotation file: its 'info' entries carry 'label_gloss' (plus
# 'label_char', 'label_word', 'label_postag', 'signer', 'time'), which is where
# the gloss sequences used below come from. find_and_combine_glossdicts reads
# this variable as a notebook-level global.
combined_labels = read_CSL_annotations("../../CSL-Daily/sentence_label/csl2020ct_v2.pkl")

In [5]:
train_labels

{'info': [{'name': 'S000000_P0000_T00', 'translation': '你们好！', 'length': 52},
  {'name': 'S000000_P0004_T00', 'translation': '你们好！', 'length': 47},
  {'name': 'S000000_P0008_T00', 'translation': '你们好！', 'length': 58},
  {'name': 'S000001_P0000_T00', 'translation': '对不起！', 'length': 37},
  {'name': 'S000001_P0004_T00', 'translation': '对不起！', 'length': 33},
  {'name': 'S000001_P0008_T00', 'translation': '对不起！', 'length': 45},
  {'name': 'S000002_P0000_T00', 'translation': '没关系！', 'length': 29},
  {'name': 'S000002_P0004_T00', 'translation': '没关系！', 'length': 35},
  {'name': 'S000002_P0008_T00', 'translation': '没关系！', 'length': 49},
  {'name': 'S000003_P0000_T00', 'translation': '谢谢！', 'length': 30},
  {'name': 'S000003_P0004_T00', 'translation': '谢谢！', 'length': 32},
  {'name': 'S000003_P0008_T00', 'translation': '谢谢！', 'length': 35},
  {'name': 'S000004_P0000_T00', 'translation': '不客气！', 'length': 51},
  {'name': 'S000004_P0004_T00', 'translation': '不客气！', 'length': 41},
  {'name': 'S00

In [6]:
dev_labels

{'info': [{'name': 'S000020_P0000_T00', 'translation': '他今年四岁。', 'length': 54},
  {'name': 'S000020_P0008_T00', 'translation': '他今年四岁。', 'length': 90},
  {'name': 'S000040_P0000_T00', 'translation': '今天星期几？', 'length': 41},
  {'name': 'S000040_P0004_T00', 'translation': '今天星期几？', 'length': 42},
  {'name': 'S000054_P0000_T00', 'translation': '今天我想吃面条。', 'length': 56},
  {'name': 'S000153_P0004_T00', 'translation': '你和小张什么时候认识的？', 'length': 55},
  {'name': 'S000153_P0008_T00', 'translation': '你和小张什么时候认识的？', 'length': 109},
  {'name': 'S000185_P0000_T00', 'translation': '我要去超市买椅子，你去吗？', 'length': 91},
  {'name': 'S000195_P0000_T00', 'translation': '他们下午要做什么？', 'length': 49},
  {'name': 'S000195_P0008_T00', 'translation': '他们下午要做什么？', 'length': 84},
  {'name': 'S000196_P0000_T00', 'translation': '他们想什么时候去买椅子？', 'length': 79},
  {'name': 'S000196_P0004_T00', 'translation': '他们想什么时候去买椅子？', 'length': 90},
  {'name': 'S000201_P0004_T00', 'translation': '我每天六点起床。', 'length': 134},
  {'name': 'S

In [7]:
test_labels

{'info': [{'name': 'S000020_P0004_T00', 'translation': '他今年四岁。', 'length': 52},
  {'name': 'S000040_P0008_T00', 'translation': '今天星期几？', 'length': 66},
  {'name': 'S000054_P0008_T00', 'translation': '今天我想吃面条。', 'length': 93},
  {'name': 'S000153_P0000_T00', 'translation': '你和小张什么时候认识的？', 'length': 76},
  {'name': 'S000185_P0004_T00', 'translation': '我要去超市买椅子，你去吗？', 'length': 108},
  {'name': 'S000185_P0008_T00', 'translation': '我要去超市买椅子，你去吗？', 'length': 161},
  {'name': 'S000195_P0004_T00', 'translation': '他们下午要做什么？', 'length': 51},
  {'name': 'S000196_P0008_T00', 'translation': '他们想什么时候去买椅子？', 'length': 124},
  {'name': 'S000201_P0000_T00', 'translation': '我每天六点起床。', 'length': 79},
  {'name': 'S000213_P0004_T00', 'translation': '他每天回来都很累。', 'length': 71},
  {'name': 'S000213_P0008_T00', 'translation': '他每天回来都很累。', 'length': 115},
  {'name': 'S000218_P0004_T00', 'translation': '这块手表是你的吗？', 'length': 31},
  {'name': 'S000218_P0008_T00', 'translation': '这块手表是你的吗？', 'length': 60},
  {'nam

In [8]:
combined_labels

{'info': [{'name': 'S000000_P0000_T00',
   'length': 52,
   'label_gloss': ['你们', '好'],
   'label_char': ['你', '们', '好', '！'],
   'label_word': ['你们', '好', '！'],
   'label_postag': ['r', 'a', 'w'],
   'signer': 0,
   'time': 0},
  {'name': 'S000000_P0004_T00',
   'length': 47,
   'label_gloss': ['你们', '好'],
   'label_char': ['你', '们', '好', '！'],
   'label_word': ['你们', '好', '！'],
   'label_postag': ['r', 'a', 'w'],
   'signer': 4,
   'time': 0},
  {'name': 'S000000_P0008_T00',
   'length': 58,
   'label_gloss': ['你们', '好'],
   'label_char': ['你', '们', '好', '！'],
   'label_word': ['你们', '好', '！'],
   'label_postag': ['r', 'a', 'w'],
   'signer': 8,
   'time': 0},
  {'name': 'S000001_P0000_T00',
   'length': 37,
   'label_gloss': ['对不起'],
   'label_char': ['对', '不', '起', '！'],
   'label_word': ['对不起', '！'],
   'label_postag': ['v', 'w'],
   'signer': 0,
   'time': 0},
  {'name': 'S000001_P0004_T00',
   'length': 33,
   'label_gloss': ['对不起'],
   'label_char': ['对', '不', '起', '！'],
   'la

## Matching frames to gloss function

In [9]:
'''
Function to gather all the tensor .pt files from a specific folder name
1. Load the file labels 
  - Check the number of glosses and compare the number of frames 
  - divide the frames accordingly and label them according to that gloss in a dictionary
'''
def gather_vid_emb(name, phase, img_dir = "../../CSL-Daily/sentence/frames_512x512"): 
  """
  Load every ``.pt`` file stored in the folder of one video.

  Parameters:
  name (str): Video folder name (e.g. 'S000000_P0000_T00').
  phase (str): Sub-folder of ``img_dir`` that contains the video folder.
  img_dir (str): Root directory holding one sub-folder per phase.

  Returns:
  list: The objects returned by ``torch.load`` for each ``.pt`` file, in
  lexicographic file-name order.

  NOTE(review): presumably each file holds the embedding of one frame and the
  file names are zero-padded so that lexicographic order equals frame order --
  confirm against the code that wrote the files.
  """
  vid_folder = os.path.join(img_dir, f"{phase}/{name}")
  print("Getting video frames from ", vid_folder)

  # Collect only the saved tensor files, ignoring anything else in the folder
  pt_files = []
  for fname in os.listdir(vid_folder):
    if fname.endswith('.pt'):
      pt_files.append(fname)
  pt_files.sort()
  print(pt_files)

  # Load the files one by one, preserving the sorted order
  loaded = []
  for fname in pt_files:
    loaded.append(torch.load(os.path.join(vid_folder, fname)))
  return loaded

def get_gloss(combined_annotations, name): 
  """
  Return the gloss sequence annotated for one video.

  Parameters:
  combined_annotations: Annotation object to search; it is passed straight to
    ``find_entry_by_name``.
  name (str): Video name to look up (e.g. 'S000000_P0000_T00').

  Returns:
  The value stored under the entry's 'label_gloss' key.

  NOTE(review): ``find_entry_by_name`` is defined outside this notebook;
  presumably it returns the 'info' entry whose 'name' matches -- confirm how
  it behaves when no entry is found.
  """
  # Search the annotations that were passed in. The previous version ignored
  # this argument and always read the notebook-level `combined_labels`.
  entry = find_entry_by_name(combined_annotations, name)
  return entry['label_gloss']

def match_frames_w_gloss(tensor_lst, gloss): 
  """
  Split the frames of one video evenly among its glosses.

  The frames are cut into ``len(gloss)`` consecutive segments of
  ``len(tensor_lst) // len(gloss)`` frames each; the last segment also takes
  whatever frames remain after the integer division. This is a uniform split,
  not a real alignment between frames and glosses.

  Parameters:
  tensor_lst (list): Per-frame items of one video, in frame order.
  gloss (list): Gloss labels of the sentence, in signing order.

  Returns:
  dict: Maps each gloss to a list of segments, where a segment is a slice of
  ``tensor_lst``. A gloss that occurs once maps to a one-element list; a gloss
  repeated in the sentence gets one segment per occurrence. An empty ``gloss``
  list yields an empty dict.
  """
  num_gloss = len(gloss)
  # Nothing to assign frames to; also avoids dividing by zero below
  if num_gloss == 0:
    return {}

  # If there are fewer frames than glosses this is 0: every segment but the
  # last is then empty and the last one receives all frames.
  tensors_per_gloss = len(tensor_lst) // num_gloss

  gloss_dict = {}
  for i, g in enumerate(gloss):
    start = i * tensors_per_gloss
    # The final gloss absorbs the remainder of the integer division
    end = None if i == num_gloss - 1 else start + tensors_per_gloss
    # Append rather than assign, so a repeated gloss keeps all of its segments
    gloss_dict.setdefault(g, []).append(tensor_lst[start:end])

  return gloss_dict
  
def find_and_combine_glossdicts(phase, num_samples = None , name_lst= None , dir="../../CSL-Daily/sentence/frames_512x512"):
  """
  Build per-video gloss dictionaries and merge them into one dictionary.

  Parameters:
  phase (str): One of 'train', 'dev' or 'test'.
  num_samples (int, optional): Number of videos to draw at random from
    ``dir/phase``. Only used when ``name_lst`` is None.
  name_lst (list, optional): Explicit video names to process; takes priority
    over ``num_samples``.
  dir (str): Root directory holding one sub-folder per phase.

  Returns:
  tuple: ``(gloss_dict_lst, combined_gloss_dict)``.
    ``gloss_dict_lst`` maps each video name to the dictionary returned by
    ``match_frames_w_gloss``. ``combined_gloss_dict`` maps each gloss to the
    segments collected for it across all processed videos.

  Raises:
  AssertionError: if ``phase`` is invalid, or if neither ``num_samples`` nor
    ``name_lst`` is given.
  ValueError: from ``random.sample`` if ``num_samples`` exceeds the number of
    available video folders.

  Note: glosses are looked up in the notebook-level ``combined_labels``.
  """
  # Local import so the function does not depend on `random` leaking in
  # through a star import.
  import random

  ## make assertions to prevent error 
  assert phase in ['train', 'dev', 'test'], "Phase must be either train, dev or test"
  assert num_samples is not None or name_lst is not None, "Either number of samples or name list must be provided"

  if name_lst is None:
    phase_dir = os.path.join(dir, phase)
    # Keep only directories, because every sampled name is later opened as a
    # folder. Sort them so a seeded `random` gives a reproducible sample
    # (os.listdir order is arbitrary).
    video_lst = sorted(
      entry for entry in os.listdir(phase_dir)
      if os.path.isdir(os.path.join(phase_dir, entry))
    )
    # Sample random number of videos
    name_lst = random.sample(video_lst, num_samples)

  ## Gather various gloss dicts 
  gloss_dict_lst = {}
  for name in name_lst: 
    gloss = get_gloss(combined_labels, name)
    # Forward `dir` so frames are loaded from the directory that was sampled
    tensor_lst = gather_vid_emb(name, phase, img_dir=dir)
    gloss_dict_lst[name] = match_frames_w_gloss(tensor_lst, gloss)

  # Combine gloss dicts. Iterating the dict (not name_lst) means a name listed
  # twice is only counted once.
  combined_gloss_dict = {}
  for per_video_dict in gloss_dict_lst.values(): 
    for g, segments in per_video_dict.items(): 
      # Extend a fresh list so the per-video lists are never shared with, and
      # mutated through, the combined dictionary.
      combined_gloss_dict.setdefault(g, []).extend(segments)
  return gloss_dict_lst, combined_gloss_dict



# Dimensionality reduction and plotting functions
2. Load the embeddings and perform PCA or t-SNE
  - This can be run on a chosen number of sampled videos, and the reduced embeddings are then plotted

In [10]:
from sklearn.decomposition import PCA

def perform_pca(data, n_components=2):
    """
    Fit a PCA model on ``data`` and project the data onto its components.

    Parameters:
    data (array-like): Input samples handed to ``PCA.fit_transform``.
    n_components (int): Number of principal components to keep.

    Returns:
    tuple: ``(transformed_data, pca)``, where ``transformed_data`` is the
    output of ``fit_transform`` and ``pca`` is the fitted PCA object.
    """
    reducer = PCA(n_components=n_components)
    # fit_transform fits `reducer` in place, so the returned object is fitted
    return reducer.fit_transform(data), reducer

# Example usage:
# Assuming `data` is your input data in the form of a numpy array or pandas DataFrame
# transformed_data, pca = perform_pca(data)