In [11]:
import os
import re
import data_strings 
from collections import defaultdict

#### Functions to extract filepaths for social and solo sessions from a root directory. If match_order_solos is also used (see umbrella function), solo sessions will be combined in a specific order. These functions can also be used to find all files with a specific substring in the filename.

In [12]:
def get_relative_paths(match_string, data_folder=data_strings.DATA_FOLDER):
    ''' Find all relative paths for files that contain match_string in
        subfolders of data_folder. Store these filenames in a list '''

    datafile_paths = []

    for subfolder in os.listdir(data_folder):
        subfolder_path = os.path.join(data_folder, subfolder)
        
        # check that the item is a directory
        if os.path.isdir(subfolder_path):
            
            # for each subfolder, check for .json files that contain the matched string
            for filename in os.listdir(subfolder_path):
                
                if filename.endswith('.json') and match_string in filename:
                    # add each relative filepath to the list
                    relative_path = os.path.join(subfolder, filename)
                    datafile_paths.append(relative_path)
                
    return datafile_paths
    

In [13]:
def get_relative_paths_regex(match_string, data_folder=data_strings.DATA_FOLDER):
    ''' Find all relative paths for files that contain match_string in
        subfolders of data_folder. Store these filenames in a dictionary
        with the filename pseudonym as a key. (Useful for combining different
        files from the same player, e.g. solos) '''
    
    datafile_paths = {}

    # regex for identifier pseudonym 
    pattern = re.compile(r'([A-Za-z]{2}\d{2})')

    # check that the item is a directory
    for subfolder in os.listdir(data_folder):
        subfolder_path = os.path.join(data_folder, subfolder)
        
        # check that the item is a directory
        if os.path.isdir(subfolder_path):
            
            # for each subfolder, check for .json files that contain the matched string
            for filename in os.listdir(subfolder_path):
                
                if filename.endswith('.json') and match_string in filename:
                    match = pattern.search(filename)
                    if match:
                        pseudonym = match.group(1) 
                        full_path = os.path.join(subfolder, filename)

                        if pseudonym not in datafile_paths:
                            datafile_paths[pseudonym] = []
                            datafile_paths[pseudonym].append(full_path)

    return datafile_paths


In [14]:
def match_orders_solo(first_solos, second_solos, data_folder=data_strings.DATA_FOLDER):
    '''Ensures that the first solo sessions are correctly ordered w.r.t second solos.
       Takes 2 dictionaries, with key:value as pseudonym:filename, one for each type of solo
       session.'''

    # sort identifiers based on second_solos
    ordered_pseudonyms = sorted(second_solos.keys(), key=lambda pseudo: second_solos[pseudo])

    matched_first_solos = []
    matched_second_solos = []

    # append all values of first_solos to a list in the order of second_solos
    for pseudonym in ordered_pseudonyms:
        if pseudonym in first_solos and len(first_solos[pseudonym]) == 1:
            matched_first_solos.extend(first_solos[pseudonym])

    # also convert the second solo session filenames into a list of strings
    for pseudonym in second_solos.keys():
        matched_second_solos.extend(second_solos[pseudonym])


    return matched_first_solos, matched_second_solos


In [None]:


# First list of individual files
solo_files = get_relative_paths('Solo')

# Second list of social files (with desired pseudonym order)
social_files = get_relative_paths('Social')

# 1. Create a dict of sessions with nested pseudonyms
session_order = {}
for sf in social_files:
    # match the session number and the pseudonym string
    match = re.search(r'(\d+_\d)\\.*?_(.*?)_Social\.json', sf)
    if match:
        session, pseudonyms = match.groups()
        pseudonym_list = pseudonyms.split('_')
        session_order[session] = pseudonym_list

# 2. Group solo filenames by session and pseudonym 
# create dictionary structure to initiate any new entry to the dictionary as
# a default dict, which will contain an empty list
# Note that the argument to a defaultdict is what that defaultdict initialises for 
# any new entries
session_pseudo_files = defaultdict(lambda: defaultdict(list))
for f in solo_files:
    match = re.search(r'(\d+_\d)\\.*?_(\w+?)_(?:First|Second)Solo\.json', f)
    if match:
        session, pseudonym = match.groups()
        session_pseudo_files[session][pseudonym].append(f) # append the entire filename

# 3. Sort each pseudonym's files by timestamp (ensure FirstSolo always comes first)
for session in session_pseudo_files:
    for pseudonym in session_pseudo_files[session]:
        session_pseudo_files[session][pseudonym].sort()

# 4. Reconstruct final ordered list
final_ordered_list = []
for session in session_order:
    for pseudonym in session_order[session]:
        # extend the list with each session's pseudonym's files, or extend by an empty list
        # if these do not exist
        final_ordered_list.extend(session_pseudo_files[session].get(pseudonym, []))

# Output result
for file in final_ordered_list:
    print(file)

In [16]:

def order_solos_by_social(social_files, solo_files):
    ''' Take a list of social files, and interleaved solo files. Use social file order
        to derive session order, and social file pseudonym order to derive pseudonym order'''

    # 1. Create a dict of sessions with nested pseudonyms
    session_order = {}
    for sf in social_files:
        # match the session number and the pseudonym string
        match = re.search(r'(\d+_1)\\.*?_(.*?)_Social\.json', sf)
        if match:
            session, pseudonyms = match.groups()
            pseudonym_list = pseudonyms.split('_')
            session_order[session] = pseudonym_list

    # 2. Group solo filenames by session and pseudonym 
    # create dictionary structure to initiate any new entry to the dictionary as
    # a default dict, which will contain an empty list
    # Note that the argument to a defaultdict is what that defaultdict initialises for 
    # any new entries
    session_pseudo_files = defaultdict(lambda: defaultdict(list))
    for f in solo_files:
        match = re.search(r'(\d+_1)\\.*?_(\w+?)_(?:First|Second)Solo\.json', f)
        if match:
            session, pseudonym = match.groups()
            session_pseudo_files[session][pseudonym].append(f) # append the entire filename

    # 3. Sort each pseudonym's files by timestamp (ensure FirstSolo always comes first)
    for session in session_pseudo_files:
        for pseudonym in session_pseudo_files[session]:
            session_pseudo_files[session][pseudonym].sort()

    # 4. Reconstruct final ordered list
    final_ordered_list = []
    for session in session_order:
        for pseudonym in session_order[session]:
            # extend the list with each session's pseudonym's files, or extend by an empty list
            # if these do not exist
            final_ordered_list.extend(session_pseudo_files[session].get(pseudonym, []))

    return final_ordered_list

In [17]:
def get_all_relative_paths(data_folder=data_strings.DATA_FOLDER):
    ''' Get all filepaths for social and solo sessions from a directory. Order the solo sessions identically '''
    
    # get all filenames 
    social = get_relative_paths('Social', data_folder=data_folder)
    first_solo = get_relative_paths_regex('FirstSolo', data_folder=data_folder)
    second_solo = get_relative_paths_regex('SecondSolo', data_folder=data_folder)
    # ensure solo lists keep the same order
    ordered_first_solo, ordered_second_solo = match_orders_solo(first_solo, second_solo)

    # combine solos into a single list
    solos = [filename for pair in zip(ordered_first_solo, ordered_second_solo) for filename in pair]

    social_ordered_solos = order_solos_by_social(social, solos)
    

    return social, social_ordered_solos

In [18]:
def get_all_relative_paths_split_solos(data_folder=data_strings.DATA_FOLDER):
    ''' Get all filepaths for social and solo sessions from a directory. Order the solo sessions identically '''
    
    social = get_relative_paths('Social', data_folder=data_folder)
    first_solo = get_relative_paths_regex('FirstSolo', data_folder=data_folder)
    second_solo = get_relative_paths_regex('SecondSolo', data_folder=data_folder)
    ordered_first_solo, ordered_second_solo = match_orders_solo(first_solo, second_solo)

    return social, ordered_first_solo, ordered_second_solo

In [19]:
social, social_ordered_solos = get_all_relative_paths()

In [27]:
social

['240913_1\\2024-09-13_11-31-00_YW13_JL13_Social.json',
 '240927_1\\2024-09-27_14-25-20_SH27_EN27_Social.json',
 '241017_1\\2024-10-17_14-28-40_SP17_AW17_Social.json',
 '241017_2\\2024-10-17_16-41-38_ZH17_EM17_Social.json',
 '241112_1\\2024-11-12_13-31-14_KA12_WM12_Social.json',
 '241112_2\\2024-11-12_15-23-24_FA12_SL12_Social.json',
 '241113_1\\2024-11-13_14-18-54_NK13_RD13_Social.json',
 '241113_2\\2024-11-13_15-28-07_YL13_HC13_Social.json',
 '241119_1\\2024-11-19_14-24-49_AV19_XG19_Social.json',
 '241119_2\\2024-11-19_15-22-56_SB19_HH19_Social.json',
 '241120_1\\2024-11-20_14-17-44_JS20_RR20_Social.json',
 '241120_2\\2024-11-20_15-16-21_ZS20_VC20_Social.json',
 '241203_1\\2024-12-03_14-31-51_PO03_NN03_Social.json',
 '241203_2\\2024-12-03_15-27-28_EX03_BC03_Social.json',
 '241210_1\\2024-12-10_14-21-17_TE10_TK10_Social.json',
 '241210_2\\2024-12-10_15-20-11_RK10_RU10_Social.json',
 '241219_1\\2024-12-19_15-28-24_JU19_SY19_Social.json',
 '241220_1\\2024-12-20_17-44-50_KS20_CS20_Social

In [28]:
social_ordered_solos

['240913_1\\2024-09-13_11-23-37_YW13_FirstSolo.json',
 '240913_1\\2024-09-13_11-53-34_YW13_SecondSolo.json',
 '240913_1\\2024-09-13_11-23-50_JL13_FirstSolo.json',
 '240913_1\\2024-09-13_11-53-56_JL13_SecondSolo.json',
 '240927_1\\2024-09-27_14-14-31_SH27_FirstSolo.json',
 '240927_1\\2024-09-27_14-45-55_SH27_SecondSolo.json',
 '240927_1\\2024-09-27_14-14-42_EN27_FirstSolo.json',
 '240927_1\\2024-09-27_14-45-46_EN27_SecondSolo.json',
 '241017_1\\2024-10-17_14-20-28_SP17_FirstSolo.json',
 '241017_1\\2024-10-17_14-50-03_SP17_SecondSolo.json',
 '241017_1\\2024-10-17_14-20-50_AW17_FirstSolo.json',
 '241017_1\\2024-10-17_14-51-22_AW17_SecondSolo.json',
 '241112_1\\2024-11-12_13-19-22_KA12_FirstSolo.json',
 '241112_1\\2024-11-12_13-50-23_KA12_SecondSolo.json',
 '241112_1\\2024-11-12_13-18-59_WM12_FirstSolo.json',
 '241112_1\\2024-11-12_13-50-20_WM12_SecondSolo.json',
 '241113_1\\2024-11-13_14-12-43_NK13_FirstSolo.json',
 '241113_1\\2024-11-13_14-37-06_NK13_SecondSolo.json',
 '241113_1\\2024-11