## RasMMA usage
Usage of RasMMA.ipynb

In [None]:
# Run clustering and write the results as pickle files: <tag>_intermediate.pickle, <tag>_initialDict.pickle, <tag>_roundInfos.pickle and, when a residual exists, <tag>_residual.pickle
% run RasMMA.ipynb
import os

def startClustering(data_directory, tag, outputPath, thresholdValue=None):
    """Run RasMMA clustering on a data directory and pickle the results.

    The results are written below ``<outputPath>/pickle/`` as
    ``<tag>_intermediate.pickle``, ``<tag>_initialDict.pickle``,
    ``<tag>_roundInfos.pickle`` and, only when the clustering returns a
    residual that is not ``None``, ``<tag>_residual.pickle``.

    Args:
        data_directory: Directory holding the input data. If it contains no
            entries, "Data Empty" is printed and nothing is created or written.
        tag: Prefix used for the pickle file names; also forwarded to
            ``do_RasMMA_clustering``.
        outputPath: Output directory, created when missing. It may be given
            with or without a trailing path separator.
        thresholdValue: Forwarded unchanged to ``do_RasMMA_clustering``.

    Returns:
        None.
    """
    # Imported locally: this cell itself only imports ``os``, so the function
    # should not rely on ``pickle`` being provided by another notebook.
    import pickle

    if not os.listdir(data_directory):
        print("Data Empty")
        return

    # os.path.join keeps the pickle directory inside outputPath even when the
    # caller omits the trailing slash; exist_ok avoids the check-then-create
    # race when several processes prepare their directories at the same time.
    pickleDir = os.path.join(outputPath, "pickle")
    os.makedirs(pickleDir, exist_ok=True)

    # link RasMMA algorithm logic
    intermediatePool, initialDict, roundInfos, residual = do_RasMMA_clustering(
        data_directory, tag, outputPath, thresholdValue)

    # File-name suffix -> object to serialise.
    results = {
        '_intermediate.pickle': intermediatePool,
        '_initialDict.pickle': initialDict,
        '_roundInfos.pickle': roundInfos,
    }
    if residual is not None:
        # The residual file is only produced when a residual was returned.
        results['_residual.pickle'] = residual

    for suffix, payload in results.items():
        with open(os.path.join(pickleDir, tag + suffix), 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Main Cell
Usage example: run the clustering for one dataset and get its results.

In [None]:
# basic global inputs variable
def main(data_directory, tag, outputPath, manualThresholdNumber):
    """Run ``startClustering`` once, printing a timestamp before and after.

    All four arguments are handed to ``startClustering`` unchanged. The two
    printed timestamps (local time, minute resolution) bracket the run so its
    duration can be read off the output.
    """
    from datetime import datetime

    stamp_format = "%Y-%b-%d %H:%M"

    print(datetime.now().strftime(stamp_format))  # start of the run
    startClustering(data_directory, tag, outputPath, manualThresholdNumber)
    print(datetime.now().strftime(stamp_format))  # end of the run


# manualThresholdNumber = 0.8 # defined the threshold of merge score
# familyName = "domaiq"
# data_directory = "/home/master/r07725027/dataset/aries_v2_simplified_15up/9.domaiq/" # data trace directory
# tag = familyName + "_0.8" # used for naming pickle
# outputPath = "/home/master/r07725027/dataset/rasMMA-output/" + tag + "/"
# pickleDir = outputPath + "pickle/"

# print(outputPath, pickleDir)
# main(data_directory, tag, outputPath, manualThresholdNumber)

## Run RasMMA in multiple processes for Aries_V2_simplified_15up

In [None]:
manualThresholdNumber = 0.8 # merge-score threshold; extract() passes it to main(), which forwards it to startClustering()

In [None]:
import sys
from multiprocessing import Pool, Manager

# Inclusive [lowest, highest] family ID range to run RasMMA on.
extract_family_range = [1, 15]
# Dataset root; get_family_names() expects its entries to be named "<id>.<family>". Change to your dataset path.
family_folder_path = '/home/master/r07725027/dataset/aries_v2_simplified_15up'
# Root directory under which extract() builds one output folder per tag. Change to your output path.
output_path_root = "/home/master/r07725027/dataset/rasMMA-output"
# Pickle directory of the family being processed; reassigned as a global inside extract().
pickle_dir = ''

# Entries of the dataset root, listed once when this cell runs.
families = os.listdir(family_folder_path)

# return family directory within range
def get_family_names():
    """Return the dataset entries whose numeric ID lies in the configured range.

    Entries of ``families`` are expected to be named ``<id>.<name>``. An entry
    is kept when ``<id>`` parses as an integer that lies within
    ``extract_family_range`` (both bounds inclusive). Entries that do not
    follow the naming scheme (for example ``.ipynb_checkpoints`` or a stray
    ``README``) are skipped instead of aborting the whole run.

    Returns:
        list[str]: The matching entry names, in the order of ``families``.
    """
    lowest, highest = extract_family_range[0], extract_family_range[1]

    family_names = []
    for family in families:
        # Split on the first dot only, so a name part that itself contains
        # dots does not break the parsing.
        family_num, separator, family_name = family.partition('.')
        if not separator or not family_name:
            continue

        try:
            family_num = int(family_num)
        except ValueError:
            # Not a "<id>.<name>" entry (hidden file, notes, ...): ignore it.
            continue

        if lowest <= family_num <= highest:
            family_names.append(family)

    return family_names

# extract function for multiprocessing
def extract(family_name, error_messages):
    """Run the clustering for one family; intended as a worker for the pool.

    Args:
        family_name: Dataset entry name of the form ``<id>.<name>``.
        error_messages: Queue-like object shared between processes; one
            ``"<tag> -> <ExceptionType>: <message>"`` string is put on it
            when the run fails.

    Side effects:
        Rebinds the module-level ``pickle_dir`` to this family's pickle
        directory.
    """
    global pickle_dir

    # The tag names the output folder and the pickle files. It is derived from
    # the configured threshold so the names stay truthful when the threshold
    # is changed (for 0.8 this yields the same "_0.8" suffix as before).
    family_label = family_name.split('.')[1]
    tag = f'{family_label}_{manualThresholdNumber}'

    data_directory = f'{family_folder_path}/{family_name}/'
    output_path = f'{output_path_root}/{tag}/'
    pickle_dir = f'{output_path}pickle/'

    try:
        main(data_directory, tag, output_path, manualThresholdNumber)
    except Exception as e:
        # Worker boundary: report the failure and return normally, so the
        # exception is not re-raised in the parent by pool.starmap and the
        # other families can still be processed.
        error = str(e)
        print('Error: ' + family_name, type(e).__name__, ': ', error)
        error_messages.put(f'{tag} -> {type(e).__name__}: {error}')
    

def main_extract(processes=15):
    """Run ``extract`` for every selected family in a process pool.

    Args:
        processes: Upper bound on the number of worker processes (a positive
            integer). The pool never gets more workers than there are
            families to process.

    After all workers have finished, the error messages they reported are
    printed below an "Error Messages" header.
    """
    family_names = get_family_names()

    # The manager hosts the queue shared between processes; using it as a
    # context manager shuts its server process down once the messages have
    # been printed.
    with Manager() as manager:
        error_messages = manager.Queue()

        if family_names:
            worker_count = min(processes, len(family_names))
            with Pool(processes=worker_count) as pool:
                # starmap passes several arguments to extract for each job
                pool.starmap(extract, [(family, error_messages) for family in family_names])

        print('\n----- Error Messages -----')
        while not error_messages.empty():
            print(error_messages.get())
        
# Guard the entry point: with the "spawn" start method each worker re-imports
# the main module, and an unguarded call would start a new pool in every
# worker. In a notebook __name__ is "__main__", so the call still runs here.
if __name__ == "__main__":
    main_extract()