In [22]:
import subprocess as subp, numpy as np, pandas as pd, os
from subprocess import check_output
from pathlib import Path

In [18]:
def buildMatrix(matrix, sample_list):
    '''
    (square 2-D array, sequence) -> str
    Serialises a square matrix as the text of an MCL matrix file.

    matrix: object exposing .shape and iterable row by row (e.g. a numpy
        array); it must be 2-D and square.
    sample_list: one label per row/column; only its length is used here.

    Returns the file contents as a single string: an mclheader section giving
    the dimensions, then one line per row of the form
    "<row idx>\\t<col idx>:<value> ... $". Diagonal entries are left out.
    No explicit row/column domain lines are written -- presumably MCL's
    canonical 0..n-1 domains are relied upon (verify against the mcxio docs).

    Raises ValueError if matrix is not 2-D, not square, or its size does not
    match len(sample_list).
    '''
    def buildRow(n, ls):
        '''takes a row and its index and returns the formatted row'''
        # Skip the diagonal entry (column index == row index).
        entries = ['{0}:{1}'.format(i, x) for i, x in enumerate(ls) if i != n]
        entries.append('$')  # '$' closes every row
        return '{0}\t{1}'.format(n, ' '.join(entries))

    template = (
        "(mclheader\n"
        "mcltype matrix\n"
        "dimensions {0}x{0}\n"
        ")\n"
        "(mclmatrix\n"
        "begin\n"
        "{1}\n"
        ")"
    )

    # Check the rank first so a 1-D input is reported as a malformed matrix
    # instead of escaping as an IndexError from shape[1], and so that a
    # higher-rank array cannot slip through the square test.
    shape = matrix.shape
    if len(shape) != 2 or shape[0] != shape[1] or shape[0] != len(sample_list):
        raise ValueError('buildMatrix encountered malformed matrix!')

    row_text = [buildRow(i, row) for i, row in enumerate(matrix)]

    return template.format(len(sample_list), '\n'.join(row_text))


def mcl(currString, tabpath, iValue, piValue, raw=False):
    '''
    (str, path or None, num, num, bool) -> [[str]] or str
    helper function for repeatedly running mcl over an array of I and PI values.

    currString: matrix file contents (as produced by buildMatrix); written to
        a temporary file in the current directory for mcl to read.
    tabpath: tab file handed to mcl via -use-tab, or None to omit that option.
    iValue: value passed to mcl's -I option.
    piValue: value passed to mcl's -pi option; the option is only added when
        piValue > 0.
    raw: when True the decoded mcl output is returned unparsed.

    Returns mcl's standard output, either as one string (raw=True) or as a
    list of lines, each split on tabs.

    Raises whatever check_output raises (e.g. CalledProcessError when mcl
    exits non-zero, FileNotFoundError when mcl is not installed). The
    temporary file is removed in every case.
    '''
    tempname = 'temp{}.mci'.format(os.getpid())

    # Assemble the argument list once; only -use-tab and -pi are optional.
    command = ["mcl", tempname]
    if tabpath is not None:
        # str() so Path objects work on platforms where subprocess does not
        # accept path-like entries in the argument list.
        command += ["-use-tab", str(tabpath)]
    command += ["-I", str(iValue), "-o", "-"]
    if piValue > 0:
        command += ["-pi", str(piValue)]
    command += ["-q", "x", "-V", "all", "-te", "1"]

    try:
        with open(tempname, 'w') as temp:
            temp.write(currString)
        result = check_output(command).decode()
    finally:
        # Clean up even when mcl fails, so repeated runs leave no stale files.
        if os.path.exists(tempname):
            os.remove(tempname)

    if raw:
        return result

    return [line.split('\t') for line in result.rstrip('\n').split("\n")]

def recordTab(sample_list, tabpath):
    '''
    (iterable, path) -> None
    Writes the tab file: one "<index> <label>" line per entry of sample_list,
    numbered from 0 in iteration order. An existing file at tabpath is
    overwritten.
    '''
    with open(tabpath, 'w') as tab_file:
        tab_file.writelines(
            "{0} {1}\n".format(str(position), label)
            for position, label in enumerate(sample_list)
        )

In [13]:
# Locations of the distance-matrix file read below and of the tab file that
# recordTab() writes. Named matrix_path rather than `input` so the builtin
# input() stays usable in the rest of the notebook.
matrix_path = Path("/d/data/toxo/fs_matrix.txt")
tab_path = Path("tab.txt")

# Space-separated table: first row supplies the column labels, first column
# supplies the row labels.
dist_df = pd.read_csv(matrix_path, sep=" ", header=0, index_col=0)

# Plain array of values plus the matching labels taken from the column header.
dists = dist_df.to_numpy()
sample_list = dist_df.columns


In [37]:
# Values handed to mcl() as iValue / piValue (its -I and -pi options).
ival, pival = 7.3, 5.1

# Serialise the matrix into MCL's text layout, then write the
# index -> sample-name tab file that mcl() passes along via -use-tab.
mcl_matrix = buildMatrix(dists, sample_list)
recordTab(sample_list, tab_path)

# Run mcl; raw is left at its default (False), so the parsed list of
# clusters is returned and displayed as the cell output.
groups = mcl(mcl_matrix, tab_path, ival, pival)
groups

[['CAST', 'COUG', 'GT1', 'P89', 'RH88', 'ROD', 'TGSHUS28'],
 ['ARI', 'ARI.MG', 'B41', 'RAY', 'TGSK', 'X3142'],
 ['B73', 'ME49', 'PRU', 'S23', 'S30', 'TGGOATUS21'],
 ['CTG', 'M7741', 'S22', 'SOU', 'VEG']]