In [1]:
import os, pickle
import pandas as pd
import numpy as np

In [2]:
% run FeatureTrace.ipynb
def getFeatureProfiles(filePaths):
    """Build a {file base name: feature trace} mapping for the given files.

    Each path is handed to ``FeatureTrace`` (brought into scope by
    ``%run FeatureTrace.ipynb``) and the value stored is whatever its
    ``getTrace_noContainTS()`` method returns.

    Parameters
    ----------
    filePaths : iterable of str
        Paths of the files to profile.

    Returns
    -------
    dict
        Keyed by ``os.path.basename(path)``. Two paths that share a base
        name collide, and the later one wins.
    """
    traceFactory = FeatureTrace
    return {
        os.path.basename(path): traceFactory(path).getTrace_noContainTS()
        for path in filePaths
    }

In [3]:
% run Alignment_Fast3.ipynb

def do_alignment(rep1, rep2, alignType):
    """Align two sequences and return the aligned/common part.

    Parameters
    ----------
    rep1, rep2 :
        The two sequences passed straight to ``localAlign`` / ``globalAlign``
        (both come from ``%run Alignment_Fast3.ipynb``).
    alignType : str
        ``'local'`` or ``'global'``.

    Returns
    -------
    * ``'local'``  -> element ``[2]`` of ``localAlign``'s result, unchanged.
    * ``'global'`` -> a list with the second item of every aligned pair whose
      two sides carry an equal second item (the common sequence).
    * anything else -> ``None``.

    NOTE(review): the trailing arguments ``10, -1, 0`` are presumably the
    match / mismatch / gap scores -- confirm against Alignment_Fast3.ipynb.
    """
    if alignType == 'global':
        alignedPairs = globalAlign(rep1, rep2, 10, -1, 0)[2]
        # Each aligned pair is ((leftIndex, leftAPI), (rightIndex, rightAPI));
        # keep the API only where both sides agree.
        return [
            leftAPI
            for (_leftIndex, leftAPI), (_rightIndex, rightAPI) in alignedPairs
            if leftAPI == rightAPI
        ]

    if alignType == 'local':
        return localAlign( rep1, rep2, 10, -1, 0)[2]

    return None

In [4]:
def getTestingResult_localAlign(testFileTrace, modelSeq):
    """Return the length of the 'local' alignment result of the two sequences.

    The score is ``len()`` of whatever ``do_alignment(..., 'local')`` yields.
    """
    return len(do_alignment(testFileTrace, modelSeq, 'local'))

def getTestingResult_globalAlign(testFileTrace, modelSeq):
    """Globally align the two sequences and report the common part.

    Returns
    -------
    tuple
        ``(number of common elements, list of common elements)`` as produced
        by ``do_alignment(..., 'global')``.
    """
    sharedSeq = do_alignment(testFileTrace, modelSeq, 'global')
    sharedLen = len(sharedSeq)
    return sharedLen, sharedSeq
def getTestingResult_setIntersection(testFileTrace, modelSeq):
    """Fraction of the test trace's distinct elements that also occur in the model.

    Parameters
    ----------
    testFileTrace : iterable of hashable
        Trace of the file under test.
    modelSeq : iterable of hashable
        Model sequence to compare against.

    Returns
    -------
    float
        ``|set(test) & set(model)| / |set(test)|``, in ``[0, 1]``.
        An empty test trace yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    testItems = set(testFileTrace)
    if not testItems:
        # Nothing to match: avoid dividing by zero.
        return 0.0
    return len(testItems.intersection(modelSeq)) / len(testItems)

In [30]:
# familyName = "eggnog"
# gen = "main"
def doRasmmaTesting(outputFileName, gen, rasmmaModelDict, data_dir_path, my_tree_weigth = None):
    """Score every file in a directory against every RasMMA model and pickle the result.

    Parameters
    ----------
    outputFileName : str
        Path of the pickle file to write. Its parent directory is created
        when missing.
    gen :
        Unused; kept so existing callers keep working.
    rasmmaModelDict : dict
        ``{label: (treeWeight, modelSeq)}``.
    data_dir_path : str
        Directory holding the test files (only regular files directly inside
        it are used). A trailing separator is optional.
    my_tree_weigth : number, optional
        When given, replaces every model's own ``treeWeight``.

    Result layout (pickled, nothing is returned)
    --------------------------------------------
    ``{fileKey: {label: (commonLength * weight, commonSeq)}}`` where
    ``commonLength, commonSeq`` come from ``getTestingResult_globalAlign``.
    Only the length is weighted. Multiplying the whole ``(length, seq)`` tuple
    by the weight raises ``TypeError`` for float weights and repeats the
    tuple for integer weights above 1.

    ``fileKey`` is the first six characters of the file name, an underscore,
    and the second ``_``-separated field up to its first dot.
    NOTE(review): this assumes every test file name contains an underscore;
    a name without one raises ``IndexError`` -- confirm against the data set.
    """
    # os.path.join works whether or not data_dir_path ends with a separator.
    testFilePaths = []
    for entry in os.listdir(data_dir_path):
        candidate = os.path.join(data_dir_path, entry)
        if os.path.isfile(candidate):
            testFilePaths.append(candidate)
    testFile_featureTraces = getFeatureProfiles(testFilePaths)

    result_dict = dict()

    for testFileName, testFileTrace in testFile_featureTraces.items():
        scoreLenDict = dict()
        for labelTag, (treeWeight, modelSeq) in rasmmaModelDict.items():
            if my_tree_weigth is not None:
                treeWeight = my_tree_weigth
            commonLen, commonSeq = getTestingResult_globalAlign(testFileTrace, modelSeq)
            # Weight the numeric score only; keep the common sequence as-is.
            scoreLenDict[labelTag] = (commonLen * treeWeight, commonSeq)
        resultKey = testFileName[0:6] + '_' + testFileName.split("_")[1].split(".")[0]
        result_dict[resultKey] = scoreLenDict

    # Make sure the target directory exists so a long run is not lost on save.
    outputDirName = os.path.dirname(outputFileName)
    if outputDirName:
        os.makedirs(outputDirName, exist_ok=True)

    with open(outputFileName, 'wb') as fHandle:
        pickle.dump(result_dict, fHandle, protocol=pickle.HIGHEST_PROTOCOL)

    # Report the file actually written instead of relying on a global name.
    print("Saving Result:", outputFileName)

### Testing using only the RasMMA representative sequences

In [6]:
%run CollectForestInfo.ipynb
import os

#### Building Model

In [7]:
# Root folder: every entry inside it is treated as one family directory.
base_dir = "output/top3_party_0622/"
familyDirs = [base_dir + entry + '/' for entry in os.listdir(base_dir)]

# "<tag>_<treeName>" -> (tree weight, representative API sequence)
rasmmaModel_dict = {}
# family name -> {treeName: (tree weight, len(repSeq), memberCount)}
weight_dict = {}

ignoreFamilys = set()

for fam in familyDirs:
    pickleDir = fam + 'pickle/'
    tag = fam.split('/')[-2]          # directory name of this family
    familyKey = tag.split("_")[0]     # part of the tag before the first underscore

    # one pickle is a forest
    forestInfo = CollectForestInfo(pickleDir + tag + "_intermediate.pickle",
                                   pickleDir + tag + "_residual.pickle",
                                   True)
    forestMemberCount = forestInfo.getForestMemberCount()

    treeStats = dict()
    weight_dict[familyKey] = treeStats
    for treeName in forestInfo.getTreeRootNameList():
        memberCount = len(forestInfo.getTreeMembers(treeName))
        repSeq = forestInfo.getRepAPISeq(treeName)
        # A tree's weight is its share of the forest's members.
        treeWeight = memberCount / forestMemberCount
        rasmmaModel_dict[tag + '_' + treeName] = (treeWeight, repSeq)
        treeStats[treeName] = (treeWeight, len(repSeq), memberCount)

# A family is kept only if at least one of its trees has a representative
# sequence longer than 10 and more than 2 members; otherwise it is ignored.
for fName, trs in weight_dict.items():
    hasUsableTree = any(info[1] > 10 and info[2] > 2 for info in trs.values())
    if not hasUsableTree:
        ignoreFamilys.add(fName)


print("=== Finish building model ===")
print("Residual Families:", len(familyDirs) - len(ignoreFamilys))
print(len(ignoreFamilys), "Families did not match requirement: \n",ignoreFamilys)

=== Finish building model ===
Residual Families: 28
23 Families did not match requirement: 
 {'jadtre', 'nsanti', 'startpage', 'wenper', 'vundo', 'ogimant', 'jeefo', 'lydra', 'gamarue', 'bayrob', 'ganelp', 'devir', 'fujacks', 'koutodoor', 'simile', 'webprefix', 'klez', 'virlock', 'cycbot', 'rimecud', 'kwbot', 'fareit', 'cerber'}


### RasMMA(only) Testing

In [None]:
#### top3 testing

# Collect one family name per model directory: the directory name up to its
# first underscore.
inputParams = list()
for famDir in familyDirs:
    dirName = famDir.split('/')[-2]
    familyName = dirName.split('_')[0]
    inputParams.append(familyName)
print("=== Prepared Input Parameters ===")

test_base_path = "11939data/no_consensus_all/"
outputDir = "output/omg_testing/11939_no_consensus/global_align/"

# Every tree gets the same weight instead of its member share.
tree_weigth = 1

for inputPar in inputParams:
    # Families filtered out while building the model are not tested.
    if inputPar in ignoreFamilys:
        continue

    familyName = inputPar
    # The data directory and output file are the same for every family, so
    # once the output file exists all remaining families are skipped.
    data_dir = test_base_path
    opfName = outputDir + 'result_no_consensus.pickle'

    # Already done (maybe a previous run stopped on a memory error): skip.
    if os.path.isfile(opfName):
        continue

    # Test data directory is missing: skip.
    if not os.path.isdir(data_dir):
        print("no this dir:", familyName)
        continue

    print("--> use testing data:", familyName)
    doRasmmaTesting(opfName, "", rasmmaModel_dict, data_dir, tree_weigth)
    print("--> Finish testing.")

=== Prepared Input Parameters ===
--> use testing data: allaple
