In [1]:
import os

In [7]:
# reverse lookup: find the clusterID that corresponds to a hooklogName

def findClusterID(nameDict, hooklogName):
    """Reverse lookup of a cluster ID by the name stored under it.

    Args:
        nameDict: mapping whose values are compared against ``hooklogName``.
        hooklogName: the value to search for.

    Returns:
        The first key (in dict iteration order) whose value equals
        ``hooklogName``; ``hooklogName`` itself when no value matches.
    """
    matchingIDs = (clusterID for clusterID, name in nameDict.items() if name == hooklogName)
    return next(matchingIDs, hooklogName)

def getInitialNameDict(initialDict):
    """Build a ``key -> name`` mapping from ``initialDict``.

    Args:
        initialDict: mapping whose values are indexable; element ``[0]`` of
            each value is taken as the name.

    Returns:
        A new dict with the same keys, each mapped to ``value[0]``.
    """
    return {clusterID: info[0] for clusterID, info in initialDict.items()}

# convert memberSet to List type

def getMemberList(memberSet, nameDict):
    """Convert a set of member names into a list of cluster IDs.

    Each member is resolved with ``findClusterID``: members that appear as a
    value in ``nameDict`` are replaced by the corresponding key, all others
    are kept unchanged.

    The previous implementation consumed ``memberSet`` with ``pop()`` and left
    the caller's set empty. The set is now only iterated, so the caller's data
    stays intact.

    Args:
        memberSet: iterable (normally a set) of member names.
        nameDict: mapping of cluster ID -> name used for the reverse lookup.

    Returns:
        A list with one resolved ID per member. For a set the order is the
        set's iteration order, i.e. arbitrary, as it was before.
    """
    return [findClusterID(nameDict, member) for member in memberSet]

# z[0] = g1,  z[1] = g2,  z[2] = height,  z[3] = number of members
# Create structure Z

def createStructZ(intermediate_dict, nameDict):
    """Build the ``Z`` array: one row of four numbers per intermediate cluster.

    Entries of ``intermediate_dict`` are visited in ascending key order. Each
    value is read as ``(score, (clusterName, ...), memberSet)``. The row
    written for an entry is ``[member1, member2, height, memberCount]``:

    * ``member1`` / ``member2``: the first two IDs returned by
      ``getMemberList`` with their first character dropped (e.g. ``'G31'``
      becomes ``'31'``); numpy converts them to floats on assignment.
    * ``height``: ``1 - score``.
    * ``memberCount``: number of IDs returned by ``getMemberList``.

    The cluster name and its resolved members are printed for every entry.

    Args:
        intermediate_dict: mapping of sortable keys to the tuples described above.
        nameDict: mapping forwarded to ``getMemberList``.

    Returns:
        A float ``numpy`` array of shape ``(len(intermediate_dict), 4)``.
    """
    import numpy as np

    Z = np.zeros((len(intermediate_dict), 4))
    orderedEntries = sorted(intermediate_dict.items(), key=lambda pair: pair[0])

    for rowIndex, (_, record) in enumerate(orderedEntries):
        # cluster distance is the complement of the similarity score
        height = 1 - record[0]
        clusterName = record[1][0]
        resolvedMembers = getMemberList(record[2], nameDict)
        print(clusterName, " : ", resolvedMembers)

        # strip the leading letter so only the numeric part of the ID remains
        firstID = resolvedMembers[0][1:]
        secondID = resolvedMembers[1][1:]
        Z[rowIndex] = [firstID, secondID, height, len(resolvedMembers)]

    return Z

def createLabelList(nameDict):
    """Return the names in ``nameDict`` ordered by the number in their key.

    Keys are sorted by the integer that follows their first character
    (i.e., ``31`` in ``'G31'``), then the matching values are collected.

    Args:
        nameDict: mapping of cluster ID (one leading character + digits) -> name.

    Returns:
        A list of the dict's values in ascending numeric key order.
    """
    orderedIDs = sorted(nameDict, key=lambda clusterID: int(clusterID[1:]))
    return [nameDict[clusterID] for clusterID in orderedIDs]

# do clustering and output the result pickle files (<tag>_intermediate.pickle, <tag>_initialDict.pickle, <tag>_roundInfos.pickle and, when there is a residual, <tag>_residual.pickle)
% run RasMMA.ipynb
import os

def startClustering(data_directory, tag, outputPath, thresholdValue=None, pickleDir=None):
    """Run RasMMA clustering on one data directory and pickle the results.

    Calls ``do_RasMMA_clustering(data_directory, tag, outputPath, thresholdValue)``
    and stores its four return values as
    ``<pickleDir><tag>_intermediate.pickle``, ``<tag>_initialDict.pickle``,
    ``<tag>_roundInfos.pickle`` and, only when the residual is not ``None``,
    ``<tag>_residual.pickle``.

    Args:
        data_directory: directory holding the input data; when it is empty the
            function prints "Data Empty" and returns without creating anything.
        tag: prefix used for every pickle file name.
        outputPath: output directory, created if missing. Path strings are
            concatenated directly, so it should end with a path separator.
        thresholdValue: forwarded unchanged to ``do_RasMMA_clustering``.
        pickleDir: directory for the pickle files (should end with a path
            separator). When omitted, a notebook-level ``pickleDir`` global is
            used if one is defined (the function's previous, implicit
            behaviour); otherwise ``outputPath + "pickle/"``.

    Returns:
        None.
    """
    # Imported locally so the function does not rely on another cell or
    # notebook having put ``pickle`` into the global namespace.
    import pickle

    if not os.listdir(data_directory):
        print("Data Empty")
        return

    if pickleDir is None:
        # Backward compatibility: earlier versions read ``pickleDir`` from the
        # notebook globals. Honour it when present, otherwise derive the path.
        pickleDir = globals().get("pickleDir")
        if pickleDir is None:
            pickleDir = outputPath + "pickle/"

    # Create directories if they do not exist (exist_ok avoids the
    # check-then-create race of isdir() followed by makedirs()).
    os.makedirs(outputPath, exist_ok=True)
    os.makedirs(pickleDir, exist_ok=True)

    intermediatePool, initialDict, roundInfos, residual = do_RasMMA_clustering(data_directory, tag, outputPath, thresholdValue)

    # file-name suffix -> object to persist
    artifacts = [
        ('_intermediate.pickle', intermediatePool),  # intermediate pool
        ('_initialDict.pickle', initialDict),        # initial names dict
        ('_roundInfos.pickle', roundInfos),          # round information dict
    ]
    if residual is not None:
        artifacts.append(('_residual.pickle', residual))  # residual clusters

    for suffix, payload in artifacts:
        with open(pickleDir + tag + suffix, 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
def output_reporting(pickleDir, tag, outputPath):
    """Turn the pickled clustering results for ``tag`` into report files.

    Reads ``<tag>_intermediate.pickle``, ``<tag>_initialDict.pickle`` and
    ``<tag>_roundInfos.pickle`` from ``pickleDir`` and writes:

    * ``<pickleDir><tag>_descendant.pickle``: cluster name -> set of
      descendants. Members whose name starts with ``"G"`` are expanded into
      the descendants recorded for that cluster.
    * ``<outputPath><tag>_GroupInfo.csv``: one row per initial cluster and
      one row per intermediate cluster.
    * ``<outputPath><tag>_Descendants.csv``: descendant count and set per cluster.
    * ``<outputPath><tag>_Motifs.csv``: one row per API of every common motif.

    When ``<pickleDir><tag>_residual.pickle`` exists, a "Residual Clusters"
    section is appended to ``GroupInfo.csv``. The clustering step only writes
    that file when there is a residual, so a missing file is treated as
    "no residual" instead of raising ``FileNotFoundError``.

    Each intermediate value is read as ``(score, (clusterName, motifs), memberSet)``.
    Each initial value is read as ``(name, length)``.
    Each residual value is read as ``((clusterName, motifs), members)``.

    Args:
        pickleDir: directory holding the pickle files; concatenated directly
            with the file name, so it should end with a path separator.
        tag: prefix of every input and output file name.
        outputPath: existing directory for the CSV files; should end with a
            path separator.

    Returns:
        None.
    """
    import csv
    import os
    import pickle

    def loadPickle(suffix):
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only feed it pickles produced by this pipeline.
        with open(pickleDir + tag + suffix, 'rb') as handle:
            return pickle.load(handle)

    def makeWriter(fileHandle):
        # every report uses the same CSV dialect
        return csv.writer(fileHandle, delimiter=',',
                          quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # calculate motif lengths of all common motifs
    def getMotifsLengthList(motifs):
        return [len(motif) for motif in motifs]

    # first round whose info contains clusterName, or -1 when none does
    def findGeneratedRoundNumber(clusterName, roundInfosDict):
        for key, value in roundInfosDict.items():
            if clusterName in value:
                return key
        return -1

    # numeric part of a cluster name (i.e., 31 for 'G31'), used as sort key
    def clusterNumber(clusterName):
        return int(clusterName[1:])

    # read the results from pickle files
    intermediate = loadPickle('_intermediate.pickle')
    initialDict = loadPickle('_initialDict.pickle')
    roundInfos = loadPickle('_roundInfos.pickle')

    descendant_dict = dict()
    groupInfo_list = list()
    groupMotif_dict = dict()

    # Visit clusters in ascending key order; expanding a "G" member below
    # requires that member to have been visited already.
    for _, value in sorted(intermediate.items(), key=lambda x: x[0]):
        score = value[0]
        clusterName = value[1][0]
        motifs = value[1][1]
        memberSet = value[2]

        motifsLens = getMotifsLengthList(motifs)  # list of numbers
        totalMotifLen = sum(motifsLens)
        motifsCount = len(motifs)

        descendants = set()
        for member in memberSet:
            if member[0] == "G":
                # member is itself a cluster: inherit its descendants
                descendants.update(descendant_dict[member])
            else:
                descendants.add(member)
        descendant_dict[clusterName] = descendants

        groupMotif_dict[clusterName] = motifs
        roundNumber = findGeneratedRoundNumber(clusterName, roundInfos)
        groupInfo_list.append((roundNumber, clusterName, score, memberSet, motifsCount, motifsLens, totalMotifLen))

    with open(pickleDir + tag + "_descendant.pickle", 'wb') as f:
        pickle.dump(descendant_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

    # write file "GroupInfo.csv" :  round, clusterName, score, members, motifCount, common motifs length list, total length
    with open(outputPath + tag + "_GroupInfo.csv", 'w', newline='') as infoFile:
        spamwriter = makeWriter(infoFile)
        spamwriter.writerow(["Round", "ClusterName", "SimilarityScore", "Members", "MotifsCount", "Motifs_Length", "Total_MotifLength"])

        # write initial cluster informations (i.e., hooklogs)
        for key in sorted(initialDict.keys(), key=clusterNumber):
            # something like this: (0, "G1", "N/A", "abc", 1, 109, 109)
            name, length = initialDict[key][0], initialDict[key][1]
            spamwriter.writerow((0, key, "N/A", name, 1, length, length))

        # write cluster informations
        for group in groupInfo_list:
            spamwriter.writerow(group)

    with open(outputPath + tag + "_Descendants.csv", "w", newline='') as descFile:
        spamwriter = makeWriter(descFile)
        spamwriter.writerow(["ClusterName", "Descendant Counts", "Descendants"])
        for key in sorted(descendant_dict.keys(), key=clusterNumber):
            spamwriter.writerow((key, len(descendant_dict[key]), descendant_dict[key]))

    # write file "Motifs.csv" :  clusterName, MotifNumber, MotifLength, apis
    with open(outputPath + tag + "_Motifs.csv", 'w', newline='', encoding='utf-8') as motifFile:
        spamwriter = makeWriter(motifFile)
        spamwriter.writerow(["ClusterName", "MotifIndex", "MotifLength", "Common Motif APIs"])

        for key in sorted(groupMotif_dict.keys(), key=clusterNumber):
            for motifIdx, motif in enumerate(groupMotif_dict[key]):
                motifLen = len(motif)
                for apiIdx, api in enumerate(motif):
                    if apiIdx == 0:
                        # only the first API row of a motif carries the motif's identity
                        spamwriter.writerow((key, motifIdx, motifLen, api))
                    else:
                        spamwriter.writerow(("", "", "", api))

    # output residual information of SBBGCA (only if the clustering step stored one)
    residualPath = pickleDir + tag + '_residual.pickle'
    if not os.path.isfile(residualPath):
        return
    residual = loadPickle('_residual.pickle')

    with open(outputPath + tag + "_GroupInfo.csv", 'a', newline='') as expandGroupInfo:
        spamwriter = makeWriter(expandGroupInfo)

        spamwriter.writerow([])  # blank separator line
        spamwriter.writerow(("Residual Clusters:", "", ""))
        spamwriter.writerow(["ClusterName", "Members", "MotifLength"])

        for key, value in residual.items():
            clusterName = value[0][0]
            motifLens = getMotifsLengthList(value[0][1])
            members = value[1]
            if len(members) == 0:
                members = "N/A"
            spamwriter.writerow((clusterName, members, motifLens))

In [9]:
# basic global input variables

manualThresholdNumber = 0.7 # defined the threshold of merge score

base_dir = "11939data/myMismatchLonerTest/"
output_base = "output/myMismatchLonerTest/"

skipFam = {''} # family names to skip explicitly

# os.listdir returns entries in arbitrary order; sort for a reproducible processing order
for familyName in sorted(os.listdir(base_dir)):
    tag = familyName + "_" + str(manualThresholdNumber) # used for pickle name
    fam_data_dir = base_dir + familyName + '/'

    # Skip families excluded by hand or already processed with this threshold.
    # os.path.exists is used instead of listing output_base so that a fresh run
    # (output_base not created yet) does not fail with FileNotFoundError.
    if familyName in skipFam or os.path.exists(output_base + tag):
        continue
    # Skip stray non-directory entries in base_dir; startClustering lists the directory.
    if not os.path.isdir(fam_data_dir):
        continue

    outputPath = output_base + tag + "/"
    # startClustering may read this name from the notebook globals, so assign it before the call
    pickleDir = outputPath + "pickle/"
    print("=== Processing Family - ", familyName , " ===")
    startClustering(fam_data_dir, tag, outputPath, manualThresholdNumber)
    output_reporting(pickleDir, tag, outputPath)
    print("=== Family - ", familyName , " Finished ===")

=== Processing Family -  berbew  ===
-- Finish Initializing --
-- Start Clustering --
Threshold set = 0.7
Round:  1
ScoreList Length in method :  210
generatedSeqNum now:  27
Round:  2
ScoreList Length in method :  105
generatedSeqNum now:  29
Round:  3
ScoreList Length in method :  66
generatedSeqNum now:  30
Round:  4
ScoreList Length in method :  55
generatedSeqNum now:  31
Round:  5
ScoreList Length in method :  45
generatedSeqNum now:  31
-- Finish Clustering --
-- Clean Temp Pickle Files --
=== Family -  berbew  Finished ===


### Cross-family clustering (a.k.a. BegC)

In [1]:
import os, pickle
%run CollectForestInfo.ipynb

In [4]:
base_dir = 'output/top3_party_0622/'

tag = '28fam_cross'

outputPath = 'output/begC/'+tag+'/'

# A tree becomes a cross-family candidate only when BOTH values are strictly exceeded.
MEMBER_COUNT_FLOOR = 2     # tree must have more members than this
REP_SEQ_LENGTH_FLOOR = 10  # representative sequence must be longer than this

# os.listdir returns entries in arbitrary order; sorting makes the candidate
# numbering (G0, G1, ...) reproducible between runs.
famNames = sorted(os.listdir(base_dir))

# candidate index -> ((clusterName, [repSeq]), {"<family>_<treeName>"})
cand_clusters = dict()
candCounter = 0
for fName in famNames:
    pickleDir = base_dir+fName+'/pickle/'
    interPkl = pickleDir + fName + "_intermediate.pickle"
    residualPkl = pickleDir + fName + "_residual.pickle"

    forestInfo = CollectForestInfo(interPkl,
                           residualPkl,
                           True) # one pickle is a forest

    for treeName in forestInfo.getTreeRootNameList():
        labelName = fName+'_'+treeName
        memberCount = len(forestInfo.getTreeMembers(treeName))
        repSeq = forestInfo.getRepAPISeq(treeName)
        if memberCount > MEMBER_COUNT_FLOOR and len(repSeq) > REP_SEQ_LENGTH_FLOOR:
            clusterName = 'G'+str(candCounter)
            R = (clusterName, [repSeq])
            # alternative representation kept for reference:
            #     R = (clusterName, [(repSeq, 0, len(repSeq)-1)] )
            clusterMembers = set()
            clusterMembers.add(labelName)
            cand_clusters[candCounter] = (R, clusterMembers)
            candCounter += 1

In [None]:
intermediatePool, initialDict, roundInfos, residual = clusterInitializedReps(cand_clusters, tag, outputPath, 0.8)

pickleDir = outputPath + 'pickle/'
if not os.path.isdir(pickleDir):
    os.makedirs(pickleDir)

# (file-name suffix, object) pairs, written in this order
pickleJobs = [
    ('_intermediate.pickle', intermediatePool),  # intermediate pool
    ('_initialDict.pickle', initialDict),        # initial names dict
    ('_roundInfos.pickle', roundInfos),          # round information dict
]
# the residual is only stored when the clustering produced one
if residual is not None:
    pickleJobs.append(('_residual.pickle', residual))

for pickleSuffix, pickleObject in pickleJobs:
    with open(pickleDir + tag + pickleSuffix, 'wb') as handle:
        pickle.dump(pickleObject, handle, protocol=pickle.HIGHEST_PROTOCOL)