## RasMMA usage
Usage of RasMMA.ipynb

In [1]:
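# Run the clustering and write the result pickle files (<tag>_intermediate.pickle, <tag>_initialDict.pickle, <tag>_roundInfos.pickle and, when there is residual data, <tag>_residual.pickle).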
% run RasMMA.ipynb
import os

def startClustering(data_directory, tag, outputPath, thresholdValue=None):
    """Run the RasMMA clustering and pickle its results.

    Parameters
    ----------
    data_directory : str
        Directory that is listed to decide whether there is any input;
        forwarded unchanged to ``do_RasMMA_clustering``.
    tag : str
        Prefix of every pickle file name written by this function.
    outputPath : str
        Output directory (created when missing). It is forwarded unchanged
        to ``do_RasMMA_clustering``; the pickles are written to its
        ``pickle`` sub-directory. A trailing path separator is optional.
    thresholdValue : optional
        Forwarded unchanged to ``do_RasMMA_clustering``.

    Returns
    -------
    None
        When ``data_directory`` is empty, "Data Empty" is printed and
        nothing is created.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` / ``os.makedirs`` / ``open``
        (e.g. ``data_directory`` does not exist).
    """
    # Local import: this cell only imports ``os``; ``pickle`` was previously
    # available only as a side effect of running another notebook.
    import pickle

    if not os.listdir(data_directory):
        print("Data Empty")
        return

    # Create the output directories if they do not exist yet.
    # os.path.join keeps the layout correct with or without a trailing slash.
    os.makedirs(outputPath, exist_ok=True)
    pickleDir = os.path.join(outputPath, "pickle")
    os.makedirs(pickleDir, exist_ok=True)

    # Hand over to the RasMMA algorithm logic.
    intermediatePool, initialDict, roundInfos, residual = do_RasMMA_clustering(
        data_directory, tag, outputPath, thresholdValue)

    def _picklePath(suffix):
        return os.path.join(pickleDir, tag + suffix)

    # Results that are always written: intermediate pool, initial dict, round infos.
    alwaysWritten = (
        ('_intermediate.pickle', intermediatePool),
        ('_initialDict.pickle', initialDict),
        ('_roundInfos.pickle', roundInfos),
    )
    for suffix, payload in alwaysWritten:
        with open(_picklePath(suffix), 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

    residualPath = _picklePath('_residual.pickle')
    if residual is not None:
        # Residual information is only written when the clustering returned some.
        with open(residualPath, 'wb') as handle:
            pickle.dump(residual, handle, protocol=pickle.HIGHEST_PROTOCOL)
    elif os.path.isfile(residualPath):
        # Drop a residual pickle left by an earlier run with the same tag so
        # that later cells cannot mistake it for a result of this run.
        os.remove(residualPath)

## Main Cell
Usage example: run the clustering and get its results.

In [2]:
def main(data_directory, tag, outputPath, manualThresholdNumber):
    """Run ``startClustering`` and print a timestamp before and after it.

    All four arguments are forwarded positionally to ``startClustering``;
    ``manualThresholdNumber`` becomes its threshold value. Returns None.
    """
    from datetime import datetime

    stampFormat = "%Y-%b-%d %H:%M"

    print(datetime.now().strftime(stampFormat))
    startClustering(data_directory, tag, outputPath, manualThresholdNumber)
    print(datetime.now().strftime(stampFormat))


# Run configuration for the clustering example.
manualThresholdNumber = 0.8  # threshold of the merge score
familyName = "berbew"
data_directory = "data/top3_train/" + familyName + "/"  # data trace directory
# Derive the tag from the threshold instead of repeating the literal, so the
# output names cannot drift from the threshold that was actually used.
tag = "{}_{}".format(familyName, manualThresholdNumber)  # used for naming pickle
outputPath = "output/RasMMA_forest/" + tag + "/"
pickleDir = outputPath + "pickle/"

main(data_directory, tag, outputPath, manualThresholdNumber)

2018-Jul-31 16:30
-- Finish Initializing --
-- Start Clustering --
Threshold set = 0.8
Round:  1
ScoreList Length in method :  153
generatedSeqNum now:  24
Round:  2
ScoreList Length in method :  66
generatedSeqNum now:  26
Round:  3
ScoreList Length in method :  36
generatedSeqNum now:  27
Round:  4
ScoreList Length in method :  28
generatedSeqNum now:  28
Round:  5
ScoreList Length in method :  21
generatedSeqNum now:  28
-- Finish Clustering --
-- Clean Temp Pickle Files --
2018-Jul-31 16:30


### The cells below write the CSV files: _GroupInfo, _Descendants, _Motifs
The merge score of each cluster can be found in _GroupInfo.csv.

In [3]:
import os
# Select the clustering run whose pickles are post-processed below.
familyName = "berbew"
tag = "{}_0.8".format(familyName)  # used for pickle name
outputPath = "output/RasMMA_forest/{}/".format(tag)
pickleDir = "{}pickle/".format(outputPath)

In [4]:
import pickle

# Load the clustering results for `tag` from the pickle files in `pickleDir`.
def _readResultPickle(suffix):
    """Unpickle and return the content of `pickleDir + tag + suffix`."""
    with open(pickleDir + tag + suffix, 'rb') as handle:
        return pickle.load(handle)


intermediate = _readResultPickle('_intermediate.pickle')
initialDict = _readResultPickle('_initialDict.pickle')
roundInfos = _readResultPickle('_roundInfos.pickle')
    
def getMotifsLengthList(motifs):
    """Return the length of every motif in `motifs`, in the same order."""
    return [len(motif) for motif in motifs]

def findGeneratedRoundNumber(clusterName, roundInfosDict):
    """Return the first key of `roundInfosDict` whose value contains `clusterName`.

    Keys are checked in the dict's iteration order; -1 is returned when no
    value contains `clusterName`.
    """
    matchingKeys = (roundKey for roundKey, roundValue in roundInfosDict.items()
                    if clusterName in roundValue)
    return next(matchingKeys, -1)

import csv

# Per merged cluster (visited in sorted key order of `intermediate`) collect:
#   descendant_dict : clusterName -> set of descendants
#   groupMotif_dict : clusterName -> motifs
#   groupInfo_list  : one tuple per cluster, later written to the GroupInfo CSV
descendant_dict = {}
groupInfo_list = []
groupMotif_dict = {}

intermediate_list = sorted(intermediate.items(), key=lambda pair: pair[0])
for clusterKey, record in intermediate_list:
    score = record[0]
    clusterName = record[1][0]
    motifs = record[1][1]
    memberSet = record[2]

    # one length per common motif
    motifsLens = getMotifsLengthList(motifs)

    descendants = set()
    for member in memberSet:
        if member[0] != "G":
            descendants.add(member)
        else:
            # a member whose name starts with "G" is looked up in
            # descendant_dict and contributes the descendants stored there
            descendants.update(descendant_dict[member])

    descendant_dict[clusterName] = descendants
    groupMotif_dict[clusterName] = motifs

    groupInfo_list.append((
        findGeneratedRoundNumber(clusterName, roundInfos),
        clusterName,
        score,
        memberSet,
        len(motifs),
        motifsLens,
        sum(motifsLens),
    ))

# keep the descendant table next to the other result pickles
with open(pickleDir + tag + "_descendant.pickle", 'wb') as descendantPickle:
    pickle.dump(descendant_dict, descendantPickle, protocol=pickle.HIGHEST_PROTOCOL)

# Write "<tag>_GroupInfo.csv": the header, one row per initial cluster, then
# one row per merged cluster collected in groupInfo_list.
with open(outputPath + tag + "_GroupInfo.csv", 'w', newline='') as infoFile:
    infoWriter = csv.writer(infoFile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    infoWriter.writerow(["Round", "ClusterName", "SimilarityScore", "Members",
                         "MotifsCount", "Motifs_Length", "Total_MotifLength"])

    # Initial clusters, ordered by the number that follows the first character
    # of their name; they are written as round 0 with score "N/A" and count 1.
    for initialName in sorted(initialDict, key=lambda name: int(name[1:])):
        initialEntry = initialDict[initialName]
        infoWriter.writerow((0, initialName, "N/A", initialEntry[0], 1,
                             initialEntry[1], initialEntry[1]))

    # merged clusters
    infoWriter.writerows(groupInfo_list)
        
# Write "<tag>_Descendants.csv": clusterName, number of descendants, descendants.
with open(outputPath + tag + "_Descendants.csv", "w", newline='') as descFile:
    descWriter = csv.writer(descFile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    descWriter.writerow(["ClusterName", "Descendant Counts", "Descendants"])

    # rows are ordered by the number that follows the first character of the name
    orderedNames = sorted(descendant_dict, key=lambda name: int(name[1:]))
    descWriter.writerows(
        (name, len(descendant_dict[name]), descendant_dict[name])
        for name in orderedNames
    )
        
# Write "<tag>_Motifs.csv": one row per API of every common motif. Only the
# first API row of a motif carries clusterName, motif index and motif length;
# the remaining rows of that motif leave those three columns empty.
with open(outputPath + tag + "_Motifs.csv", 'w', newline='', encoding='utf-8') as motifFile:
    motifWriter = csv.writer(motifFile, delimiter=',',
                             quotechar='"', quoting=csv.QUOTE_MINIMAL)
    motifWriter.writerow(["ClusterName", "MotifIndex", "MotifLength", "Common Motif APIs"])

    for groupName in sorted(groupMotif_dict, key=lambda name: int(name[1:])):
        for motifIdx, motif in enumerate(groupMotif_dict[groupName]):
            motifLen = len(motif)
            for apiPos, api in enumerate(motif):
                if apiPos == 0:
                    motifWriter.writerow((groupName, motifIdx, motifLen, api))
                else:
                    motifWriter.writerow(("", "", "", api))
            
# Output residual information of SBBGCA: append a "Residual Clusters" section
# to "<tag>_GroupInfo.csv".
#
# The clustering step only writes the residual pickle when it produced residual
# data, so the file may be absent; in that case the section is skipped instead
# of failing with FileNotFoundError. (`os` is imported in the previous cell,
# which also defines pickleDir / outputPath / tag.)
residualPicklePath = pickleDir + tag + '_residual.pickle'

if os.path.isfile(residualPicklePath):
    with open(residualPicklePath, 'rb') as handle:
        residual = pickle.load(handle)

    with open(outputPath + tag + "_GroupInfo.csv", 'a', newline='') as expandGroupInfo:
        spamwriter = csv.writer(expandGroupInfo, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = ["ClusterName", "Members", "MotifLength"]

        spamwriter.writerow([])  # blank separator row
        spamwriter.writerow(("Residual Clusters:", "", ""))
        spamwriter.writerow(header)

        for key, value in residual.items():
            clusterName = value[0][0]
            motifsList = value[0][1]
            motifLens = getMotifsLengthList(motifsList)
            members = value[1]
            # a residual cluster without members is reported as "N/A"
            if len(members) == 0:
                row = (clusterName, "N/A", motifLens)
            else:
                row = (clusterName, members, motifLens)

            spamwriter.writerow(row)
else:
    residual = None
    print("No residual pickle at " + residualPicklePath + " - residual section skipped")

### Show merge pairs in RasMMA to draw behavior forest
To draw a behavior forest for visualization, use the cells below; some manual steps are still needed.

In [None]:
def findClusterID(nameDict, hooklogName):
    """Reverse lookup: return the first key of `nameDict` whose value equals `hooklogName`.

    Keys are checked in the dict's iteration order. When no value matches,
    `hooklogName` itself is returned unchanged.
    """
    matchingKeys = (clusterID for clusterID, name in nameDict.items()
                    if name == hooklogName)
    return next(matchingKeys, hooklogName)

def getInitialNameDict(initialDict):
    """Return a new dict mapping every key of `initialDict` to element 0 of its value."""
    return {key: value[0] for key, value in initialDict.items()}

# Row layout of Z: z[0] = g1, z[1] = g2, z[2] = height
def createStructZ(intermediate_dict, nameDict):
    """Build the Z array that is later passed to the dendrogram drawing.

    The array has ``len(intermediate_dict) + 1`` rows and 4 columns. Entries
    of `intermediate_dict` are visited in sorted key order and fill one row
    each; the last row is left as zeros. For every entry the member list is
    obtained from ``getMemberList`` and the row holds:

    * column 0 / 1 - the first / last member ID with its first character removed
    * column 2     - ``1 - score`` (the cluster distance)
    * column 3     - the number of members

    One line per cluster (name, score, members) is printed along the way.
    """
    import numpy as np

    Z = np.zeros((len(intermediate_dict) + 1, 4))

    for rowIdx, clusterKey in enumerate(sorted(intermediate_dict)):
        record = intermediate_dict[clusterKey]
        score = record[0]
        height = 1 - score  # cluster distance
        clusterName = record[1][0]
        memberList = getMemberList(record[2], nameDict)
        print(clusterName," : ", score," - ", memberList)

        firstMember = memberList[0][1:]
        lastMember = memberList[-1][1:]
        Z[rowIdx] = [firstMember, lastMember, height, len(memberList)]

    return Z

def getMemberList(memberSet, nameDict):
    """Convert `memberSet` to a list of cluster IDs.

    Every member is translated with ``findClusterID(nameDict, member)``, so a
    member that is not a value of `nameDict` is kept as is. The order of the
    result follows the iteration order of `memberSet`.

    `memberSet` is only read. It used to be emptied with ``pop()``, which
    destroyed the member sets of the dict the caller passed in and made a
    second call on the same data return empty lists.
    """
    return [findClusterID(nameDict, member) for member in memberSet]

def createLabelList(nameDict):
    """Return the values of `nameDict` ordered by the number in their key.

    The number is everything after the first character of the key
    (i.e., '31' in 'G31').
    """
    orderedKeys = sorted(nameDict, key=lambda clusterName: int(clusterName[1:]))
    return [nameDict[clusterName] for clusterName in orderedKeys]

In [None]:
# draw pics
def drawClusteringResults(picklePath, outputPath, tag, upgma_threshold):
    
    #     Dependencies
    import pickle
    import scipy
    import scipy.cluster.hierarchy as sch
    import matplotlib.pylab as plt
    %matplotlib inline
    
    # read the results from pickle files
    with open(picklePath + tag + '_intermediate.pickle', 'rb') as handle:
        intermediate = pickle.load(handle)
    with open(picklePath + tag + '_initialDict.pickle', 'rb') as handle:
        initialDict = pickle.load(handle)
    with open(picklePath + tag + '_roundInfos.pickle', 'rb') as handle:
        roundInfos = pickle.load(handle)
        
    initialNameDict = getInitialNameDict(initialDict)
    print("Original Names : ", initialNameDict)
    print("round informations", roundInfos)
    
    # It have to create the Z structure for drawing purpose.
    Z = createStructZ(intermediate, initialNameDict)
    
    label_list = createLabelList(initialNameDict) # create graph labels by nameDict
    (orig_x, orig_y) = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 10) #---input

    # P = sch.dendrogram(Z, color_threshold = upgma_threshold, orientation = 'right') # no label
    P = sch.dendrogram(Z, color_threshold = upgma_threshold, labels = label_list, orientation = 'right')

#     plt.axvline(x=upgma_threshold, linewidth=1, color='black', linestyle='--')
    locs, labels = plt.yticks()
    # plt.xticks(  np.arange(0,1.1,0.1)) #---Align axis-x 900(0, 0.35, 0.05) 909(0,0.6,0.1) 855(0, 0.9, 0.1)
    plt.setp(labels, fontsize = 14)
    plt.tight_layout()

    plt.rcParams['figure.figsize'] = (orig_x, orig_y)
    plt.savefig(outputPath+'SBBGCA_'+tag+'.pdf', dpi=300)

In [None]:
# Usage example: draw the hierarchy graph of one clustering result.
familyName = 'eggnog'
tag = "{}_0.8".format(familyName)  # used for pickle name
# (an alternative tag kept from the original cell: '27fam_cross')
outputPath = "output/RasMMA-test/{}/".format(tag)
pickleDir = "{}pickle/".format(outputPath)
drawClusteringResults(pickleDir, outputPath, tag, upgma_threshold=0.01)