## Can I generate a graph structure from a Foldseek similarity search?

In [3]:
import os
import shlex
import shutil
import subprocess

In [2]:
# Relative path to the foldseek binary, assembled portably with os.path.join.
EXEC_PATH = os.path.join("..", "exec", "foldseek", "bin", "foldseek")
# Echo the resolved path so the notebook output shows which binary is used.
print(EXEC_PATH)

../exec/foldseek/bin/foldseek


In [4]:
class Executable:
    """Callable wrapper around a command-line executable.

    Calling the instance runs the executable through the shell with the
    given argument string and returns its decoded standard output.
    """

    def __init__(self, path):
        """Remember *path*, raising ``ImportError`` if it does not exist."""
        if not os.path.exists(path):
            raise ImportError(f"Could not find {path}")
        self.path = path

    def __call__(self, cmd=""):
        """Run the executable with the argument string *cmd*.

        Args:
            cmd: Arguments appended verbatim to the command line; they are
                interpreted by the shell.

        Returns:
            The decoded standard output of the process.

        Raises:
            subprocess.CalledProcessError: If the process exits non-zero.
        """
        # Quote the executable path so the shell treats it as a single word
        # even when it contains spaces or other shell metacharacters. Paths
        # made only of safe characters are left unchanged by shlex.quote.
        command = f"{shlex.quote(self.path)} {cmd}"
        return subprocess.check_output(command, shell=True).decode()

    def __repr__(self):
        return f"Executable(path='{self.path}')"

class Foldseek(Executable):
    """Wrapper around the foldseek binary located at ``EXEC_PATH``.

    Search results are written to ``./temp/output`` and parsed back into
    Python lists. ``./temp`` is also passed to ``easy-search`` as the
    argument following the output file.
    """

    def __init__(self):
        super().__init__(EXEC_PATH)
        self.temp_dir = os.path.join(".", "temp")
        self.out_file = os.path.join(self.temp_dir, "output")

    def _delete_temp(self):
        """Remove the temp directory, including the output file inside it."""
        shutil.rmtree(self.temp_dir)

    def _parse_out_file(self):
        """Parse the tab-separated output file into a list of rows.

        Each row is a list of columns. Any column that ``float()`` accepts is
        converted to a float; every other column stays a string.
        NOTE: this also converts identifiers that happen to look numeric.
        """
        parsed_lines = []
        with open(self.out_file, "r") as f:
            # Stream the file line by line instead of loading it all at once.
            for line in f:
                parsed_line = []
                for column in line.strip("\n").split("\t"):
                    try:
                        column = float(column)
                    except ValueError:
                        pass
                    parsed_line.append(column)
                parsed_lines.append(parsed_line)
        return parsed_lines

    def search(self, query, target, flags="--format-output query,target,prob", clean=True):
        """Run ``foldseek easy-search`` and return the parsed hits.

        Args:
            query: Query path, interpolated into the command line as-is.
            target: Target path, interpolated into the command line as-is.
            flags: Extra command-line flags appended verbatim.
            clean: When true, delete the temp directory afterwards, whether
                the search succeeded or failed.

        Returns:
            A ``(rows, stdout)`` tuple: the rows from ``_parse_out_file`` and
            the decoded standard output of the foldseek process.
        """
        try:
            stdout = self(f"easy-search {query} {target} {self.out_file} {self.temp_dir} {flags}")
            search_out = self._parse_out_file()
        finally:
            # Clean up on failure as well, so a crashed search does not leave
            # stale files behind. The isdir guard keeps a missing directory
            # from masking the original error.
            if clean and os.path.isdir(self.temp_dir):
                self._delete_temp()

        return search_out, stdout

In [7]:
# Build the wrapper; the constructor raises ImportError if EXEC_PATH does not exist.
fs = Foldseek()
# Bare expression so the notebook displays the instance's repr.
fs

Executable(path='../exec/foldseek/bin/foldseek')

In [27]:
# Dataset directory (machine-specific absolute path); adjust for your environment.
venome = "/Users/donnybertucci/datasets/venome"

In [12]:
# All-vs-all search: the same directory is both query and target.
# The result is a (rows, stdout) tuple.
res = fs.search(query=venome, target=venome)

In [15]:
# Split the result into the parsed rows and the raw foldseek stdout.
sim, stdout = res
# Display the rows; with the default flags each is [query, target, prob].
sim

[['Gh_comp10207_c0_seq2', 'Gh_comp10207_c0_seq2', 1.0],
 ['Gh_comp10207_c0_seq2', 'Gh_comp1623_c0_seq1', 1.0],
 ['Gh_comp10207_c0_seq2', 'Gh_comp1699_c0_seq1', 1.0],
 ['Gh_comp10207_c0_seq2', 'Lh14_comp513_c0_seq1', 1.0],
 ['Gh_comp10207_c0_seq2', 'Lh14_comp138_c0_seq1', 0.008],
 ['Gh_comp10207_c0_seq2', 'Lb17_comp9947_c0_seq1', 0.008],
 ['Gh_comp10207_c0_seq2', 'Gh_comp274_c0_seq1', 0.007],
 ['Gh_comp10207_c0_seq2', 'Gh_comp448_c0_seq1', 0.007],
 ['Gh_comp10207_c0_seq2', 'Gh_comp2524_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Gh_comp59_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Lb17_comp1218_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Lh14_comp42_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Lb17_comp4291_c0_seq2', 0.0],
 ['Gh_comp10207_c0_seq2', 'Gh_comp1373_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Lb17_comp12_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Lh14_comp54_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Lh14_comp1266_c0_seq1', 0.0],
 ['Gh_comp10207_c0_seq2', 'Gh_comp486_c0_seq1', 0.

In [30]:
import json

def export(infile, results, outfile):
    """Write a JSON graph description to *outfile*.

    The JSON object has two keys: ``"nodes"``, the entries returned by
    ``os.listdir(infile)``, and ``"edges"``, which is *results* unchanged.
    """
    # NOTE(review): nodes are raw directory entry names; confirm they match
    # the identifiers used inside *results*.
    node_names = os.listdir(infile)
    graph = {"nodes": node_names, "edges": results}
    serialized = json.dumps(graph)
    with open(outfile, "w") as handle:
        handle.write(serialized)

# Write the dataset's directory listing and the search rows to test.json.
export(venome, sim, "test.json")