#!/usr/bin/env python3
# -*- coding: utf-8 -*-
### fuse.py - Vidjil utility to parse and regroup lists of clones from different timepoints or origins
# This file is part of Vidjil <http://www.vidjil.org>,
# High-throughput Analysis of V(D)J Immune Repertoire.
# Copyright (C) 2011-2017 by Bonsai bioinformatics
# at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
# Contributors:
# Marc Duez <marc.duez@vidjil.org>
# Mathieu Giraud <mathieu.giraud@vidjil.org>
# The Vidjil Team <contact@vidjil.org>
#
# "Vidjil" is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# "Vidjil" is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
from __future__ import print_function
from __future__ import division
import check_python_version
import sys
import json
import argparse
import os.path
import os
import datetime
import subprocess
import tempfile
import math
import gzip
from operator import itemgetter, le
from utils import *
from defs import *
from collections import defaultdict
try:
    from shlex import quote   # Python 3
except ImportError:
    from pipes import quote   # Python 2 fallback ('pipes' was removed in Python 3.13)
FUSE_VERSION = "vidjil fuse"
TOOL_SIMILARITY = "../algo/tools/similarity"
SIMILARITY_LIMIT = 100
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL']
AVAILABLE_AXES = [
"top", "germline", "name",
"seg5", "seg4", "seg4a", "seg4b", "seg3",
"evalue", "lenSeqAverage", "lenCDR3", "lenJunction",
"seg5_delRight", "seg3_delLeft", "seg4_delRight", "seg4_delLeft",
"seg5_stop", "seg3_start", "seg4_stop", "seg4_start", "cdr3_stop", "cdr3_start",
"junction_stop", "junction_start", "productive",
"insert_53", "insert_54", "insert_43",
"evalue", "evalue_left", "evalue_right",
]
GET_UNKNOW_AXIS = "unknow_axis"
UNKNOWN_VALUE = ""
def generic_open(path, mode='r', verbose=False):
'''
    Open a file.
    If the mode contains 'r', transparently open gzip-compressed files when possible.
'''
if verbose:
print(" <==", path, "\t", end=' ')
if 'r' in mode:
try:
f = gzip.open(path, mode)
f.read(1)
f.seek(0)
return f
except IOError:
pass
return open(path, mode)
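
# Usage sketch (hypothetical path): thanks to the gzip fallback above, the same
# call reads both plain and compressed files:
#
#     with generic_open("sample.vidjil.gz") as f:
#         data = json.load(f)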
class Window:
# Should be renamed "Clone"
    '''storage class for sequence information,
    with some functions to group sequence information
>>> str(w1)
'<window : [5] 3 aaa>'
>>> str(w3)
'<window : [8] 4 aaa>'
>>> str(w1 + w3)
'<window : [5, 8] 3 aaa>'
    check that other information is conserved
>>> (w2 + w4).d["test"]
['', 'plop']
>>> w1.get_values("name")
'?'
>>> w1.get_values("top")
3
>>> w7 = Window(1)
>>> w7.d = seg_w7
>>> w7.d["seg"]["5"]["name"]
'TRAV17*01'
>>> w7.get_values("reads")
16
>>> w7.get_values("top")
26
>>> w7.get_values("germline")
'TRA'
>>> w7.get_values("name")
'TRAV17*01 0/CCGGGG/5 TRAJ40*01'
>>> w7.get_values("seg5")
'TRAV17*01'
>>> w7.get_values("seg4a")
'?'
>>> w7.get_values("seg4")
'?'
>>> w7.get_values("seg4b")
'?'
>>> w7.get_values("seg3")
'TRAJ40*01'
>>> w7.get_values("evalue")
'1.180765e-43'
>>> w7.get_values("evalue_left")
'1.296443e-47'
>>> w7.get_values("evalue_right")
'1.180635e-43'
>>> w7.get_values("seg5_delRight")
0
>>> w7.get_values("seg3_delLeft")
5
>>> w7.get_values("seg4_delRight")
'?'
>>> w7.get_values("seg4_delLeft")
'?'
>>> w7.get_values("seg5_stop")
105
>>> w7.get_values("seg3_start")
112
>>> w7.get_values("seg4_stop")
'?'
>>> w7.get_values("seg4_start")
'?'
>>> w7.get_values("cdr3_start")
96
>>> w7.get_values("cdr3_stop")
133
>>> w7.get_values("junction_start")
93
>>> w7.get_values("junction_stop")
136
>>> w7.get_values("productive")
True
>>> w7.get_values("unavailable_axis")
'unknow_axis'
>>> w7.get_values("lenSeqAverage")
168.0
>>> w7.get_values("lenSeqAverage", 0)
168.0
>>> w7.get_values("lenSeqAverage", 1)
169.0
'''
### init Window with the minimum data required
def __init__(self, size):
self.d={}
self.d["id"] = ""
self.d["top"] = sys.maxsize
self.d["reads"] = []
for i in range(size):
self.d["reads"].append(0)
###
def __iadd__(self, other):
### Not used now
"""Add other.reads to self.reads in-place, without extending lists"""
assert(len(self.d['reads']) == len(other.d['reads']))
self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
return self
def __add__(self, other):
"""Concat two windows, extending lists such as 'reads'"""
        # data handled separately, not concatenated as lists
        myList = ["seg", "top", "id", "sequence", "name", "stats", "germline", "mrd"]
obj = Window(1)
# 'id' and 'top' will be taken from 'topmost' clone
del obj.d["id"]
del obj.d["top"]
# Data of type 'list'
concatenate_with_padding(obj.d,
self.d, len(self.d["reads"]),
other.d, len(other.d["reads"]),
myList)
        # MRD data, as soon as one side carries some
        # TODO: Make this more generic to work with any MRD setup
zeroed = {"copy_number": [0],
"R2": [0],
"family": ["None"],
"norm_coeff": [0]}
if "mrd" in self.d or "mrd" in other.d:
if "mrd" in self.d:
first = self.d["mrd"]
else:
first = zeroed
for key in first.keys():
first[key] = first[key] * len(self.d["reads"])
if "mrd" in other.d:
second = other.d["mrd"]
else:
second = zeroed
for key in second.keys():
second[key] = second[key] * len(other.d["reads"])
obj.d["mrd"] = {}
concatenate_with_padding(obj.d["mrd"],
first, len(self.d["reads"]),
second, len(other.d["reads"]))
# All other data, including 'top'
# When there are conflicting keys, keep data from the 'topmost' clone
order = [other, self] if other.d["top"] < self.d["top"] else [self, other]
for source in order:
for key in source.d:
if key not in obj.d:
obj.d[key] = source.d[key]
# !! No name field if fuse with an empty clone
if "name" in self.d and "name" in other.d and self.d["name"] != other.d["name"]:
if obj.d["name"] == self.d["name"]:
name = other.d["name"]
else:
name = self.d["name"]
msg = "Merged clone has different V(D)J designations in some samples (pos %s): %s" % (len(self.d["reads"]), name)
obj.addWarning(code="W81", msg=msg, level="warn")
        # !! No 'seg'/'junction' fields if fused with an empty clone
# !! If imported from airr, junction is present but empty
if ("seg" in self.d and "junction" in self.d["seg"]) and ("seg" in other.d and "junction" in other.d["seg"])\
and ("productive" in self.d["seg"]["junction"] and "productive" in other.d["seg"]["junction"]) \
and self.d["seg"]["junction"]["productive"] != other.d["seg"]["junction"]["productive"]:
if obj.d["seg"]["junction"]["productive"] == self.d["seg"]["junction"]["productive"]:
junction = other.d["seg"]["junction"]["productive"]
else:
junction = self.d["seg"]["junction"]["productive"]
if junction:
junction = "productive"
else:
junction = "not productive"
# show number/position of the corresponding sample ?
msg = "Merged clone has different productivities in some samples (pos %s): %s" % (len(self.d["reads"]), junction)
obj.addWarning(code="W82", msg=msg, level="warn")
return obj
def addWarning(self, code, msg, level):
# init warn field if not already present
if not "warn" in self.d:
self.d["warn"] = []
self.d["warn"].append( {"code":code, "msg":msg, "level": level})
return
def get_nb_reads(self, cid, point=0):
return self[cid]["reads"][point]
def get_reads(self, t=-1):
if t== -1:
return self.d["reads"]
else:
return self.d["reads"][t]
def is_segmented(self):
if not "seg" in self.d.keys():
return False
return True
def get_axes_values(self, axes, timepoint):
"""Return the values of given axes for this clone at a gvien timepoint"""
values = []
for axis in axes:
values.append(self.get_values(axis, timepoint))
return values
def get_values(self, axis, timepoint=0):
"""Return the value of an axis for this clone at a given timepoint"""
axes = {
"id": ["id"],
"top": ["top"],
"germline": ["germline"],
"name": ["name"],
"reads": ["reads"],
"seg5": ["seg", "5", "name"],
"seg4": ["seg", "4", "name"],
"seg4a": ["seg", "4a", "name"],
"seg4b": ["seg", "4b", "name"],
"seg3": ["seg", "3", "name"],
"evalue": ["seg", "evalue","val"],
"evalue_left": ["seg", "evalue_left","val"],
"evalue_right": ["seg", "evalue_right","val"],
"seg5_delRight": ["seg","5","delRight"],
"seg3_delLeft": ["seg","3","delLeft"],
"seg4_delRight": ["seg","4","delRight"],
"seg4_delLeft": ["seg","4","delLeft"],
### Positions
"seg5_stop": ["seg","5","stop"],
"seg3_start": ["seg","3","start"],
"seg4_stop": ["seg","4","stop"],
"seg4_start": ["seg","4","start"],
"cdr3_stop": ["seg","cdr3","stop"],
"cdr3_start": ["seg","cdr3","start"],
"junction_stop": ["seg","junction","stop"],
"junction_start": ["seg","junction","start"],
"productive": ["seg","junction","productive"],
"lenSeqAverage" : ["_average_read_length"],
"warnings": ["warn"],
"sequence": ["sequence"]
}
        ### length axes, computed from two position axes and a modifier
axes_length = {
"insert_53": ["seg3_start", "seg5_stop", -1],
"insert_54": ["seg4_start", "seg5_stop", -1],
"insert_43": ["seg3_start", "seg4_stop", -1],
"lenCDR3": ["cdr3_stop", "cdr3_start", 0],
"lenJunction": ["junction_stop", "junction_start", 0]
}
axes_rounded = {
"lenSeqAverage" : 1.0
}
try:
if axis in axes.keys():
path = axes[axis]
depth = 0
value = self.d
if path[0] == "seg" and not self.is_segmented():
return "?"
while depth != len(path):
value = value[path[depth]]
depth += 1
if axis in axes_rounded.keys():
value = self.round_axis(value, axes_rounded[axis])
                if type(value) is list:
                    # in this case, the value is a list with one entry per timepoint
                    return value[timepoint]
                return value
            if axis in axes_length.keys():
                # length axes: difference between two position axes, plus a modifier
                (a, b, modifier) = axes_length[axis]
                return self.get_values(a, timepoint) - self.get_values(b, timepoint) + modifier
            return GET_UNKNOW_AXIS
except:
return "?"
def round_axis(self, value, round_set):
if isinstance(value, list):
for i in range(len(value)):
value[i] = self.round_axis(value[i], round_set)
else:
value = round(float(value) / round_set) * round_set
return value
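    # Example: round_axis(168.468, 1.0) returns 168.0, and
    # round_axis([3.2, 7.8], 2.0) returns [4.0, 8.0] -- each value is rounded
    # to the nearest multiple of round_set, lists being processed element-wise,
    # in place.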
def toCSV(self, cols= [], time=0, jlist = False):
""" Return a list of values of the clone added from a list of waited columns """
list_values = []
airr_to_axes = {
"locus": "germline",
"v_call": "seg5",
"d_call": "seg4",
"j_call": "seg3",
"v_alignment_end": "seg5_stop",
"d_alignment_start": "seg4_start",
"d_alignment_end": "seg4_stop",
"j_alignment_start": "seg3_start",
"productive": "productive",
"cdr3_start": "cdr3_start",
"cdr3_end": "cdr3_stop",
"sequence_id": "id",
"sequence": "sequence",
            ## Not implemented in get_values
            # for the x_cigar fields, the data is not available in the clone
"junction": "?",
"cdr3_aa": "?",
"rev_comp": "?",
"v_cigar": "?",
"d_cigar": "?",
"j_cigar": "?",
"sequence_alignment": "?",
"germline_alignment": "?",
"duplicate_count": "reads"
}
airr_computed = ["ratio_segmented", "ratio_locus", "filename", "warnings"]
for col in cols:
if col in airr_computed:
if col == "ratio_locus":
germline = self.get_values("germline")
value = float(self.get_values("reads", timepoint=time) ) / jlist.d["reads"].d["germline"][germline][time]
elif col == "ratio_segmented":
value = float(self.get_values("reads", timepoint=time) ) / jlist.d["reads"].d["segmented"][time]
elif col == "filename":
value = jlist.d["samples"].d["original_names"][time]
elif col == "warnings":
if "warn" not in self.d.keys():
value = ""
else:
warns = self.d["warn"]
values = []
for warn in warns:
values.append( warn["code"] )
value = ",".join(values)
else:
value = GET_UNKNOW_AXIS
elif col in airr_to_axes.keys():
value = self.get_values(airr_to_axes[col], timepoint=time)
else:
value = GET_UNKNOW_AXIS
if value == "?" or value == GET_UNKNOW_AXIS:
                # for some axes there is no computable data; for the moment, don't show anything
value = UNKNOWN_VALUE
list_values.append( str(value) )
return list_values
def latex(self, point=0, base_germline=10000, base=10000, tag=''):
reads = self.d["reads"][point]
ratio_germline = float(reads)/base_germline
ratio = float(reads)/base
return r" & & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
self.d["name"] if 'name' in self.d else self.d["id"],
tag, self.d["id"])
### print essential info about Window
def __str__(self):
return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
class PreProcesses:
def __init__(self):
self.d={}
def __add__(self, other):
obj = PreProcesses()
length_self = len(self.d['run_timestamp'])
length_other = len(other.d['run_timestamp'])
concatenate_with_padding(obj.d,
self.d, length_self,
other.d, length_other)
return obj
def __iter__(self):
return self.d.__iter__()
def __getitem__(self, item):
return self.d.__getitem__(item)
def __setitem__(self, item, value):
return self.d.__setitem__(item, value)
class Samples:
def __init__(self):
self.d={}
self.d["number"] = 1
self.d["original_names"] = [""]
def __add__(self, other):
obj=Samples()
concatenate_with_padding(obj.d,
self.d, self.d['number'],
other.d, other.d['number'],
['number', 'pre_process'],
recursive=True)
if "pre_process" in self.d.keys() or "pre_process" in other.d.keys():
# init if needed
if "pre_process" not in self.d.keys():
self.d["pre_process"] = {}
if "pre_process" not in other.d.keys():
other.d["pre_process"] = {}
obj.d["pre_process"] = {}
concatenate_with_padding(obj.d["pre_process"],
self.d["pre_process"], self.d['number'],
other.d["pre_process"], other.d['number'],
[], recursive=True,
none_init=True)
obj.d["number"] = int(self.d["number"]) + int(other.d["number"])
return obj
def __str__(self):
return "<Samples: %s>" % self.d
class MRD:
def __init__(self, number=1):
self.d={}
self.d["number"] = number
def __add__(self, other):
obj=MRD()
concatenate_with_padding(obj.d,
self.d, self.d['number'],
other.d, other.d['number'],
['number'])
obj.d["number"] = int(self.d["number"]) + int(other.d["number"])
return obj
def __str__(self):
return "<MRD: %s>" % self.d
class Diversity:
keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]
def __init__(self, data=None):
self.d={}
for k in self.keys:
if data == None or not (k in data):
self.d[k] = ["na"]
elif isinstance(data[k], list):
self.d[k]= data[k]
else:
self.d[k]= [data[k]]
def __add__(self, other):
for k in self.keys:
self.d[k].append(other.d[k][0])
return self
def __str__(self):
return "<Diversity: %s>" % self.d
class Reads:
def __init__(self):
self.d={}
self.d["total"] = [0]
self.d["segmented"] = [0]
self.d["germline"] = {}
self.d['distribution'] = {}
def __add__(self, other):
obj=Reads()
concatenate_with_padding(obj.d['germline'],
self.d['germline'], len(self.d['total']),
other.d['germline'], len(other.d['total']),
['total'])
concatenate_with_padding(obj.d['distribution'],
self.d['distribution'], len(self.d['total']),
other.d['distribution'], len(other.d['total']),
['total'])
if 'merged' in self.d or 'merged' in other.d:
for mydict in [obj.d, self.d, other.d]:
if 'merged' not in mydict:
mydict['merged'] = [0] * len(mydict['total'])
obj.d['merged'] = self.d['merged'] + other.d['merged']
obj.d["total"] = self.d["total"] + other.d["total"]
obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
return obj
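    # Sketch: adding a Reads with totals [30] to one with totals [40] yields
    # totals [30, 40]; the 'germline' and 'distribution' dicts are padded by
    # concatenate_with_padding (from utils) so that every list keeps one entry
    # per sample, even for keys missing on one side.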
def addAIRRClone(self, clone):
"""
        Add a clone created by an AIRR import.
        Adds its read counts to the 'germline', 'segmented' and 'total' sections.
"""
if clone.d["germline"] not in self.d["germline"].keys():
self.d["germline"][clone.d["germline"]] = [0]
self.d["germline"][clone.d["germline"]][0] += clone.d["reads"][0]
self.d["total"][0] += clone.d["reads"][0]
self.d["segmented"][0] += clone.d["reads"][0]
return
def __str__(self):
return "<Reads: %s>" % self.d
class OtherWindows:
"""Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""
def __init__(self, length, ranges = None):
self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
self.length = length
self.sizes = {}
for r in self.ranges:
self.sizes[r] = [0 for i in range(self.length)]
def __iadd__(self, window):
"""Add to self a window. The different points may land into different 'others-' windows."""
for i, s in enumerate(window.d["reads"]):
for r in self.ranges:
if s >= r:
break
self.sizes[r][i] += s
## TODO: add seg_stat
# print window, '-->', self.sizes
return self
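    # Sketch: with the default ranges [1000, 100, 10, 1], a window with reads
    # [1500, 50] adds 1500 to the 'others-1000' bucket of point 0 and 50 to
    # the 'others-10' bucket of point 1: each count is binned by the largest
    # range it reaches.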
def __iter__(self):
"""Yield the 'others-' windows"""
for r in self.ranges:
w = Window(self.length)
w.d['reads'] = self.sizes[r]
w.d['window'] = 'others-%d' % r
w.d['top'] = 0
print(' --[others]-->', w)
yield w
class ListWindows(VidjilJson):
    '''storage class for sequence information across several samples
>>> lw1.info()
<ListWindows: [25] 2>
<window : [5] 3 aaa>
<window : [12] 2 bbb>
>>> lw2.info()
<ListWindows: [34] 2>
<window : [8] 4 aaa>
<window : [2] 8 ccc>
>>> lw3 = lw1 + lw2
>>> lw3.info()
<ListWindows: [25, 34] 3>
<window : [12, 0] 2 bbb>
<window : [5, 8] 3 aaa>
<window : [0, 2] 8 ccc>
>>> lw3.build_stat()
>>> lw3.d['reads'].d['distribution']['0.1']
[17, 8]
>>> lw3.d['reads'].d['distribution']['0.01']
[0, 2]
>>> lw3.d['reads'].d['distribution']['0.001']
[0, 0]
>>> lw3.d['reads'].d['distribution']['0.0001']
[0, 0]
'''
def __init__(self):
'''init ListWindows with the minimum data required'''
self.d={}
self.d["samples"] = Samples()
self.d["reads"] = Reads()
self.d["clones"] = []
self.d["clusters"] = []
self.d["germlines"] = {}
self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
self.d["producer"] = FUSE_VERSION
self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def __str__(self):
return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )
# Iterator and access functions
def __iter__(self):
return self.d["clones"].__iter__()
def __getitem__(self, i):
### not efficient !
for clone in self:
if clone.d["id"] == i:
return clone
def __len__(self):
return len(self.d["clones"])
    ### print info about each stored Window
def info(self):
print(self)
for clone in self:
print(clone)
### compute statistics about clones
def build_stat(self):
ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
for clone in self:
for i, s in enumerate(clone.d["reads"]):
if self.d['reads'].d['segmented'][i] > 0:
for r in range(len(ranges)):
if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
break
result[r][i] += s
for r in range(len(ranges)):
ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
self.d['reads'].d['distribution'][ratio_in_string] = result[r]
#TODO V/D/J distrib and more
### save / load to .json
def save_json(self, output):
'''save ListWindows in .json format'''
print("==>", output)
with open(output, "w") as f:
json.dump(self, f, indent=2, default=self.toJson)
    def load(self, file_path, *args, **kwargs):
        '''Load a file, detecting and selecting the parser according to the file extension'''
        if '.clntab' in file_path:
self.load_clntab(file_path, *args, **kwargs)
elif ".tsv" in file_path or ".AIRR" in file_path or ".airr" in file_path:
self.load_airr(file_path, *args, **kwargs)
else:
self.load_vidjil(file_path, *args, **kwargs)
def loads(self, string, *args, **kwargs):
self.loads_vidjil(string, *args, **kwargs)
def init_data(self, data):
self.d=data.d
# Be robust against 'null' values for clones
if not self.d["clones"]:
self.d["clones"] = []
if "diversity" in self.d.keys():
self.d["diversity"] = Diversity(self.d["diversity"])
else:
self.d["diversity"] = Diversity()
if 'distribution' not in self.d['reads'].d:
self.d['reads'].d['distribution'] = {}
self.id_lengths = defaultdict(int)
print("%%")
for clone in self:
self.id_lengths[len(clone.d['id'])] += 1
print ("%% lengths .vidjil -> ", self.id_lengths)
try:
print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
except KeyError:
pass
def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init ListWindows with a .vidjil data file'''
with generic_open(file_path, "r", verbose) as f:
self.init_data(json.load(f, object_hook=self.toPython))
self.check_version(file_path)
if pipeline:
# renaming, private pipeline
f = '/'.join(file_path.split('/')[2:-1])
if verbose:
print("[%s]" % f)
else:
f = file_path
if verbose:
print()
time = os.path.getmtime(file_path)
self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]*self.d["samples"].d["number"]
def loads_vidjil(self, string, pipeline, verbose=True):
'''init listWindows with a json string'''
self.init_data(json.loads(string, object_hook=self.toPython))
def load_pre_process(self, file_path, verbose = True):
'''init listWindows with the pre_process data file'''
with generic_open(file_path, 'r', verbose) as f:
json_data = json.load(f, object_hook=self.toPython)
self.d['samples'].d['pre_process'] = json_data['pre_process']
self.d['reads'].d['merged'] = json_data['reads'].d['merged']
def getTop(self, top):
result = []
for clone in self:
if clone.d["top"] <= top :
result.append(clone.d["id"])
return result
def filter(self, f):
r = []
reverseList = {}
for i,w in enumerate(self.d["clones"]) :
reverseList[w.d["id"]] = i
#filteredList = { k: reverseList[k] for k in f }
filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
for i in filteredList:
r.append(self.d["clones"][filteredList[i]])
self.d["clones"] = r
###
def __add__(self, other):
'''Combine two ListWindows into a unique ListWindows'''
obj = ListWindows()
l1 = len(self.d["reads"].d['segmented'])
l2 = len(other.d["reads"].d['segmented'])
concatenate_with_padding(obj.d,
self.d, l1,
other.d, l2,
["clones", "links", "germlines",
"vidjil_json_version"])
obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
obj.d["samples"] = self.d["samples"] + other.d["samples"]
obj.d["reads"] = self.d["reads"] + other.d["reads"]
obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
if "mrd" in self.d or "mrd" in other.d:
if not "mrd" in self.d:
self.d["mrd"] = MRD()
if not "mrd" in other.d:
other.d["mrd"] = MRD()
obj.d["mrd"] = self.d["mrd"] + other.d["mrd"]
try:
            ### Verify that the same file is not present twice
            filename_jlist1 = list(self.d["distributions"]["repertoires"].keys())
            filename_jlist2 = list(other.d["distributions"]["repertoires"].keys())
            for filename in filename_jlist1:
                if filename in filename_jlist2:
                    raise ValueError("Error, duplicate file name (in distributions): %s" % filename)
### Append distributions of each files
obj.d["distributions"] = {}
obj.d["distributions"]["repertoires"] = {}
obj.d["distributions"]["keys"] = ["clones", "reads"]
obj.d["distributions"]["filters"] = {}
obj.d["distributions"]["categories"] = {}
for filename in filename_jlist1:
obj.d["distributions"]["repertoires"][filename] = self.d["distributions"]["repertoires"][filename]
for filename in filename_jlist2:
obj.d["distributions"]["repertoires"][filename] = other.d["distributions"]["repertoires"][filename]
except:
pass
return obj
###
def __mul__(self, other):
for i in range(len(self.d["reads_segmented"])):
self.d["reads_segmented"][i] += other.d["reads_segmented"][i]
self.d["clones"] += other.d["clones"]
self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
self.d["system_segmented"].update(other.d["system_segmented"])
return self
###
def fuseWindows(self, w1, w2, l1, l2) :
#store data in dict with "id" as key
dico1 = {}
for i in range(len(w1)) :
dico1[ w1[i].d["id"] ] = w1[i]
dico2 = {}
for i in range(len(w2)) :
dico2[ w2[i].d["id"] ] = w2[i]
#concat data with the same key ("id")
        # if some data is missing we concatenate with an empty Window()
dico3 = {}
for key in dico1 :
if key in dico2 :
dico3[key] = dico1[key] + dico2[key]
else :
w=Window(l2)
dico3[key] = dico1[key] + w
for key in dico2 :
if key not in dico1 :
w=Window(l1)
dico3[key] = w + dico2[key]
#sort by top
tab = []
for key in dico3 :
tab.append((dico3[key], dico3[key].d["top"]))
tab = sorted(tab, key=itemgetter(1))
#store back in a List
result=[]
for i in range(len(tab)) :
result.append(tab[i][0])
return result
def cut(self, limit, nb_points):
        '''Remove clones that never reach the top (i.e. the most represented clones) at any point. Their information can be aggregated into 'others' windows.'''
w=[]
others = OtherWindows(nb_points)
for clone in self:
if (int(clone.d["top"]) <= limit or limit == 0) :
w.append(clone)
#else:
#others += clone
self.d["clones"] = w #+ list(others)
print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
return self
def load_clntab(self, file_path, *args, **kwargs):
'''Parser for .clntab file'''
self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
self.d["samples"].d["original_names"] = [file_path]
self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
listw = []
listc = []
total_size = 0
fichier = open(file_path,"r")
for ligne in fichier:
if "clonotype" in ligne:
header_map = ligne.replace('\n', '').split('\t')
else :
tab = AccessedDict()
for index, data in enumerate(ligne.split('\t')):
tab[header_map[index]] = data
w=Window(1)
w.d["seg"] = {}
#w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
s = int(tab["sequence.size"])
total_size += s
w.d["reads"] = [ s ]
w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
w.d["sequence"] = tab["sequence.raw nt seq"]
w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
if (tab["sequence.D-GENE and allele"] != "") :
w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
# use sequence.JUNCTION to colorize output (this is not exactly V/J !)
junction = tab.get("sequence.JUNCTION.raw nt seq")
position = w.d["sequence"].find(junction)
if position >= 0:
w.d["seg"]["3start"] = position + len(junction)
w.d["seg"]["5end"] = position
else:
w.d["seg"]["3start"] = 0
w.d["seg"]["5end"] = len(w.d["sequence"])
w.d["seg"]["3end"]=0
w.d["seg"]["5start"]=0
w.d["seg"]["4end"]=0
w.d["seg"]["4start"]=0
w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
listw.append((w , w.d["reads"][0]))
raw_clonotype = tab.get("clonotype")
w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]
clonotype = raw_clonotype.split(' ')
if (len(clonotype) > 1) :
listc.append((w, raw_clonotype))
#keep data that has not already been stored
for header in tab.not_accessed_keys():
w.d["_"+header] = [tab[header]]
#sort by sequence_size
listw = sorted(listw, key=itemgetter(1), reverse=True)
#sort by clonotype
listc = sorted(listc, key=itemgetter(1))
#generate data "top"
for index in range(len(listw)):
listw[index][0].d["top"]=index+1
self.d["clones"].append(listw[index][0])
self.d["reads"].d["segmented"] = [total_size]
self.d["reads"].d["total"] = [total_size]
def load_airr(self, file_path, pipeline, verbose=True):
'''
Parser for AIRR files
format: https://buildmedia.readthedocs.org/media/pdf/airr-standards/stable/airr-standards.pdf
'''
self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
self.d["samples"].d["original_names"] = [file_path]
self.d["samples"].d["producer"] = ["unknown (AIRR format)"]
self.d["samples"].d["log"] = ["Created from an AIRR format. No other information available"]
self.d["reads"].d["germline"] = defaultdict(lambda: [0])
self.d["diversity"] = Diversity()
### Store created clone id
clone_ids = defaultdict(lambda: False)
listw = []
listc = []
total_size = 0
i= 0
import csv
with generic_open(file_path, 'r', verbose) as tsvfile:
reader = csv.DictReader(tsvfile, dialect='excel-tab')
for row in reader:
row = self.airr_rename_cols(row)
i += 1
                ### bypass the igBlast summary footer
                if "Total queries" in row["sequence_id"]:
                    break
w=Window(1)
w.d["seg"] = { "junction":{}, "cdr3":{} }
w.d["id"] = row["sequence_id"]
                # check that no other clone is already present with this id
p = 1
while clone_ids[w.d["id"]]:
w.d["id"] = row["sequence_id"] + "_%s" % p
p += 1
clone_ids[w.d["id"]] = True
w.d["sequence"] = row["sequence"]
if "duplicate_count" not in row.keys() or row["duplicate_count"] == "":
w.d["reads"] = [1]
else :
w.d["reads"] = [ int(float(row["duplicate_count"])) ]
## 'Regular' columns, translated straight from AIRR into .vidjil
axes = {"locus": ["germline"],
"v_call": ["seg", "5", "name"],
"d_call": ["seg", "4", "name"],
"j_call": ["seg", "3", "name"],
"v_alignment_start": ["seg","5","start"],
"v_alignment_end": ["seg","5","stop"],
"d_alignment_start": ["seg","4","start"],
"d_alignment_end": ["seg","4","stop"],
"j_alignment_start": ["seg","3","start"],
"j_alignment_end": ["seg","3","stop"],
"junction_aa": ["seg","junction","aa"],
"productive": ["seg","junction","productive"],
"cdr1_start": ["seg","cdr1", "start"],
"cdr1_end": ["seg","cdr1", "stop"],
"cdr3_start": ["seg","cdr3", "start"],
"cdr3_end": ["seg","cdr3", "stop"],
"5prime_trimmed_n_nb": ["seg","5","delLeft"],
"3prime_trimmed_n_nb": ["seg","3","delLeft"],
"warnings": ["warn"]}
                ## Fill .vidjil values by walking down each path
for axe in axes.keys():
if axe in row.keys() and row[axe] != "":
path = axes[axe]
depth = 0
value = w.d
to_put = self.airr_clean_content(axe, row[axe])
while depth != len(path):
cat = path[depth]
if cat not in value.keys():
value[cat] = {}
depth += 1
if depth != len(path):
value = value[cat]
value[cat] = to_put
listw.append((w , w.d["reads"][0]))
##### Special values in .vidjil, recomputed from AIRR data
### NAME (simple, vidjil like)
name = ""
if "v_call" in row.keys() and row["v_call"] != "":
name += (" "*int(name != "")) + w.d["seg"]["5"]["name"]
if "v_call" in row.keys() and row["d_call"] != "":
name += (" "*int(name != "")) + w.d["seg"]["4"]["name"]
if "v_call" in row.keys() and row["j_call"] != "":
name += (" "*int(name != "")) + w.d["seg"]["3"]["name"]
if name == "":
"undetermined segmentation"
w.d["name"] = name
### READS
if not 'germline' in w.d.keys():
w.d["germline"] = "undetermined"
self.d["reads"].addAIRRClone( w )
### Average/coverage
average_read_length = float(len(row["sequence"]))
w.d["_average_read_length"] = [average_read_length]
w.d["_coverage"] = [1.0]
w.d["_coverage_info"] = ["%.1f bp (100%% of %.1f bp)" % (average_read_length, average_read_length )]
                ### Warnings
                # a conversion table would be needed here?
                # TODO: undefined for the moment
#sort by sequence_size
listw = sorted(listw, key=itemgetter(1), reverse=True)
#sort by clonotype
listc = sorted(listc, key=itemgetter(1))
#generate data "top"
for index in range(len(listw)):
listw[index][0].d["top"]=index+1
self.d["clones"].append(listw[index][0])
return
def airr_rename_cols(self, row):
        '''Rename columns to homogenize AIRR data coming from different software'''
couples = [
## MiXCR
("bestVHit", "v_call"), ("bestDHit", "d_call"), ("bestJHit", "j_call"),
("cloneId", "sequence_id"), ("targetSequences", "sequence"),
("cloneCount", "duplicate_count"), ("chains", "locus")
]
for couple in couples:
if couple[0] in row.keys():
row[couple[1]] = row.pop(couple[0])
return row
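    # Sketch: a MiXCR row such as {"bestVHit": "TRGV9*01", "cloneCount": "42"}
    # comes out as {"v_call": "TRGV9*01", "duplicate_count": "42"}, so the rest
    # of the parser only deals with standard AIRR column names.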
def airr_clean_content(self, category, value):
        '''
        Clean a value of inappropriate content, such as species names inside segment names
        '''
cat_boolean = ['productive']
cat_numeric = [
'cdr3_start', 'cdr3_end',
"v_alignment_start", "v_alignment_end",
"d_alignment_start", "d_alignment_end",
"j_alignment_start", "j_alignment_end"
]
        # Used in the case of IMGT denominations
if category in ["v_call", "d_call", "j_call"]:
if "," in value:
value = value.split(", or")[0]
            # clean V-QUEST-specific decorations:
value = value.replace("Homsap ", "")
value = value.replace("Homsap_", "")
value = value.replace(" F", "")
value = value.replace(" (F)", "")
value = value.replace(" [ORF]", "")
        ## Convert to the vidjil warning format
if category == 'warnings':
value = [{"code": value, "level": "warn"}]
        ## convert boolean values (e.g. the igBlast 'productive' field)
if category in cat_boolean:
if value == "T" or value == "true":
value = True
elif value == "F" or value == "false":
value = False
        ## Convert numeric fields given as strings
if category in cat_numeric:
value = int(value)
return value
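    # Sketch: airr_clean_content("v_call", "Homsap TRGV2*01 F, or Homsap TRGV4*01 F")
    # returns "TRGV2*01" (first gene kept, species and functionality markers
    # stripped), and airr_clean_content("productive", "T") returns True.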
    def toJson(self, obj):
        '''Serializer for json module'''
        if isinstance(obj, (ListWindows, Window, Samples, Reads,
                            Diversity, MRD, PreProcesses)):
            return dict(obj.d)
        return dict(obj)
def toPython(self, obj_dict):
'''Reverse serializer for json module'''
if "samples" in obj_dict:
obj = ListWindows()
obj.d=obj_dict
return obj
if "coefficients" in obj_dict:
obj = MRD()
obj.d=obj_dict
# TODO: make this more generic
obj.d["number"] = len(obj_dict['prevalent'])
return obj
if "id" in obj_dict:
obj = Window(1)
obj.d=obj_dict
return obj
if "total" in obj_dict:
obj = Reads()
obj.d=obj_dict
return obj
if "original_names" in obj_dict:
obj = Samples()
obj.d=obj_dict
return obj
# TODO use a better identifier
if "parameters" in obj_dict:
obj = PreProcesses()
obj.d=obj_dict
return obj
return obj_dict
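    # Sketch: used as json.load(..., object_hook=self.toPython), this method
    # rebuilds the wrapper objects from plain JSON dicts: a dict with "samples"
    # becomes a ListWindows, with "id" a Window, with "total" a Reads, with
    # "original_names" a Samples, and so on; anything else stays a plain dict.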
def save_airr(self, output):
"""
        Export the content into an AIRR file.
        Columns are hardcoded for the moment.
        TODO: add options to make the column set more flexible
"""
## Constant header
list_header = ["filename", "locus", "duplicate_count", "v_call", "d_call", "j_call", "sequence_id", "sequence", "productive"]
## not available in this script; but present in AIRR format
list_header += ["junction_aa", "junction", "cdr3_aa", "warnings", "rev_comp", "sequence_alignment", "germline_alignment", "v_cigar", "d_cigar", "j_cigar"]
        ## Specifically requested; needs to be added by a server-side action on vidjil files
list_header += ["ratio_segmented", "ratio_locus"] #"first_name", "last_name", "birthday", "infos", "sampling_date", "frame"]
fo = open(output, 'w')
fo.write( "\t".join(list_header)+"\n")
for clone in self.d["clones"]:
reads = clone.d["reads"]
for time in range(0, len(reads)):
if reads[time]:
fo.write( "\t".join( clone.toCSV(cols=list_header, time=time, jlist=self) )+"\n")
fo.close()
return
def get_filename_pos(self, filename):
""" filename is the key of distributions repertoires """
return self.d["samples"].d["original_names"].index(filename)
# ========================= #
# Distributions computing #
# ========================= #
def init_distrib(self, list_distrib):
""" Create distributions structures """
self.d["distributions"] = {}
self.d["distributions"]["repertoires"] = defaultdict(lambda:[])
self.d["distributions"]["keys"] = ["clones", "reads"]
self.d["distributions"]["filters"] = []
        # Warning: will fail if several samples share the same original_names; fixme
nb_sample = self.d["samples"].d["number"]
# prefill distribution for each file of jlist
for filename in self.d["samples"].d["original_names"]:
for distrib in list_distrib:
self.d["distributions"]["repertoires"][filename].append({"axes":distrib,"values":defaultdict(lambda: False)})
return
def compute_distribution(self, list_distrib):
""" Compute the distributions given in list_distrib """
for filename in self.d["samples"].d["original_names"]:
timepoint = self.get_filename_pos(filename)
for clone in self.d["clones"]:
if clone.d["reads"][timepoint] != 0:
for distrib_pos in range( len(self.d["distributions"]["repertoires"][filename]) ):
distrib = list_distrib[distrib_pos]
clone_values = clone.get_axes_values( distrib, timepoint )
nb_reads = clone.get_reads(self.get_filename_pos(filename))
self.add_clone_values( filename, distrib_pos, clone_values, nb_reads)
return
def add_clone_values(self, filename, distrib_pos, values, nb_reads):
""" Add value of a clone to a specific distribution """
obj = self.d["distributions"]["repertoires"][filename][distrib_pos]["values"]
obj = self.recursive_add(obj, values, nb_reads) ## Add by recursion
self.d["distributions"]["repertoires"][filename][distrib_pos]["values"] = obj
return
def recursive_add( self, obj, values, nb_reads):
"""
        Add (1, nb_reads) by recursively iterating over values.
        For example, self.recursive_add(obj, [val1, val2], nb_reads) adds (1, nb_reads) to obj[val1][val2].
        Return the updated object.
"""
if len(values) > 1:
            # descend one level deeper
nvalues = values[1:]
if not obj[values[0]] :
obj[values[0]] = defaultdict(lambda: False)
obj[values[0]] = self.recursive_add(obj[values[0]], nvalues, nb_reads)
else:
# last depth
if not obj:
obj = defaultdict(lambda: False)
if not obj[values[0]]:
obj[values[0]] = [0, 0]
obj[values[0]][0] += 1
obj[values[0]][1] += nb_reads
return obj
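    # Sketch: two clones sharing the same axis values accumulate into the same
    # leaf counter [nb_clones, nb_reads]; "TRGV1*01"/"TRGJ1*01" are made-up values.
    #
    #     obj = self.recursive_add(defaultdict(lambda: False), ["TRGV1*01", "TRGJ1*01"], 10)
    #     obj = self.recursive_add(obj, ["TRGV1*01", "TRGJ1*01"], 5)
    #     obj["TRGV1*01"]["TRGJ1*01"]   # ==> [2, 15]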
    ##########################
    ###  Overlap indices   ###
    ##########################
def computeOverlaps(self):
'''
        Compute, for each pair of samples, several overlap indices
'''
nb_sample = self.d["samples"].d["number"]
self.d["overlaps"] = {}
self.d["overlaps"]["morisita"] = []
self.d["overlaps"]["jaccard"] = []
for pos_0 in range(0, nb_sample):
morisita = []
jaccard = []
for pos_1 in range(0, nb_sample):
print( "Overlap: %s vs %s" % (pos_0, pos_1) )
morisita.append( self.computeOverlapMorisita(pos_0, pos_1) )
jaccard.append( self.computeOverlapJaccard(pos_0, pos_1) )
self.d["overlaps"]["morisita"].append( morisita)
self.d["overlaps"]["jaccard"].append( jaccard )
return
def computeOverlapMorisita(self, pos_0, pos_1):
"""
Morisita-Horn similarity index
        This index applies to quantitative data.
        It evaluates the similarity between different groups and is not
        sensitive to sampling richness.
        Formula: CMH = 2 * sum(ai * bi) / ((da + db) * Na * Nb)
            da = sum(ai^2) / Na^2
            db = sum(bi^2) / Nb^2
            Na = total number of individuals at site A
            Nb = total number of individuals at site B
            ai = number of individuals of species i at site A
            bi = number of individuals of species i at site B
        # Values range from 0 (completely different communities)
        # to 1 (maximal similarity).
        # Two groups are similar (poor diversity) if the CMH value is above 0.5
        # and dissimilar (high diversity) if it is below 0.5.
!!! Computed only on present clones (so should be run with `-Y all`)
"""
clones = self.d["clones"]
m = 0
da = 0
db = 0
for clone in clones:
ai = clone.d["reads"][pos_0]
bi = clone.d["reads"][pos_1]
m += (ai * bi)
da += (ai*ai)
db += (bi*bi)
d = (da+db)/2
if m == 0: #if really no shared clones
res = 0
else:
res = round( (m/d), 3)
return res
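    # Worked example (sketch): two clones with reads [5, 5] and [3, 0] between
    # samples 0 and 1 give m = 5*5 + 3*0 = 25, da = 25+9 = 34, db = 25+0 = 25,
    # d = (34+25)/2 = 29.5, hence an index of round(25/29.5, 3) = 0.847.
    # Note that this implementation simplifies the Morisita-Horn formula; it
    # matches the textbook definition when both samples have the same size (Na == Nb).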
def computeOverlapJaccard(self, pos_0, pos_1):
"""
Jaccard similarity index
        Formula:
            I = Nc / (N1 + N2 - Nc)
            Nc: number of taxa shared between the two stations
            N1, N2: number of taxa present in station 1 and in station 2
        This index ranges from 0 to 1 and only takes shared presence into account.
        The lower the index, the more species are present in only one location,
        indicating a poor inter-location similarity.
"""
clones = self.d["clones"]
reads = self.d["reads"].d["segmented"]
N1 = self.get_nb_species_of_sample(pos_0)
N2 = self.get_nb_species_of_sample(pos_1)
Nc = 0
for clone in clones:
ai = clone.d["reads"][pos_0]
bi = clone.d["reads"][pos_1]
Nc += bool(ai * bi)
# print( "Nc: %s" % Nc)
if Nc == 0: # if really no shared clones
return 0
I = round( (Nc / (N1 + N2 - Nc)), 3)
return I
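    # Worked example (sketch): with the same clones (reads [5, 5] and [3, 0]),
    # N1 = 2 clones are present in sample 0, N2 = 1 in sample 1, and Nc = 1 is
    # shared, hence I = round(1 / (2 + 1 - 1), 3) = 0.5.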
def get_nb_species_of_sample(self, pos):
X = 0
for clone in self.d["clones"]:
if clone.d["reads"][pos]:
X += 1
return X
### some data used for the doctests
w1 = Window(1)
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w2 = Window(1)
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
w3 = Window(1)
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w4 = Window(1)
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
w5 = Window(1)
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w6 = Window(1)
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
seg_w7 = {
"germline": "TRA",
"id": "TTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACAT",
"name": "TRAV17*01 0/CCGGGG/5 TRAJ40*01",
"reads": [
16
],
"_average_read_length": [
168.468,
168.568,
],
"seg": {
"3": {
"delLeft": 5,
"name": "TRAJ40*01",
"start": 112
},
"5": {
"delRight": 0,
"name": "TRAV17*01",
"stop": 105
},
"N": 6,
"cdr3": {
"aa": "ATDAG#SGTYKYI",
"start": 96,
"stop": 133
},
"evalue": {
"val": "1.180765e-43"
},
"evalue_left": {
"val": "1.296443e-47"
},
"evalue_right": {
"val": "1.180635e-43"
},
"junction": {
"aa": "CQQSYSTPYTF",
"productive": True,
"start": 93,
"stop": 136
},
},
"germline": "TRA",
"_coverage": [
0.973684191703796
],
"_coverage_info": [
"74 bp (97% of 76.0 bp)"
],
"sequence": "GTGGAAGATTAAGAGTCACGCTTGACACTTCCAAGAAAAGCAGTTCCTTGTTGATCACGGCTTCCCGGGCAGCAGACACTGCTTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACATCTTTGGAACAG",
"top": 26
}
lw1 = ListWindows()
lw1.d["timestamp"] = 'ts'
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}, "merged": [null]}', object_hook=lw1.toPython)
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
lw1.d["diversity"] = Diversity()
w7 = Window(1)
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w8 = Window(1)
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
lw2 = ListWindows()
lw2.d["timestamp"] = 'ts'
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}, "merged": [null]}', object_hook=lw1.toPython)
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
lw2.d["diversity"] = Diversity()
def get_preset_of_distributions():
"""
    Return a preset list of distributions:
    every single axis, plus every ordered combination of two different axes.
>>> lst = get_preset_of_distributions(); ["seg5", "seg3"] in lst
True
>>> lst = get_preset_of_distributions(); ["seg5", "seg5"] in lst
False
>>> lst = get_preset_of_distributions(); ["seg5", "unknow"] in lst
False
>>> len(lst)
959
"""
lst_axes = AVAILABLE_AXES
LIST_DISTRIBUTIONS = []
for axis1 in lst_axes:
LIST_DISTRIBUTIONS.append([axis1])
for axis2 in lst_axes:
if axis1 != axis2:
LIST_DISTRIBUTIONS.append([axis1, axis2])
return LIST_DISTRIBUTIONS
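# With the 31 entries of AVAILABLE_AXES ('evalue' is listed twice), this preset
# contains 31 single-axis distributions plus 928 ordered pairs of distinct
# axis names, hence the 959 of the doctest above.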
def exec_command(command, directory, input_file):
'''
Execute the command `command` from the directory
`directory`. The executable must exist in
this directory. No path changes are allowed in `command`.
Returns the output filename (a .vidjil).
'''
# split command
soft = command.split()[0]
args = command[len(soft):]
assert (not os.path.sep in command), "No {} allowed in the command name".format(os.path.sep)
ff = tempfile.NamedTemporaryFile(suffix='.vidjil', delete=False)
basedir = os.path.dirname(os.path.abspath(sys.argv[0]))
command_fullpath = basedir+os.path.sep+directory+os.path.sep+soft
com = '%s %s -i %s -o %s' % (quote(command_fullpath), args, quote(os.path.abspath(input_file)), ff.name)
print("Pre/Post process command: \n%s" % com)
os.system(com)
print()
return ff.name
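# Sketch: with a hypothetical command "normalize --strict", directory "pre" and
# input "run1.vidjil", exec_command runs something like
#     '/abs/dir/of/fuse.py/pre/normalize' --strict -i '/abs/path/run1.vidjil' -o /tmp/tmpXXXX.vidjil
# and returns the temporary .vidjil output path.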
def main():
print("#", ' '.join(sys.argv))
    DESCRIPTION = 'Vidjil utility to parse and regroup lists of clones from different timepoints or origins'
#### Argument parser (argparse)
parser = argparse.ArgumentParser(description= DESCRIPTION,
epilog='''Example:
  python3 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
formatter_class=argparse.RawTextHelpFormatter)
group_options = parser.add_argument_group() # title='Options and parameters')
group_options.add_argument('--test', action='store_true', help='run self-tests')
group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')
group_options.add_argument('--first', '-f', type=int, default=0, help='take only into account the first FIRST files (0 for all) (%(default)s)')
group_options.add_argument('--pre', type=str,help='pre-process program (launched on each input .vidjil file) (needs defs.PRE_PROCESS_DIR). \
Program should take arguments -i/-o for input of vidjil file and output of temporary modified vidjil file.')
group_options.add_argument('--post', type=str,help='post-process program (launched on fused .vidjil file) (needs defs.PRE_PROCESS_DIR). \
Program should take arguments -i/-o for input of vidjil file and output of temporary modified vidjil file.')
group_options.add_argument("--distribution", "-d", action='append', type=str, help='compute the given distribution; callable multiple times')
group_options.add_argument('--distributions-all', '-D', action='store_true', default=False, help='compute a preset of distributions')
group_options.add_argument('--distributions-list', '-l', action='store_true', default=False, help='list the available axes for distributions')
group_options.add_argument('--no-clones', action='store_true', default=False, help='do not output individual clones')
group_options.add_argument('--export-airr', action='store_true', default=False, help='export data in AIRR format.')
group_options.add_argument('--overlaps', action='store_true', default=False, help='Compute overlaps index of repertoires.')
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.clntab)''')
args = parser.parse_args()
if args.test:
import doctest
doctest.testmod(verbose = True)
sys.exit(0)
jlist_fused = None
LIST_DISTRIBUTIONS = []
if args.distributions_list:
print("### Available axes for distributions:\n%s\n" % AVAILABLE_AXES)
if args.distributions_all:
LIST_DISTRIBUTIONS = get_preset_of_distributions()
if args.distribution:
for elt in args.distribution:
axes = elt.split(",")
if axes not in LIST_DISTRIBUTIONS:
LIST_DISTRIBUTIONS.append(axes)
print("### fuse.py -- " + DESCRIPTION)
print()
files = args.file
if args.first:
if len(files) > args.first:
print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
files = files[:args.first]
if args.pre:
print("Pre-processing files...")
pre_processed_files = []
for f in files:
out_name = exec_command(args.pre, PRE_PROCESS_DIR, f)
pre_processed_files.append(out_name)
files = pre_processed_files
    # filter
f = []
if args.ijson:
from vidjilparser import VidjilParser
vparser = VidjilParser()
vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)
for path_name in files:
split_path = path_name.split(',')
pre_path = None
if len(split_path) > 1:
path_name = split_path[0]
pre_path = split_path[1]
else:
path_name = split_path[0]
if args.ijson:
json_clones = vparser.extract(path_name)
clones = json.loads(json_clones)
if clones["clones"] is not None:
f += [c['id'] for c in clones["clones"]]
else:
jlist = ListWindows()
jlist.load(path_name, args.pipeline)
if pre_path is not None:
jlist.load_pre_process(pre_path)
f += jlist.getTop(args.top)
f = sorted(set(f))
if args.ijson:
vparser.reset()
vparser.addPrefix('')
vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)
if args.multi:
for path_name in files:
jlist = ListWindows()
if args.ijson:
json_reads = vparser.extract(path_name)
jlist.loads(json_reads, args.pipeline)
else:
jlist.load(path_name, args.pipeline)
jlist.build_stat()
print("\t", jlist, end=' ')
if jlist_fused is None:
jlist_fused = jlist
else:
jlist_fused = jlist_fused * jlist
print('\t==> merge to', jlist_fused)
jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
else:
print("### Read and merge input files")
for path_name in files:
split_path = path_name.split(',')
pre_path = None
if len(split_path) > 1:
path_name = split_path[0]
pre_path = split_path[1]
else:
path_name = split_path[0]
jlist = ListWindows()
if args.ijson:
json_reads = vparser.extract(path_name)
jlist.loads(json_reads, args.pipeline)
else:
jlist.load(path_name, args.pipeline)
if pre_path is not None:
jlist.load_pre_process(pre_path)
jlist.build_stat()
if len(LIST_DISTRIBUTIONS):
jlist.init_distrib(LIST_DISTRIBUTIONS)
jlist.compute_distribution(LIST_DISTRIBUTIONS)
jlist.filter(f)
print("\t", jlist, end=' ')
# Merge lists
if jlist_fused is None:
jlist_fused = jlist
else:
jlist_fused = jlist_fused + jlist
print('\t==> merge to', jlist_fused)
if args.compress:
print()
print("### Select point names")
l = jlist_fused.d["samples"].d["original_names"]
ll = interesting_substrings(l)
print(" <==", l)
print(" ==>", ll)
jlist_fused.d["samples"].d["names"] = ll
print()
if not args.multi:
jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
print("\t", jlist_fused)
print()
#compute similarity matrix
if len(jlist_fused.d["clones"]) < SIMILARITY_LIMIT :
fasta = ""
for i in range(len(jlist_fused.d["clones"])) :
fasta += ">>" + str(i) + "\n"
fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
fasta_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
fasta_file.write(fasta)
try:
out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
jlist_fused.d["similarity"] = json.loads(out)
except OSError:
print("! failed: %s" % TOOL_SIMILARITY)
finally:
os.unlink(fasta_file.name)
    else:
        jlist_fused.d["similarity"] = []
if args.overlaps:
print("### Overlaps index")
jlist_fused.computeOverlaps()
if args.no_clones:
# TODO: do not generate the list of clones in this case
del jlist_fused.d["clones"]
print("### Save merged file")
if args.post:
print("Post-processing files...")
jlist_fused.save_json(args.output)
post_out_name = exec_command(args.post, PRE_PROCESS_DIR, args.output)
# reload post processed file
jlist_fused = ListWindows()
jlist_fused.load(post_out_name, args.pipeline)
jlist_fused.build_stat()
if args.export_airr:
output = args.output.replace(".vidjil", ".airr")
jlist_fused.save_airr(output)
else:
output = args.output
jlist_fused.save_json(args.output)
if __name__ =='__main__':
main()