Skip to content

Commit

Permalink
Optionally output list of matching pairs (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
torognes committed Jun 17, 2021
1 parent e14b4f4 commit de881ea
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 48 deletions.
18 changes: 13 additions & 5 deletions src/cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,20 +353,28 @@ void cluster(char * filename)

uint64_t j = 0;
progress_init("Writing clusters: ", seqcount);
fprintf(outfile,
"#cluster_no\tcluster_size\trepertoire_id\tsequence_id\t"
"duplicate_count\tv_call\tj_call\tjunction_aa\n");
for(unsigned int i = 0; i < clustercount; i++)
{
unsigned int seed = clusterinfo[i].seed;
unsigned int size = clusterinfo[i].size;
for(unsigned int a = seed; a != no_cluster; a = iteminfo[a].next)
{
fprintf(outfile, "%u\t%u\t", i + 1, size);
db_fprint_sequence(outfile, d, a);
fprintf(outfile,
"\t%" PRIu64 "\t%s\t%s\t%s\n",
"%u\t%u\t",
i + 1,
size);
fprintf(outfile,
"%s\t%s\t%" PRIu64 "\t%s\t%s\t",
db_get_repertoire_id(d, db_get_repertoire_id_no(d, a)),
db_get_sequence_id(d, a),
db_get_count(d, a),
db_get_v_gene_name(d, a),
db_get_j_gene_name(d, a),
db_get_repertoire_id(d, db_get_repertoire_id_no(d, a)));
db_get_j_gene_name(d, a));
db_fprint_sequence(outfile, d, a);
fprintf(outfile, "\n");
j++;
}
progress_update(j);
Expand Down
44 changes: 36 additions & 8 deletions src/compairr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ bool opt_ignore_genes;
bool opt_indels;
bool opt_matrix;
bool opt_version;
char * opt_pairs;
char * opt_log;
char * opt_output;
int64_t opt_differences;
Expand All @@ -63,8 +64,7 @@ int64_t opt_threads;

FILE * outfile = nullptr;
FILE * logfile = nullptr;
FILE * fp_seeds = nullptr;
FILE * network_file = nullptr;
FILE * pairsfile = nullptr;

static char dash[] = "-";
static char * DASH_FILENAME = dash;
Expand Down Expand Up @@ -104,7 +104,7 @@ void show_time(const char * prompt)

void args_show()
{
fprintf(logfile, "Command: %s\n", opt_cluster ? "Cluster" : "Overlap");
fprintf(logfile, "Command (m/c): %s\n", opt_cluster ? "Cluster" : "Overlap");
fprintf(logfile, "Repertoire set 1: %s\n", input1_filename);
if (opt_matrix)
fprintf(logfile, "Repertoire set 2: %s\n", input2_filename ? input2_filename : "(same as set 1)");
Expand All @@ -114,9 +114,13 @@ void args_show()
opt_ignore_counts ? "Yes" : "No");
fprintf(logfile, "Ignore genes (g): %s\n",
opt_ignore_genes ? "Yes" : "No");
fprintf(logfile, "Threads (t): %" PRId64 "\n", opt_threads);
if (opt_matrix)
fprintf(logfile, "Output format (a): %s\n", opt_alternative ? "Column" : "Matrix");
fprintf(logfile, "Output file (o): %s\n", opt_output);
fprintf(logfile, "Output format (a): %s\n", opt_alternative ? "Column" : "Matrix");
fprintf(logfile, "Threads: (t) %" PRId64 "\n", opt_threads);
if (opt_matrix)
fprintf(logfile, "Pairs file (p): %s\n", opt_pairs ? opt_pairs : "(none)");
fprintf(logfile, "Log file (l): %s\n", opt_log ? opt_log : "(stderr)");
}

void args_usage()
Expand All @@ -138,6 +142,7 @@ void args_usage()
fprintf(stderr, "\n");
fprintf(stderr, "Input/output options:\n");
fprintf(stderr, " -a, --alternative output overlap results in column format\n");
fprintf(stderr, " -p, --pairs FILENAME output matching pairs to file (none)\n");
fprintf(stderr, " -l, --log FILENAME log to file (stderr)\n");
fprintf(stderr, " -o, --output FILENAME output results to file (stdout)\n");
fprintf(stderr, "\n");
Expand All @@ -157,7 +162,7 @@ void args_init(int argc, char **argv)
progname = argv[0];
input1_filename = 0;
input2_filename = 0;

opt_pairs = 0;
opt_alternative = 0;
opt_differences = 0;
opt_ignore_counts = 0;
Expand All @@ -171,9 +176,9 @@ void args_init(int argc, char **argv)

opterr = 1;

char short_options[] = "acd:fghil:mo:t:v";
char short_options[] = "acd:fghil:mo:p:t:v";

/* unused short option letters: bejknpqrsuwxyz */
/* unused short option letters: bejknqrsuwxyz */

static struct option long_options[] =
{
Expand All @@ -189,6 +194,7 @@ void args_init(int argc, char **argv)
{"output", required_argument, nullptr, 'o' },
{"threads", required_argument, nullptr, 't' },
{"version", no_argument, nullptr, 'v' },
{"pairs", required_argument, nullptr, 'p' },
{nullptr, 0, nullptr, 0 }
};

Expand Down Expand Up @@ -281,6 +287,11 @@ void args_init(int argc, char **argv)
opt_output = optarg;
break;

case 'p':
/* pairs-file */
opt_pairs = optarg;
break;

case 't':
/* threads */
opt_threads = args_long(optarg, "-t or --threads");
Expand Down Expand Up @@ -350,6 +361,14 @@ void args_init(int argc, char **argv)

if (opt_indels && (opt_differences != 1))
fatal("Indels are only allowed when d=1");

if (opt_cluster)
{
if (opt_pairs)
fatal("Option -p or --pairs is not allowed with -c or --cluster");
if (opt_alternative)
fatal("Option -a or --alternative is not allowed with -c or --cluster");
}
}

void open_files()
Expand All @@ -367,10 +386,19 @@ void open_files()
if (! outfile)
fatal("Unable to open output file for writing.");

if (opt_pairs)
{
pairsfile = fopen_output(opt_pairs);
if (! pairsfile)
fatal("Unable to open pairs file for writing.");
}
}

void close_files()
{
if (pairsfile)
fclose(pairsfile);

if (outfile)
fclose(outfile);

Expand Down
4 changes: 3 additions & 1 deletion src/compairr.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ static_assert(INT_MAX > 32767, "Your compiler uses very short integers.");
#define PROG_CMD "compairr"
#define PROG_NAME "CompAIRR"
#define PROG_VERSION "1.1.0"
#define PROG_BRIEF "Compare Adaptive Immune Receptor Repertoires"
#define PROG_BRIEF "Comparison of Adaptive Immune Receptor Repertoires"

const unsigned int MAX_THREADS = 256;

Expand All @@ -125,13 +125,15 @@ extern bool opt_ignore_genes;
extern bool opt_indels;
extern bool opt_matrix;
extern bool opt_version;
extern char * opt_pairs;
extern char * opt_log;
extern char * opt_output_file;
extern int64_t opt_differences;
extern int64_t opt_threads;

extern FILE * outfile;
extern FILE * logfile;
extern FILE * pairsfile;

/* header files */

Expand Down
85 changes: 62 additions & 23 deletions src/db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,18 @@ static signed char map_aa[256] =
};

const static char * aa_chars = "ACDEFGHIKLMNPQRSTVWY";
const char * EMPTYSTRING = "";

struct seqinfo_s
{
uint64_t hash;
char * seq;
short seqlen;
unsigned int repertoire_id_no;
short v_gene_no;
short j_gene_no;
uint64_t count;
char * sequence_id;
char * seq;
unsigned short seqlen;
unsigned short repertoire_id_no;
unsigned short v_gene_no;
unsigned short j_gene_no;
};

typedef struct seqinfo_s seqinfo_t;
Expand All @@ -83,6 +85,13 @@ static char * * j_gene_list = 0;
static uint64_t j_gene_alloc = 0;
static uint64_t j_gene_count = 0;

static int col_junction_aa = 0;
static int col_duplicate_count = 0;
static int col_v_call = 0;
static int col_j_call = 0;
static int col_repertoire_id = 0;
static int col_sequence_id = 0;

void db_init()
{
v_gene_list = 0;
Expand Down Expand Up @@ -121,6 +130,7 @@ void db_exit()
uint64_t list_insert(char * * * list,
uint64_t * alloc,
uint64_t * count,
uint64_t max,
char * item)
{
/* linear list */
Expand All @@ -132,6 +142,11 @@ uint64_t list_insert(char * * * list,
return i;

/* expand if necessary */
if (c >= max)
{
fatal("Too many items (V genes, J genes, or repertoire ID's)");
}

if (c >= *alloc)
{
*alloc += 256;
Expand Down Expand Up @@ -176,12 +191,6 @@ struct db * db_create()
return d;
}

static int col_junction_aa = 0;
static int col_duplicate_count = 0;
static int col_v_call = 0;
static int col_j_call = 0;
static int col_repertoire_id = 0;

void parse_airr_tsv_header(char * line)
{
char delim[] = "\t";
Expand All @@ -190,7 +199,7 @@ void parse_airr_tsv_header(char * line)

int i = 1;

while ((token = strsep(& string, delim)) != NULL)
while ((token = strsep(& string, delim)) != nullptr)
{
if (strcmp(token, "junction_aa") == 0)
{
Expand All @@ -212,6 +221,10 @@ void parse_airr_tsv_header(char * line)
{
col_repertoire_id = i;
}
else if (strcmp(token, "sequence_id") == 0)
{
col_sequence_id = i;
}
i++;
}

Expand Down Expand Up @@ -240,19 +253,20 @@ void parse_airr_tsv_header(char * line)

void parse_airr_tsv_line(char * line, uint64_t lineno, struct db * d)
{
char * junction_aa = NULL;
char * duplicate_count = NULL;
char * v_call = NULL;
char * j_call = NULL;
char * repertoire_id = NULL;
char * junction_aa = nullptr;
char * duplicate_count = nullptr;
char * v_call = nullptr;
char * j_call = nullptr;
char * repertoire_id = nullptr;
char * sequence_id = nullptr;

char delim[] = "\t";
char * string = line;
char * token = nullptr;

int i = 1;

while ((token = strsep(& string, delim)) != NULL)
while ((token = strsep(& string, delim)) != nullptr)
{
if (i == col_junction_aa)
{
Expand All @@ -274,10 +288,14 @@ void parse_airr_tsv_line(char * line, uint64_t lineno, struct db * d)
{
repertoire_id = token;
}
else if (i == col_sequence_id)
{
sequence_id = token;
}
i++;
}

/* check that all values are read */
/* check that all required values are read */

if (! (junction_aa && duplicate_count && v_call && j_call && repertoire_id))
{
Expand All @@ -291,7 +309,7 @@ void parse_airr_tsv_line(char * line, uint64_t lineno, struct db * d)
exit(1);
}

/* check that strings are not empty */
/* check that required strings are not empty */

if (! (*junction_aa && *duplicate_count && *v_call && *j_call &&
*repertoire_id))
Expand Down Expand Up @@ -374,17 +392,29 @@ void parse_airr_tsv_line(char * line, uint64_t lineno, struct db * d)
uint64_t v_gene_index = list_insert(& v_gene_list,
& v_gene_alloc,
& v_gene_count,
USHRT_MAX,
v_call);

uint64_t j_gene_index = list_insert(& j_gene_list,
& j_gene_alloc,
& j_gene_count,
USHRT_MAX,
j_call);

uint64_t repertoire_id_index = list_insert(& d->repertoire_id_list,
& d->repertoire_id_alloc,
& d->repertoire_count,
repertoire_id);
& d->repertoire_id_alloc,
& d->repertoire_count,
UINT_MAX,
repertoire_id);

if (sequence_id && (*sequence_id))
{
p->sequence_id = xstrdup(sequence_id);
}
else
{
p->sequence_id = nullptr;
}

p->seqlen = seqlen;
p->repertoire_id_no = repertoire_id_index;
Expand Down Expand Up @@ -654,6 +684,15 @@ char * db_get_repertoire_id(struct db * d, uint64_t repertoire_id_no)
return d->repertoire_id_list[repertoire_id_no];
}

char * db_get_sequence_id(struct db * d, uint64_t seqno)
{
char * sid = d->seqindex[seqno].sequence_id;
if (sid)
return sid;
else
return (char *) EMPTYSTRING;
}

uint64_t db_get_v_gene_count()
{
return v_gene_count;
Expand Down
2 changes: 2 additions & 0 deletions src/db.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ uint64_t db_get_repertoire_id_no(struct db * d, uint64_t seqno);

char * db_get_repertoire_id(struct db * d, uint64_t repertoire_id_no);

char * db_get_sequence_id(struct db * d, uint64_t seqno);

void db_hash(struct db * d);

uint64_t db_get_v_gene_count();
Expand Down
Loading

0 comments on commit de881ea

Please sign in to comment.