Skip to content

Commit

Permalink
v1.2.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Yan Gao committed May 28, 2021
1 parent 18d2dc6 commit 9abfea1
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 17 deletions.
8 changes: 8 additions & 0 deletions HOXD70.mtx
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# all five kinds of bases need to be included
# do not forget to set gap-open/extension penalty with -O/-E
A C G T N
A 91 -114 -31 -123 0
C -114 100 -125 -31 0
G -31 -125 100 -114 0
T -123 -31 -114 91 0
N 0 0 0 0 0
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,15 @@
[![Build Status](https://img.shields.io/travis/yangao07/abPOA/master.svg?label=Master)](https://travis-ci.org/yangao07/abPOA)
[![License](https://img.shields.io/badge/License-MIT-black.svg)](https://github.com/yangao07/abPOA/blob/master/LICENSE)
<!-- [![PyPI](https://img.shields.io/pypi/v/pyabpoa.svg?style=flat)](https://pypi.python.org/pypi/pyabpoa) -->
## Updates (v1.2.1)
## Updates (v1.2.2)

- Add minimizer-based seeding, reduce memory usage for long input sequences
- Remove redundant topological sorting
- Allow scoring matrix file as input

## Getting started
Download the [latest release](https://github.com/yangao07/abPOA/releases):
```
wget https://github.com/yangao07/abPOA/releases/download/v1.2.1/abPOA-v1.2.1.tar.gz
tar -zxvf abPOA-v1.2.1.tar.gz && cd abPOA-v1.2.1
wget https://github.com/yangao07/abPOA/releases/download/v1.2.2/abPOA-v1.2.2.tar.gz
tar -zxvf abPOA-v1.2.2.tar.gz && cd abPOA-v1.2.2
```
Make from source and run with test data:
```
Expand Down Expand Up @@ -81,9 +80,9 @@ You can also build abPOA from source files.
Make sure you have gcc (>=6.4.0) and zlib installed before compiling.
It is recommended to download the [latest release](https://github.com/yangao07/abPOA/releases).
```
wget https://github.com/yangao07/abPOA/releases/download/v1.2.1/abPOA-v1.2.1.tar.gz
tar -zxvf abPOA-v1.2.1.tar.gz
cd abPOA-v1.2.1; make
wget https://github.com/yangao07/abPOA/releases/download/v1.2.2/abPOA-v1.2.2.tar.gz
tar -zxvf abPOA-v1.2.2.tar.gz
cd abPOA-v1.2.2; make
```
Or, you can use `git clone` command to download the source code.
This gives you the latest version of abPOA, which might still be under development.
Expand All @@ -95,8 +94,8 @@ cd abPOA; make
### <a name="binary"></a>Pre-built binary executable file for Linux/Unix
If you run into any compilation issues, please try the pre-built binary file:
```
wget https://github.com/yangao07/abPOA/releases/download/v1.2.1/abPOA-v1.2.1_x64-linux.tar.gz
tar -zxvf abPOA-v1.2.1_x64-linux.tar.gz
wget https://github.com/yangao07/abPOA/releases/download/v1.2.2/abPOA-v1.2.2_x64-linux.tar.gz
tar -zxvf abPOA-v1.2.2_x64-linux.tar.gz
```

## <a name="usage"></a>General usage
Expand Down Expand Up @@ -153,6 +152,8 @@ Options:
0: global, 1: local, 2: extension
-M --match INT match score [2]
-X --mismatch INT mismatch penalty [4]
-t --matrix FILE scoring matrix file, '-M' and '-X' are not used when '-t' is used [NULL]
e.g., 'HOXD70.mtx'
-O --gap-open INT(,INT) gap opening penalty (O1,O2) [4,24]
-E --gap-ext INT(,INT) gap extension penalty (E1,E2) [2,1]
abPOA provides three gap penalty modes, cost of a g-long gap:
Expand Down
2 changes: 2 additions & 0 deletions include/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ typedef struct {

typedef struct {
int m; int *mat; // score matrix
int use_score_matrix; // set _mat_ based on score matrix file, then _match_/_mismatch_ is not used.
int match, mismatch, gap_open1, gap_open2, gap_ext1, gap_ext2; int inf_min;
// minimizer seeding parameter
int k, w, min_w;
Expand Down Expand Up @@ -120,6 +121,7 @@ typedef struct {

// init for abpoa parameters
abpoa_para_t *abpoa_init_para(void);
void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mtx_fn);
void abpoa_post_set_para(abpoa_para_t *abpt);
void abpoa_free_para(abpoa_para_t *abpt);

Expand Down
2 changes: 2 additions & 0 deletions python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ This constructs a multiple sequence alignment handler of pyabpoa, it accepts the

* **aln_mode**: alignment mode. 'g': global, 'l': local, 'e': extension; default: **'g'**
* **match**: match score; default: **2**
* **mismatch**: mismatch penalty; default: **4**
* **score_matrix**: scoring matrix file, **match** and **mismatch** are not used when **score_matrix** is used; default: **''**
* **gap_open1**: first gap opening penalty; default: **4**
* **gap_ext1**: first gap extension penalty; default: **2**
* **gap_open2**: second gap opening penalty; default: **24**
Expand Down
2 changes: 2 additions & 0 deletions python/cabpoa.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ cdef extern from "abpoa.h":
ctypedef struct abpoa_para_t:
int m
int *mat # score matrix
int use_score_matrix
int match, mismatch, gap_open1, gap_open2, gap_ext1, gap_ext2
int inf_min
int k, w, min_w
Expand Down Expand Up @@ -112,6 +113,7 @@ cdef extern from "abpoa.h":

# init for abpoa parameters
abpoa_para_t *abpoa_init_para()
void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mtx_fn)
void abpoa_post_set_para(abpoa_para_t *abpt)
void abpoa_free_para(abpoa_para_t *abpt)

Expand Down
13 changes: 9 additions & 4 deletions python/pyabpoa.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ cdef class msa_aligner:
cdef abpoa_para_t abpt
cdef seq_nt4_dict, nt4_seq_dict

def __cinit__(self, aln_mode='g', match=2, mismatch=4, gap_open1=4, gap_open2=24, gap_ext1=2, gap_ext2=1,
def __cinit__(self, aln_mode='g', match=2, mismatch=4, score_matrix='', gap_open1=4, gap_open2=24, gap_ext1=2, gap_ext2=1,
extra_b=10, extra_f=0.01, end_bonus=-1, zdrop=-1, cons_agrm=ABPOA_HB, is_diploid=0, min_freq=0.3):
self.ab = abpoa_init()

Expand All @@ -65,15 +65,20 @@ cdef class msa_aligner:
sys.exit(1)
self.abpt.match = match
self.abpt.mismatch = mismatch
self.abpt.m = 5
self.abpt.mat = <int*>malloc(25 * cython.sizeof(int))

if score_matrix != '':
self.abpt.use_score_matrix = 1
if isinstance(score_matrix, str): score_matrix = bytes(score_matrix, 'utf-8')
abpoa_set_mat_from_file(&self.abpt, score_matrix)
else: self.abpt.use_score_matrix = 0
self.abpt.gap_open1 = gap_open1
self.abpt.gap_open2 = gap_open2
self.abpt.gap_ext1 = gap_ext1
self.abpt.gap_ext2 = gap_ext2
self.abpt.ret_cigar = 1

self.abpt.m = 5
self.abpt.mat = <int*>malloc(25 * cython.sizeof(int))

self.abpt.wb = extra_b
self.abpt.wf = extra_f
self.abpt.end_bonus = end_bonus
Expand Down
8 changes: 6 additions & 2 deletions src/abpoa.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ char PROG[20] = "abpoa";
#define _bO BOLD UNDERLINE "O" NONE
#define _bA BOLD UNDERLINE "A" NONE
char DESCRIPTION[100] = _ba "daptive " _bb "anded " _bP "artial " _bO "rder " _bA "lignment";
char VERSION[20] = "1.2.1";
char VERSION[20] = "1.2.2";
char CONTACT[30] = "gaoy286@mail.sysu.edu.cn";

const struct option abpoa_long_opt [] = {
{ "align-mode", 1, NULL, 'm' },

{ "match", 1, NULL, 'M' },
{ "mismatch", 1, NULL, 'X' },
{ "matrix", 1, NULL, 't' },
{ "gap-open", 1, NULL, 'O' },
{ "gap-ext", 1, NULL, 'E' },

Expand Down Expand Up @@ -70,6 +71,8 @@ int abpoa_usage(void)
err_printf(" %d: global, %d: local, %d: extension\n", ABPOA_GLOBAL_MODE, ABPOA_LOCAL_MODE, ABPOA_EXTEND_MODE);
err_printf(" -M --match INT match score [%d]\n", ABPOA_MATCH);
err_printf(" -X --mismatch INT mismatch penalty [%d]\n", ABPOA_MISMATCH);
err_printf(" -t --matrix FILE scoring matrix file, \'-M\' and \'-X\' are not used when \'-t\' is used [NULL]\n");
err_printf(" e.g., \'HOXD70.mtx\'\n");
err_printf(" -O --gap-open INT(,INT) gap opening penalty (O1,O2) [%d,%d]\n", ABPOA_GAP_OPEN1, ABPOA_GAP_OPEN2);
err_printf(" -E --gap-ext INT(,INT) gap extension penalty (E1,E2) [%d,%d]\n", ABPOA_GAP_EXT1, ABPOA_GAP_EXT2);
err_printf(" %s provides three gap penalty modes, cost of a g-long gap:\n", NAME);
Expand Down Expand Up @@ -152,7 +155,7 @@ int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){

int main(int argc, char **argv) {
int c, m, in_list=0; char *s; abpoa_para_t *abpt = abpoa_init_para();
while ((c = getopt_long(argc, argv, "m:M:X:O:E:b:f:z:e:Nk:w:n:i:lpso:Ar:g:a:dq:hv", abpoa_long_opt, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:Nk:w:n:i:lpso:Ar:g:a:dq:hv", abpoa_long_opt, NULL)) >= 0) {
switch(c)
{
case 'm': m = atoi(optarg);
Expand All @@ -161,6 +164,7 @@ int main(int argc, char **argv) {
} abpt->align_mode=m; break;
case 'M': abpt->match = atoi(optarg); break;
case 'X': abpt->mismatch = atoi(optarg); break;
case 't': abpt->use_score_matrix = 1; abpoa_set_mat_from_file(abpt, optarg); break;
case 'O': abpt->gap_open1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_open2 = strtol(s+1, &s, 10); break;
case 'E': abpt->gap_ext1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_ext2 = strtol(s+1, &s, 10); break;

Expand Down
2 changes: 2 additions & 0 deletions src/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ typedef struct {

typedef struct {
int m; int *mat; // score matrix
int use_score_matrix; // set _mat_ based on score matrix file, then _match_/_mismatch_ is not used.
int match, mismatch, gap_open1, gap_open2, gap_ext1, gap_ext2; int inf_min;
// minimizer seeding parameter
int k, w, min_w;
Expand Down Expand Up @@ -120,6 +121,7 @@ typedef struct {

// init for abpoa parameters
abpoa_para_t *abpoa_init_para(void);
void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mtx_fn);
void abpoa_post_set_para(abpoa_para_t *abpt);
void abpoa_free_para(abpoa_para_t *abpt);

Expand Down
56 changes: 55 additions & 1 deletion src/abpoa_align.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,59 @@ void gen_simple_mat(int m, int *mat, int match, int mismatch) {
mat[(m - 1) * m + j] = 0;
}

// Parse the header line of a scoring-matrix file.
//
// The header lists the column bases (A/C/G/T/N, either case) separated by
// whitespace. For the n-th base found, order[n] receives its code:
// A=0, C=1, G=2, T=3, N=4.
//
//   l:     NUL-terminated header line
//   order: output array with room for exactly 5 ints (the caller in this
//          file allocates 5)
//
// Reports a fatal error via err_fatal() when the line contains an unknown
// character, more than 5 bases (which would overflow _order_), or fewer than
// 5 bases (which would leave part of _order_ unset for the score rows).
void parse_mat_first_line(char *l, int *order) {
    enum { N_BASES = 5 };
    int i, n = 0;
    for (i = 0; l[i]; ++i) {
        int code;
        // <ctype.h> functions require a value representable as unsigned char
        if (isspace((unsigned char)l[i])) continue;
        switch (l[i]) {
            case 'A': case 'a': code = 0; break;
            case 'C': case 'c': code = 1; break;
            case 'G': case 'g': code = 2; break;
            case 'T': case 't': code = 3; break;
            case 'N': case 'n': code = 4; break;
            default:
                err_fatal(__func__, "Unknown base: \"%c\"\n", l[i]);
                continue;
        }
        if (n >= N_BASES) {
            err_fatal(__func__, "Too many bases in matrix header (expected %d).\n", N_BASES);
            return; // never write past order[N_BASES-1], even if err_fatal returns
        }
        order[n++] = code;
    }
    if (n != N_BASES)
        err_fatal(__func__, "Matrix header has %d bases, expected %d (A/C/G/T/N).\n", n, N_BASES);
}

extern char nt4_table[256];
// Parse one score row of a scoring-matrix file into _mat_.
//
// A row is a base letter followed by up to _m_ integer scores. The letter is
// mapped to a row index through nt4_table; the n-th score on the line is
// stored at mat[row * m + order[n]], i.e. in the column given by the header.
// Any character that is not a letter, digit, '+' or '-' acts as a separator.
//
//   l:     NUL-terminated line
//   order: column codes produced by parse_mat_first_line()
//   m:     matrix dimension (_mat_ holds m * m ints)
//   mat:   output score matrix
//
// Reports a fatal error when the row base maps outside [0, m), when a token
// in score position is not a valid integer, or when the row has more than
// _m_ scores.
void parse_mat_score_line(char *l, int *order, int m, int *mat) {
    int n = 0, is_base = 1, _i = -1;
    char *str = l, *pEnd = NULL;
    while (*str) {
        // index tables / ctype with an unsigned value; plain char may be negative
        unsigned char c = (unsigned char)*str;
        if (!isalpha(c) && !isdigit(c) && c != '+' && c != '-') { ++str; continue; }
        if (is_base) { // get base
            _i = nt4_table[c];
            if (_i < 0 || _i >= m) {
                err_fatal(__func__, "Unknown base: \"%c\" (%d).\n", *str, _i);
                return;
            }
            is_base = 0;
            ++str;
        } else { // get score
            long s;
            if (n == m) {
                err_fatal_simple("Too many scores in matrix.\n");
                return;
            }
            s = strtol(str, &pEnd, 10);
            if (pEnd == str) { // nothing was converted: not a number
                err_fatal(__func__, "Invalid score starting at \"%c\".\n", *str);
                return;
            }
            mat[_i * m + order[n]] = (int)s;
            n++;
            // Resume exactly where strtol stopped. Advancing one further would
            // step over the terminating NUL when the number ends the line.
            str = pEnd;
        }
    }
}

// Fill abpt->mat from a scoring-matrix file.
//
// File layout: lines whose first non-blank character is '#' are comments and
// blank lines are ignored; the first remaining line is the header naming the
// column bases; every following line is one score row.
//
//   abpt:   parameters; abpt->mat must already hold abpt->m * abpt->m ints
//   mtx_fn: path of the matrix file
//
// Reports a fatal error when the file cannot be opened or contains no header.
// NOTE(review): _order_ has 5 slots, so this assumes abpt->m <= 5 -- confirm
// against abpoa_init_para().
void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mtx_fn) {
    char l[1024];
    int order[5] = {0, 1, 2, 3, 4}; // defined fallback so _order_ is never read unset
    int first_line = 1, i;
    FILE *fp = fopen(mtx_fn, "r");
    if (fp == NULL) {
        err_fatal(__func__, "Unable to open scoring matrix file: \"%s\"\n", mtx_fn);
        return;
    }
    // cells not covered by the file are 0 instead of indeterminate
    for (i = 0; i < abpt->m * abpt->m; ++i) abpt->mat[i] = 0;
    while (fgets(l, sizeof(l), fp) != NULL) {
        const char *p = l;
        while (*p == ' ' || *p == '\t' || *p == '\r' || *p == '\n') ++p;
        if (*p == '\0' || *p == '#') continue; // blank line or comment
        if (first_line) {
            first_line = 0;
            // get A/C/G/T/N bases
            parse_mat_first_line(l, order);
        } else {
            // get match/mismatch scores
            parse_mat_score_line(l, order, abpt->m, abpt->mat);
        }
    }
    fclose(fp);
    if (first_line)
        err_fatal(__func__, "No scoring matrix found in file: \"%s\"\n", mtx_fn);
}

void abpoa_set_gap_mode(abpoa_para_t *abpt) {
if (abpt->gap_open1 == 0) abpt->gap_mode = ABPOA_LINEAR_GAP;
else if (abpt->gap_open1 > 0 && abpt->gap_open2 == 0) abpt->gap_mode = ABPOA_AFFINE_GAP;
Expand Down Expand Up @@ -54,6 +107,7 @@ abpoa_para_t *abpoa_init_para(void) {
abpt->mat = (int*)_err_malloc(abpt->m * abpt->m * sizeof(int));

// score matrix
abpt->use_score_matrix = 0;
abpt->match = ABPOA_MATCH;
abpt->mismatch = ABPOA_MISMATCH;
abpt->gap_open1 = ABPOA_GAP_OPEN1;
Expand All @@ -73,7 +127,7 @@ abpoa_para_t *abpoa_init_para(void) {
}

void abpoa_post_set_para(abpoa_para_t *abpt) {
gen_simple_mat(abpt->m, abpt->mat, abpt->match, abpt->mismatch);
if (abpt->use_score_matrix == 0) gen_simple_mat(abpt->m, abpt->mat, abpt->match, abpt->mismatch);
abpoa_set_gap_mode(abpt);
if (abpt->cons_agrm == ABPOA_HC || abpt->out_msa || abpt->out_gfa || abpt->is_diploid) {
abpt->use_read_ids = 1;
Expand Down

0 comments on commit 9abfea1

Please sign in to comment.