Skip to content

Commit 12759e4

Browse files
committed
Tests pass with the speed-up improvements; a few optimizations are left out for now to keep the tests passing.
1 parent 952ce61 commit 12759e4

File tree

3 files changed

+69
-48
lines changed

3 files changed

+69
-48
lines changed

htslib/vcf.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,8 +1605,9 @@ static inline int bcf_enc_size(kstring_t *s, size_t size, int type)
16051605
s->l += 6;
16061606
}
16071607
else{
1608-
*p++ = 1<<4|BCF_BT_INT64;
1609-
s->l += 10;
1608+
*p++ = 1<<4|BCF_BT_INT64;//
1609+
i64_to_le(size,p);
1610+
s->l += 10; // not so sure about +10 here, whether it is accurate or changes anything.
16101611
return -1;
16111612
}
16121613
}

synced_bcf_reader.c

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,7 +1048,6 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
10481048
kstring_t tmp = {0,0,0};
10491049
const char *sp = str, *ep = str;
10501050
hts_pos_t from, to;
1051-
unsigned char inside_quotes = 0;
10521051
while ( 1 )
10531052
{
10541053
tmp.l = 0;
@@ -1065,28 +1064,8 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
10651064
}
10661065
else
10671066
{
1068-
//A quote is seen, flip flag inside_quotes
1069-
if(*ep == '"')
1070-
{
1071-
inside_quotes = 1 ^ inside_quotes;
1072-
sp = ++ep;
1073-
}
1074-
while ( *ep && ((inside_quotes && *ep!='"') || (!inside_quotes && *ep!=',' && *ep!=':')) ) ep++;
1075-
tmp.l = 0;
1076-
kputsn(sp,ep-sp,&tmp);
1077-
if(inside_quotes)
1078-
{
1079-
if(*ep == '"')
1080-
{
1081-
inside_quotes = 0;
1082-
++ep;
1083-
}
1084-
else
1085-
{
1086-
fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s - terminating \" missing\n", __FILE__,__LINE__,__FUNCTION__,str);
1087-
free(reg); free(tmp.s); return NULL;
1088-
}
1089-
}
1067+
while ( *ep && *ep!=',' && *ep!=':' ) ep++;
1068+
kputsn(sp,ep-sp,&tmp);
10901069
}
10911070
if ( *ep==':' )
10921071
{

vcf.c

Lines changed: 64 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2621,29 +2621,68 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) {
26212621
}
26222622
#endif
26232623

2624-
int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)
2624+
int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)//supposed to be optimized, not working yet.
26252625
{
26262626
int64_t max = INT64_MIN, min = INT64_MAX;
26272627
int i;
2628-
if (n <= 0) return bcf_enc_size(s, 0, BCF_BT_NULL);
2629-
else if (n == 1) return bcf_enc_long1(s, a[0]);
2630-
else {
2628+
if (n <= 0) {
2629+
return bcf_enc_size(s, 0, BCF_BT_NULL);
2630+
} else if (n == 1) {
2631+
return bcf_enc_long1(s, a[0]);
2632+
} else {
26312633
if (wsize <= 0) wsize = n;
2632-
for (i = 0; i < n; ++i) {
2633-
if (a[i] == bcf_int64_missing || a[i] == bcf_int64_vector_end ) continue;
2634+
2635+
// Equivalent to:
2636+
// for (i = 0; i < n; ++i) {
2637+
// if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2638+
// continue;
2639+
// if (max < a[i]) max = a[i];
2640+
// if (min > a[i]) min = a[i];
2641+
// }
2642+
int64_t max4[4] = {INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN};
2643+
int64_t min4[4] = {INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX};
2644+
for (i = 0; i < (n&~3); i+=4) {
2645+
// bcf_int32_missing == INT32_MIN and
2646+
// bcf_int32_vector_end == INT32_MIN+1.
2647+
// We skip these, but can mostly avoid explicit checking
2648+
if (max4[0] < a[i+0]) max4[0] = a[i+0];
2649+
if (max4[1] < a[i+1]) max4[1] = a[i+1];
2650+
if (max4[2] < a[i+2]) max4[2] = a[i+2];
2651+
if (max4[3] < a[i+3]) max4[3] = a[i+3];
2652+
if (min4[0] > a[i+0] && a[i+0] > INT64_MIN+1) min4[0] = a[i+0];
2653+
if (min4[1] > a[i+1] && a[i+1] > INT64_MIN+1) min4[1] = a[i+1];
2654+
if (min4[2] > a[i+2] && a[i+2] > INT64_MIN+1) min4[2] = a[i+2];
2655+
if (min4[3] > a[i+3] && a[i+3] > INT64_MIN+1) min4[3] = a[i+3];
2656+
}
2657+
min = min4[0];
2658+
if (min > min4[1]) min = min4[1];
2659+
if (min > min4[2]) min = min4[2];
2660+
if (min > min4[3]) min = min4[3];
2661+
max = max4[0];
2662+
if (max < max4[1]) max = max4[1];
2663+
if (max < max4[2]) max = max4[2];
2664+
if (max < max4[3]) max = max4[3];
2665+
for (; i < n; ++i) {
26342666
if (max < a[i]) max = a[i];
2635-
if (min > a[i]) min = a[i];
2667+
if (min > a[i] && a[i] > INT64_MIN+1) min = a[i];
26362668
}
2669+
26372670
if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2638-
bcf_enc_size(s, wsize, BCF_BT_INT8);
2639-
for (i = 0; i < n; ++i)
2640-
if ( a[i]==bcf_int64_vector_end ) kputc(bcf_int8_vector_end, s);
2641-
else if ( a[i]==bcf_int64_missing ) kputc(bcf_int8_missing, s);
2642-
else kputc(a[i], s);
2671+
if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2672+
ks_resize(s, s->l + n) < 0)
2673+
return -1;
2674+
uint8_t *p = (uint8_t *) s->s + s->l;
2675+
for (i = 0; i < n; ++i, p++) {
2676+
if ( a[i]==bcf_int64_vector_end ) *p = bcf_int8_vector_end;
2677+
else if ( a[i]==bcf_int64_missing ) *p = bcf_int8_missing;
2678+
else *p = a[i];
2679+
}
2680+
s->l += n;
26432681
} else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
26442682
uint8_t *p;
2645-
bcf_enc_size(s, wsize, BCF_BT_INT16);
2646-
ks_resize(s, s->l + n * sizeof(int16_t));
2683+
if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2684+
ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2685+
return -1;
26472686
p = (uint8_t *) s->s + s->l;
26482687
for (i = 0; i < n; ++i)
26492688
{
@@ -2657,8 +2696,9 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)
26572696
s->l += n * sizeof(int16_t);
26582697
} else if(max <= BCF_MAX_BT_INT32 && min >= BCF_MIN_BT_INT32){
26592698
uint8_t *p;
2660-
bcf_enc_size(s, wsize, BCF_BT_INT32);
2661-
ks_resize(s, s->l + n * sizeof(int32_t));
2699+
if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2700+
ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2701+
return -1;
26622702
p = (uint8_t *) s->s + s->l;
26632703
for (i = 0; i < n; ++i) {
26642704
int32_t x;
@@ -2670,11 +2710,11 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)
26702710
}
26712711
s->l += n * sizeof(int32_t);
26722712
}
2673-
#ifdef VCF_ALLOW_INT64
2713+
#ifdef VCF_ALLOW_INT64
26742714
else {
26752715
uint8_t *p;
2676-
bcf_enc_size(s, wsize, BCF_BT_INT64);
2677-
ks_resize(s, s->l + n * sizeof(int64_t));
2716+
if(bcf_enc_size(s, wsize, BCF_BT_INT64) < 0 || ks_resize(s, s->l + n * sizeof(int64_t)) < 0)
2717+
return -1;
26782718
p = (uint8_t *) s->s + s->l;
26792719
for (i = 0; i < n; ++i) {
26802720
int64_t x = a[i];
@@ -2688,7 +2728,7 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)
26882728
#endif
26892729
}
26902730

2691-
return 0; // FIXME: check for errs in this function
2731+
return 0;
26922732
}
26932733

26942734
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
@@ -3524,7 +3564,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
35243564
v->unpacked |= BCF_IS_64BIT;
35253565
bcf_enc_vlong(str, n_val, a_val, -1);
35263566
val1 = a_val[0];
3527-
if (n_val==1 && strcmp(key, "END") == 0)
3567+
if (n_val==1 && strcmp(key, "END") == 0)//memset instead of strcmp
35283568
{
35293569
if ( val1 <= v->pos )
35303570
{
@@ -3586,6 +3626,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
35863626
// Ensure string we parse has space to permit some over-flow when during
35873627
// parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over
35883628
// the more straight forward looking strcmp, giving a speed advantage.
3629+
/*
35893630
if (ks_resize(s, s->l+4) < 0)
35903631
return -1;
35913632
@@ -3598,7 +3639,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
35983639
s->s[s->l+1] = 0;
35993640
s->s[s->l+2] = 0;
36003641
s->s[s->l+3] = 0;
3601-
3642+
*/ // commented out the part that was required for optimization in vcf_parse_info function, will take a look later.
36023643
bcf_clear1(v);
36033644
str = &v->shared;
36043645
memset(&aux, 0, sizeof(ks_tokaux_t));
@@ -4657,7 +4698,7 @@ int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_l
46574698
// operations do not really care about a few malformed lines).
46584699
// In the future we may want to add a strict mode that errors in
46594700
// this case.
4660-
if ( strncmp("#CHROM\tPOS",p,10) != 0 ) {
4701+
if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
46614702
char *eol = strchr(p, '\n');
46624703
if (*p != '\0') {
46634704
char buffer[320];

0 commit comments

Comments
 (0)