-
Notifications
You must be signed in to change notification settings - Fork 0
/
snpSeg.c
184 lines (171 loc) · 6.43 KB
/
snpSeg.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/*
* =====================================================================================
*
* Filename: snpSeg.c
*
* Description: 用来生成所有中间位置为snp的segment。
* segment长49bp,中间元素为snp。
*
* Version: 1.0
* Created: 09/22/2012 02:04:11 PM
* Revision: none
* Compiler: gcc
*
* Author: Quan, Wei (mn), wquanhit@gmail.com
* Company: BIC, HIT, China
*
* =====================================================================================
*/
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>
#include <time.h>
#include "kseq.h"
#include "hapmap.h"
KSEQ_INIT(gzFile, gzread)
int ss_core(const char *fn_ref, const char *fn_hm, const char *fn_out)
{
clock_t ss_startTime, ss_endTime;
ss_startTime = clock();
int i, j;
gzFile fp_ref = Z_NULL;FILE *fp_hm = NULL, *fp_out= NULL;
kseq_t *seq; hapmap_t *hm;
//init
fp_ref = gzopen(fn_ref, "r");
if (fp_ref == Z_NULL){
fprintf(stderr, "gzopen %s fail!\n", argv[1]);
exit(1);
}
fp_hm = fopen(fn_hm, "r");
if (fp_hm == NULL){
fprintf(stderr, "fopen %s fail!\n", argv[2]);
exit(1);
}
fp_out = fopen(fn_out, "w");
if (fp_out == NULL){
fprintf(stderr, "fopen %s fail!\n", argv[3]);
exit(1);
}
seq = kseq_init(fp_ref);
hm = hapmap_init(fp_hm);
//core loop
int l; uint32_t tot_l = 0;
uint32_t snp_tot_num = 0;
//uint32_t segment_refid;
while( (l = kseq_read(seq)) > 0 )
{
uint32_t *snp_pos, snp_num; uint8_t *snp_type;
const int WIN_SNP_DISTANCE = 24;//窗口中核心snp距离最左端的距离,也是最右端距离
int win_start, win_end;
int win_snp_start,win_snp_end, win_snp_num, win_snp_mid;
int win_segment_num, segment_num_factor1, segment_num_factor2;
char snp_type_cur;
fprintf(stderr, "[GLP]: generate local pattern in %s.\n", seq->name.s);
hapmap_readhm(hm);
snp_num = hm->snp_num;
snp_pos = hm->snp_pos;
snp_type = hm->snp_type;
if ( strcmp(hm->chrID, seq->name.s) != 0 ){
fprintf(stderr, "ref chrID not match hapmap chrID!\n");
exit(1);
}
win_snp_mid = 0;
while(win_snp_mid < snp_num){
//compute snp num in current window
// [win_snp_start, win_snp_end)
win_snp_start = win_snp_mid;
win_snp_end = win_snp_start +1;
if(win_snp_start >= 1){
while( snp_pos[win_snp_mid] - snp_pos[win_snp_start -1] <= WIN_SNP_DISTANCE){
--win_snp_start;
if(win_snp_start == 0) break;
}
}
if(win_snp_end <= snp_num){
while( snp_pos[win_snp_end]- snp_pos[win_snp_mid] <= WIN_SNP_DISTANCE ){
++win_snp_end;
if(win_snp_end == snp_num ) break;
}
}
win_snp_num = win_snp_end - win_snp_start;
if(win_snp_num > WIN_MAX_SNP_NUM){
win_snp_mid++;
continue;
}
//compute window boundery
//win_start = snp_pos[win_snp_start] - WIN_SNP_DISTANCE;
//win_end = snp_pos[win_snp_end-1] +WIN_SNP_DISTANCE;
win_start = (snp_pos[win_snp_mid] > WIN_SNP_DISTANCE)? (snp_pos[win_snp_mid]- WIN_SNP_DISTANCE) : 0;
win_end = snp_pos[win_snp_mid] + WIN_SNP_DISTANCE < l ?
snp_pos[win_snp_mid] + WIN_SNP_DISTANCE : l-1; ;
//compute segment num in current window
win_segment_num = hapmap_get_snptypenum(snp_type[win_snp_start]);
for(i = 1; i < win_snp_num; ++i){
win_segment_num *= hapmap_get_snptypenum(snp_type[win_snp_start+i]);
}
// fprintf(fp_out, ">%d\t%u\t%u\n", snp_tot_num++, snp_pos[win_snp_mid]+tot_l -24, snp_pos[win_snp_mid]);
fprintf(fp_out, ">%d\t%u\n", snp_tot_num++, snp_pos[win_snp_mid]+tot_l +24);
if(snp_tot_num == 1){
fputc('#', fp_out);
}
//遍历该window中每个segment
for(i = 0; i < win_segment_num; ++i){
int snp_type_i;
int k = i;
//if (i == segment_refid) continue;
//generate cur segment sequence
segment_num_factor1 = 1;
for(j = 0; j < win_snp_num; ++j){
snp_type_cur = snp_type[win_snp_start+j];
segment_num_factor1 *= hapmap_get_snptypenum(snp_type_cur);
segment_num_factor2 = win_segment_num / segment_num_factor1;
snp_type_i = k/segment_num_factor2;
if( win_snp_mid - win_snp_start == j
&&hapmap_nt2snptypei(snp_type_cur, snp_type_cur>>4) == snp_type_i){
break;
}
k -= snp_type_i * segment_num_factor2;
seq->seq.s[snp_pos[win_snp_start+j]] = "ACGTN"[hapmap_get_snptype(snp_type_cur, snp_type_i)];
}
//output current segment
//fprintf(fp_out, ">Seg%d\t%d\t%s\n", snp_tot_num++, hm->snp_pos[win_snp_start], hm->chrID);
if( win_snp_mid - win_snp_start == j
&&hapmap_nt2snptypei(snp_type_cur, snp_type_cur>>4) == snp_type_i){
continue;
}
for(j = win_start; j <= win_end; ++j){
fputc(seq->seq.s[j], fp_out);
}
fputc('#', fp_out);fputc('\n', fp_out);
}
++win_snp_mid;
}//end while snp oteration
tot_l += l;
}//end while chrome iteration
//destroy
kseq_destroy(seq);
hapmap_destroy(hm);
{
gzclose(fp_ref);
fclose(fp_hm);
fclose(fp_out);
}
ss_endTime = clock();
fprintf(stderr, "[GLP]:time escaped %lu sec.\n", (ss_endTime - ss_startTime)/CLOCKS_PER_SEC);
}
#define WIN_MAX_SNP_NUM 6
int ss_main(int argc, char *argv[])
{
if (argc < 4){
fprintf(stderr, "Usage: ss3 <in.fasta> <in.hapmap> <out>\n");
return 1;
}
ss_core(argv[1], argv[2], argv[3]);
return 0;
}
#ifdef MAIN_SS
int main(int argc, char *argv[])
{
return ss_main(argc, argv);
}
#endif