/
hgLoadOut.c
389 lines (353 loc) · 11.4 KB
/
hgLoadOut.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
/* hgLoadOut - load RepeatMasker .out files into database. */
/* Copyright (C) 2014 The Regents of the University of California
* See README in this or parent directory for licensing information. */
#include "common.h"
#include "linefile.h"
#include "dystring.h"
#include "options.h"
#include "cheapcgi.h"
#include "hCommon.h"
#include "hdb.h"
#include "jksql.h"
#include "rmskOut.h"
char *createRmskOut = "CREATE TABLE %s (\n"
" bin smallint unsigned not null, # bin index field for range queries\n"
" swScore int unsigned not null, # Smith Waterman alignment score\n"
" milliDiv int unsigned not null, # Base mismatches in parts per thousand\n"
" milliDel int unsigned not null, # Bases deleted in parts per thousand\n"
" milliIns int unsigned not null, # Bases inserted in parts per thousand\n"
" genoName varchar(255) not null, # Genomic sequence name\n"
" genoStart int unsigned not null, # Start in genomic sequence\n"
" genoEnd int unsigned not null, # End in genomic sequence\n"
" genoLeft int not null, # -#bases after match in genomic sequence\n"
" strand char(1) not null, # Relative orientation + or -\n"
" repName varchar(255) not null, # Name of repeat\n"
" repClass varchar(255) not null, # Class of repeat\n"
" repFamily varchar(255) not null, # Family of repeat\n"
" repStart int not null, # Start (if strand is +) or -#bases after match (if strand is -) in repeat sequence\n"
" repEnd int not null, # End in repeat sequence\n"
" repLeft int not null, # -#bases after match (if strand is +) or start (if strand is -) in repeat sequence\n"
" id char(1) not null, # First digit of id field in RepeatMasker .out file. Best ignored.\n"
" #Indices\n";
boolean noBin = FALSE;
boolean split = FALSE;
boolean noSplit = FALSE;
char *tabFileName = NULL;
char *suffix = NULL;
int badRepCnt = 0;
/* command line option specifications */
static struct optionSpec optionSpecs[] = {
{"tabFile", OPTION_STRING},
{"tabfile", OPTION_STRING},
{"nosplit", OPTION_BOOLEAN},
{"noSplit", OPTION_BOOLEAN},
{"split", OPTION_BOOLEAN},
{"table", OPTION_STRING},
{NULL, 0}
};
void usage()
/* Explain usage and exit. */
{
errAbort(
"hgLoadOut - load RepeatMasker .out files into database\n"
"usage:\n"
" hgLoadOut database file(s).out\n"
"For multiple files chrN.out this will create the single table 'rmsk'\n"
"in the database, use the -split argument to obtain separate chrN_rmsk tables.\n"
"options:\n"
" -tabFile=text.tab - don't actually load database, just create tab file\n"
" -split - load chrN_rmsk separate tables even if a single file is given\n"
" -table=name - use a different suffix other than the default (rmsk)");
}
void badFormat(struct lineFile *lf, int id)
/* Print generic bad format message. */
{
errAbort("Badly formatted line %d in %s\n", lf->lineIx, lf->fileName);
}
int makeMilli(char *s, struct lineFile *lf)
/* Convert ascii floating point to parts per thousand representation. */
{
/* Cope with -0.0 and -0.2 etc.*/
if (s[0] == '-')
{
if (!sameString(s, "-0.0"))
warn("Strange perc. field %s line %d of %s", s, lf->lineIx, lf->fileName);
s = "0.0";
}
if (!isdigit(s[0]))
badFormat(lf,1);
return round(10.0*atof(s));
}
int parenSignInt(char *s, struct lineFile *lf)
/* Convert from ascii notation where a parenthesized integer is negative
* to straight int. */
{
if (s[0] == '(')
return -atoi(s+1);
else
return atoi(s);
}
boolean checkRepeat(struct rmskOut *r, struct lineFile *lf)
/* check for bogus repeat */
{
/* this is bogus on both strands */
if (r->repEnd < 0 || r->repStart > r->repEnd)
{
badRepCnt++;
if (verboseLevel() > 1)
{
verbose(2, "bad rep range [%d, %d] line %d of %s \n",
r->repStart, r->repEnd, lf->lineIx, lf->fileName);
}
return FALSE;
}
return TRUE;
}
FILE *theFile = NULL;
struct hash *chromFpHash = NULL;
char *defaultTempName = "rmsk.tab";
FILE *getFileForChrom(char *chrom)
/* Return the appropriate file pointer for the given chrom if -split.
* If not using -split, then there is only one file pointer in action. */
{
static int chromCount = 0;
char *tempName = tabFileName ? tabFileName : defaultTempName;
if (split)
{
if (chromFpHash == NULL)
chromFpHash = hashNew(10);
theFile = (FILE *)hashFindVal(chromFpHash, chrom);
if (theFile == NULL)
{
char fName[512];
if (chromCount > 1000)
{
/* We will probably run into the operating system limit on open
* files well before this point, but just in case... */
errAbort("-split should not be used when there are >1000 sequences"
" -- too many split tables.");
}
safef(fName, sizeof(fName), "%s_%s", chrom, tempName);
theFile = mustOpen(fName, "w");
hashAdd(chromFpHash, chrom, theFile);
chromCount++;
}
}
else
{
if (theFile == NULL)
theFile = mustOpen(tempName, "w");
}
return theFile;
}
void helCarefulClose(struct hashEl *hel)
/* Call carefulClose on hashed file pointer. */
{
FILE *f = (FILE *)(hel->val);
carefulClose(&f);
}
void closeFiles()
/* If split, close each file pointer in chromFpHash. Otherwise close the
* single file pointer that we've been using. */
{
if (split)
{
hashTraverseEls(chromFpHash, helCarefulClose);
}
else
carefulClose(&theFile);
}
void readOneOut(char *rmskFile)
/* Read .out file rmskFile, check each line, and print OK lines to .tab. */
{
struct lineFile *lf;
char *line, *words[24];
int lineSize, wordCount;
/* Open .out file and process header. */
lf = lineFileOpen(rmskFile, TRUE);
if (!lineFileNext(lf, &line, &lineSize))
errAbort("Empty %s", lf->fileName);
if (!startsWith(" SW perc perc", line))
{
if (!startsWith(" SW perc perc", line))
errAbort("%s doesn't seem to be a RepeatMasker .out file, first "
"line seen:\n%s", lf->fileName, line);
}
lineFileNext(lf, &line, &lineSize);
lineFileNext(lf, &line, &lineSize);
/* Process line oriented records of .out file. */
while (lineFileNext(lf, &line, &lineSize))
{
static struct rmskOut r;
char *s;
wordCount = chopLine(line, words);
if (wordCount < 14)
errAbort("Expecting 14 or 15 words line %d of %s",
lf->lineIx, lf->fileName);
r.swScore = atoi(words[0]);
r.milliDiv = makeMilli(words[1], lf);
r.milliDel = makeMilli(words[2], lf);
r.milliIns = makeMilli(words[3], lf);
r.genoName = words[4];
r.genoStart = atoi(words[5])-1;
r.genoEnd = atoi(words[6]);
r.genoLeft = parenSignInt(words[7], lf);
r.strand[0] = (words[8][0] == '+' ? '+' : '-');
r.repName = words[9];
r.repClass = words[10];
char *repClassTest = cloneString(r.repClass);
stripChar(repClassTest, '(');
stripChar(repClassTest, ')');
int nonDigitCount = countLeadingNondigits(repClassTest);
int wordOffset = 0;
// this repClass is only digits, (or only (digits) with surrounding parens)
// this is the sign of an empty field here
// due to custom library in use that has no class/family indication
if (0 == nonDigitCount)
{
wordOffset = 1;
r.repClass = cloneString("Unspecified");
r.repFamily = cloneString("Unspecified");
}
else
{
s = strchr(r.repClass, '/');
if (s == NULL)
r.repFamily = r.repClass;
else
{
*s++ = 0;
r.repFamily = s;
}
}
r.repStart = parenSignInt(words[11-wordOffset], lf);
r.repEnd = atoi(words[12-wordOffset]);
r.repLeft = parenSignInt(words[13-wordOffset], lf);
r.id[0] = ((wordCount > (14-wordOffset)) ? words[14-wordOffset][0] : ' ');
if (checkRepeat(&r, lf))
{
FILE *f = getFileForChrom(r.genoName);
if (!noBin)
fprintf(f, "%u\t", hFindBin(r.genoStart, r.genoEnd));
rmskOutTabOut(&r, f);
}
}
}
void loadOneTable(char *database, struct sqlConnection *conn, char *tempName, char *tableName)
/* Load .tab file tempName into tableName and remove tempName. */
{
struct dyString *query = newDyString(1024);
verbose(1, "Loading up table %s\n", tableName);
if (sqlTableExists(conn, tableName))
{
sqlDyStringPrintf(query, "DROP table %s", tableName);
sqlUpdate(conn, query->string);
}
/* Create first part of table definitions, the fields. */
dyStringClear(query);
sqlDyStringPrintf(query, createRmskOut, tableName);
/* Create the indexes */
if (!noSplit)
{
dyStringAppend(query, " INDEX(bin))\n");
}
else
{
int indexLen = hGetMinIndexLength(database);
sqlDyStringPrintf(query, " INDEX(genoName(%d),bin))\n", indexLen);
}
sqlUpdate(conn, query->string);
/* Load database from tab-file. */
dyStringClear(query);
sqlDyStringPrintf(query, "LOAD data local infile '%s' into table %s",
tempName, tableName);
sqlUpdate(conn, query->string);
remove(tempName);
}
void processOneOut(char *database, struct sqlConnection *conn, char *rmskFile, char *suffix)
/* Read one RepeatMasker .out file and load it into database. */
{
verbose(1, "Processing %s\n", rmskFile);
if (split || noSplit)
errAbort("program error: processOneOut doesn't do -split or -nosplit.");
readOneOut(rmskFile);
/* Create database table (if not -tabFile). */
if (tabFileName == NULL)
{
char dir[256], base[128], extension[64];
char tableName[256];
splitPath(rmskFile, dir, base, extension);
chopSuffix(base);
safef(tableName, sizeof(tableName), "%s_%s", base, suffix);
closeFiles();
loadOneTable(database, conn, defaultTempName, tableName);
}
}
struct sqlConnection *theConn = NULL;
void loadOneSplitTable(struct hashEl *hel)
/* Load the table for the given chrom. */
{
char tempName[512];
char tableName[256];
char *chrom = hel->name;
safef(tempName, sizeof(tempName), "%s_%s", chrom, defaultTempName);
safef(tableName, sizeof(tableName), "%s_%s", chrom, suffix);
loadOneTable(sqlGetDatabase(theConn), theConn, tempName, tableName);
}
void loadSplitTables(char *database, struct sqlConnection *conn)
/* For each chrom in chromHash, load its tempfile into table chrom_suffix. */
{
theConn = conn;
hashTraverseEls(chromFpHash, loadOneSplitTable);
}
void hgLoadOut(char *database, int rmskCount, char *rmskFileNames[], char *suffix)
/* hgLoadOut - load RepeatMasker .out files into database. */
{
struct sqlConnection *conn = NULL;
int i;
if (tabFileName == NULL)
{
conn = hAllocConn(database);
verbose(2,"#\thgLoadOut: connected to database: %s\n", database);
}
for (i=0; i<rmskCount; ++i)
{
if (split || noSplit)
readOneOut(rmskFileNames[i]);
else
processOneOut(database, conn, rmskFileNames[i], suffix);
}
closeFiles();
if (tabFileName == NULL)
{
if (split)
loadSplitTables(database, conn);
else if (noSplit)
loadOneTable(database, conn, defaultTempName, suffix);
}
hFreeConn(&conn);
if (badRepCnt > 0)
{
warn("note: %d records dropped due to repEnd < 0 or repStart > repEnd\n", badRepCnt);
if (verboseLevel() < 2)
warn(" run with -verbose=2 for details\n");
}
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, optionSpecs);
if (argc < 3)
usage();
// default changed to noSplit Jan 2014 , ignore noSplit arguments
split = optionExists("split");
noSplit = ! split;
suffix = optionVal("table", "rmsk");
tabFileName = optionVal("tabFile", tabFileName);
if (tabFileName == NULL)
tabFileName = optionVal("tabfile", tabFileName);
if (split && noSplit)
errAbort("-split and -nosplit cannot be used together.");
hgLoadOut(argv[1], argc-2, argv+2, suffix) ;
return 0;
}