/
udc.c
2104 lines (1922 loc) · 62.8 KB
/
udc.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Copyright (C) 2014 The Regents of the University of California
* See README in this or parent directory for licensing information. */
/* udc - url data cache - a caching system that keeps blocks of data fetched from URLs in
* sparse local files for quick use the next time the data is needed.
*
* This cache is enormously simplified by there being no local _write_ to the cache,
* just reads.
*
* The overall strategy of the implementation is to have a root cache directory
* with a subdir for each file being cached. The directory for a single cached file
* contains two files - "bitmap" and "sparseData" that contains information on which
* parts of the URL are cached and the actual cached data respectively. The subdirectory name
* associated with the file is constructed from the URL in a straightforward manner.
* http://genome.ucsc.edu/cgi-bin/hgGateway
* gets mapped to:
* rootCacheDir/http/genome.ucsc.edu/cgi-bin/hgGateway/
* The URL protocol is the first directory under the root, and the remainder of the
* URL, with some necessary escaping, is used to define the rest of the cache directory
* structure, with each '/' after the protocol line translating into another directory
* level.
*
* The bitmap file contains time stamp and size data as well as an array with one bit
* for each block of the file that has been fetched. Currently the block size is 8K. */
#include <sys/file.h>
#include <sys/mman.h>
#include "common.h"
#include "hash.h"
#include "obscure.h"
#include "bits.h"
#include "linefile.h"
#include "portable.h"
#include "sig.h"
#include "net.h"
#include "cheapcgi.h"
#include "htmlPage.h"
#include "udc.h"
#include "hex.h"
#include <dirent.h>
#include <openssl/sha.h>
/* The stdio stream we'll use to output statistics on file i/o.  Off (NULL) by default. */
FILE *udcLogStream = NULL;

void udcSetLog(FILE *fp)
/* Turn on logging of file i/o to fp.  Passing NULL stores NULL again, which is the
 * same "off" state the stream starts in.
 * For each UDC file two lines are written.  One line for the open, and one line for the close.
 * The Open line just has the URL being opened.
 * The Close line has the URL plus a bunch of counts of the number of seeks, reads, and writes
 * for the following four files: the udc bitmap, the udc sparse data, the incoming calls
 * to the UDC layer, and the network connection to the (possibly) remote file.
 * There are two additional counts: the number of socket connects, and the
 * number of times a socket is reused instead of closed and reopened.
 */
{
udcLogStream = fp;
/* Only announce the start of the log when there is a stream to write to. */
if (fp != NULL)
    fprintf(fp, "Begin\n");
}
struct ioStats
/* Statistics concerning reads, writes and seeks on a single file or connection.
 * The counters are bumped by the our*() i/o wrappers defined later in this file. */
{
bits64 numSeeks; /* The number of seeks on this file */
bits64 numReads; /* The number of read calls made on this file */
bits64 bytesRead; /* The number of bytes read from this file */
bits64 numWrites; /* The number of write calls made on this file */
bits64 bytesWritten; /* The number of bytes written to this file */
};
struct ios
/* Statistics concerning reads and seeks for sparse, bitmap, url, and to us.
 * One of these is embedded in each struct udcFile. */
{
struct ioStats bit; /* Statistics on file i/o to the bitmap file. */
struct ioStats sparse; /* Statistics on file i/o to the sparse data file. */
struct ioStats udc; /* Statistics on file i/o from the application to us. */
struct ioStats net; /* Statistics on file i/o over the network. */
bits64 numConnects; /* The number of socket connections made. */
bits64 numReuse; /* The number of socket reuses (open socket kept instead of reconnecting). */
};
#define udcBlockSize (8*1024)
/* Size in bytes of one cache block (8K).  All fetch requests are rounded up to block size. */
#define udcMaxBytesPerRemoteFetch (udcBlockSize * 32)
/* Very large remote reads are broken down into chunks this size (32 blocks). */
struct connInfo
/* Socket descriptor and associated info, for keeping net connections open.
 * connInfoGetSocket() treats socket <= 0 as "no open connection". */
{
int socket; /* Socket descriptor for data connection (or 0). */
bits64 offset; /* Current file offset of socket. */
int ctrlSocket; /* (FTP only) Control socket descriptor or 0. */
char *redirUrl; /* (HTTP(S) only) use redirected url; NULL if none recorded */
};
typedef int (*UdcDataCallback)(char *url, bits64 offset, int size, void *buffer,
struct udcFile *file);
/* Type for callback function that fetches file data: read size bytes at offset of url
 * into buffer.  The implementations in this file return the number of bytes read. */
struct udcRemoteFileInfo
/* Information about a remote file, as filled in by a UdcInfoCallback. */
{
bits64 updateTime; /* Last update in seconds since 1970 */
bits64 size; /* Remote file size in bytes */
struct connInfo ci; /* Connection info for open net connection */
};
typedef boolean (*UdcInfoCallback)(char *url, struct udcRemoteFileInfo *retInfo);
/* Type for callback function that fetches file timestamp and size into *retInfo.
 * The implementations in this file return FALSE when the information can't be had. */
struct udcProtocol
/* Something to handle a communications protocol like http, https, ftp, local file i/o, etc.
 * Instances are built by udcProtocolNew(). */
{
struct udcProtocol *next; /* Next in list */
UdcDataCallback fetchData; /* Data fetcher */
UdcInfoCallback fetchInfo; /* Timestamp & size fetcher */
char *type; /* String constant naming the handler family: "local", "slow", "http"
             * (also used for https), "ftp" or "transparent".  Not allocated. */
};
struct udcFile
/* A file handle for our caching system. */
{
struct udcFile *next; /* Next in list. */
char *url; /* Name of file - includes protocol */
char *protocol; /* The URL up to the first colon. http: etc. */
struct udcProtocol *prot; /* Protocol specific data and methods. */
time_t updateTime; /* Last modified timestamp. */
bits64 size; /* Size of file. */
bits64 offset; /* Current offset in file. */
char *cacheDir; /* Directory for cached file parts. */
char *bitmapFileName; /* Name of bitmap file. */
char *sparseFileName; /* Name of sparse data file. */
char *redirFileName; /* Name of redir file. */
int fdSparse; /* File descriptor for sparse data file. */
boolean sparseReadAhead; /* Read-ahead has something in the buffer */
char *sparseReadAheadBuf; /* Read-ahead buffer, if any */
bits64 sparseRAOffset; /* Read-ahead buffer offset */
struct udcBitmap *bits; /* Open bitmap control structure (set by setInitialCachedDataBounds) */
bits64 startData; /* Start of area in file we know to have data. */
bits64 endData; /* End of area in file we know to have data. */
bits32 bitmapVersion; /* Version of associated bitmap we were opened with. */
struct connInfo connInfo; /* Connection info for open net connection. */
void *mmapBase; /* pointer to memory address if file has been mmapped, or NULL */
struct ios ios; /* Statistics on file access. */
};
struct udcBitmap
/* The control structure including the bitmap of blocks that are cached.
 * Filled in from the bitmap file header by udcBitmapOpen(). */
{
struct udcBitmap *next; /* Next in list. */
bits32 blockSize; /* Number of bytes per block of file. */
bits64 remoteUpdate; /* Remote last update time. */
bits64 fileSize; /* File size */
bits32 version; /* Version - increments each time cache is stale. */
bits64 localUpdate; /* Time we last fetched new data into cache (bitmap file st_mtime). */
bits64 localAccess; /* Time we last accessed data (bitmap file st_atime). */
boolean isSwapped; /* If true need to swap all bytes on read. */
int fd; /* Open file descriptor of the bitmap file; closed by udcBitmapClose(). */
};
/* Names of the files kept inside each cached file's directory. */
static char *bitmapName = "bitmap";
static char *sparseDataName = "sparseData";
static char *redirName = "redir";
/* Size in bytes of the fixed header at the start of a bitmap file. */
#define udcBitmapHeaderSize (64)
/* NOTE(review): presumably a cache timeout in seconds; not used in this part of the file. */
static int cacheTimeout = 0;
/* Largest forward gap that connInfoGetSocket() reads and discards rather than reconnecting. */
#define MAX_SKIP_TO_SAVE_RECONNECT (udcMaxBytesPerRemoteFetch / 2)
static off_t ourMustLseek(struct ioStats *ioStats, int fd, off_t offset, int whence)
/* Seek on fd through mustLseek, counting the seek in ioStats.
 * Returns whatever mustLseek returns. */
{
off_t newPos;
ioStats->numSeeks += 1;
newPos = mustLseek(fd, offset, whence);
return newPos;
}
static void ourMustWrite(struct ioStats *ioStats, int fd, void *buf, size_t size)
/* Write size bytes from buf to fd through mustWriteFd, recording one write
 * of size bytes in ioStats before the write is attempted. */
{
ioStats->bytesWritten += size;
ioStats->numWrites += 1;
mustWriteFd(fd, buf, size);
}
static size_t ourRead(struct ioStats *ioStats, int fd, void *buf, size_t size)
/* Read up to size bytes from fd into buf with read(2), counting the call in ioStats.
 * Returns read()'s result converted to size_t, so a failed read (-1) comes back as
 * (size_t)-1; callers in this file store the result in a signed variable to detect it.
 * Only bytes that actually arrived are added to ioStats->bytesRead. */
{
ioStats->numReads++;
ssize_t got = read(fd, buf, size);
/* Don't let an error return (-1) wrap around and corrupt the byte counter. */
if (got > 0)
    ioStats->bytesRead += got;
return got;
}
static void ourMustRead(struct ioStats *ioStats, int fd, void *buf, size_t size)
/* Read exactly size bytes from fd into buf through mustReadFd, recording one read
 * of size bytes in ioStats before the read is attempted. */
{
ioStats->bytesRead += size;
ioStats->numReads += 1;
mustReadFd(fd, buf, size);
}
static size_t ourFread(struct ioStats *ioStats, void *buf, size_t size, size_t nmemb, FILE *stream)
/* fread() nmemb items of size bytes from stream into buf, counting the call in ioStats.
 * Returns the number of items fread() actually delivered.  The byte counter is advanced
 * by the bytes delivered, not the bytes requested, so short reads at EOF are not overstated. */
{
ioStats->numReads++;
size_t itemsRead = fread(buf, size, nmemb, stream);
ioStats->bytesRead += size * itemsRead;
return itemsRead;
}
static void udcReadAndIgnore(struct ioStats *ioStats, int sd, bits64 size)
/* Read size bytes from sd, discard them, and return.
 * Aborts with errnoAbort on a read error and with errAbort if the stream
 * ends before size bytes have been consumed. */
{
/* Scratch buffer allocated once and reused across calls. */
static char *buf = NULL;
if (buf == NULL)
    buf = needMem(udcBlockSize);
bits64 remaining = size, total = 0;
while (remaining > 0)
    {
    bits64 chunkSize = min(remaining, udcBlockSize);
    ssize_t rd = ourRead(ioStats, sd, buf, chunkSize);
    if (rd < 0)
        errnoAbort("udcReadAndIgnore: error reading socket after %lld bytes", (long long)total);
    /* A read of 0 means EOF.  Without this the loop would never make progress
     * and the check below could never fire. */
    if (rd == 0)
        break;
    remaining -= rd;
    total += rd;
    }
if (total < size)
    errAbort("udcReadAndIgnore: got EOF at %lld bytes (wanted %lld)",
             (long long)total, (long long)size);
}
static int connInfoGetSocket(struct udcFile *file, char *url, bits64 offset, int size)
/* Return a socket from which data at offset of url can be read, reusing the socket kept in
 * file->connInfo when possible:
 *  - if the open socket is already at offset it is returned as is;
 *  - if offset differs and the gap is at most MAX_SKIP_TO_SAVE_RECONNECT bytes, the gap is
 *    read and thrown away and the same socket is returned;
 *  - otherwise the socket is closed, a new one is opened and recorded in file->connInfo.
 * Returns -1 if a new connection can't be opened or its HTTP header can't be skipped. */
{
/* NOTE: This doesn't use HTTP 1.1 keep alive to do multiple requests on the
 * same socket. The only way subsequent random requests on the same socket
 * work is because previous requests are open-ended and this can continue
 * reading where it left off. The HTTP requests are issued as 1.0, even
 * though range requests are a 1.1 feature.
 *
 * For FTP, the serial read approach is essential. FTP only supports resuming
 * from an offset, but does not support limiting the number of bytes
 * transferred. All that can be done to stop the transfer is to abort the
 * operation, which then requires reconnecting.
 */
/* ci is the address of a member of *file, so the ci == NULL tests below are
 * not expected to ever be true. */
struct connInfo *ci = &file->connInfo;
if (ci != NULL && ci->socket > 0 && ci->offset != offset)
{
/* NOTE(review): if bits64 is unsigned, a request behind the socket's position
 * wraps to a huge skipSize and so takes the reopen branch - confirm. */
bits64 skipSize = (offset - ci->offset);
if (skipSize > 0 && skipSize <= MAX_SKIP_TO_SAVE_RECONNECT)
{
verbose(4, "skipping %lld bytes @%lld to avoid reconnect\n", skipSize, ci->offset);
udcReadAndIgnore(&file->ios.net, ci->socket, skipSize);
ci->offset = offset;
file->ios.numReuse++;
}
else
{
verbose(4, "Offset mismatch (ci %lld != new %lld), reopening.\n", ci->offset, offset);
mustCloseFd(&(ci->socket));
if (ci->ctrlSocket > 0)
mustCloseFd(&(ci->ctrlSocket));
/* NOTE(review): ZeroVar presumably clears the whole struct, including
 * ci->redirUrl, so a previously recorded redirect is not used below - confirm. */
ZeroVar(ci);
}
}
int sd;
if (ci == NULL || ci->socket <= 0)
{
/* No usable open socket: make a new connection. */
file->ios.numConnects++;
if (ci->redirUrl)
{
url = transferParamsToRedirectedUrl(url, ci->redirUrl);
}
// IMPORTANT NOTE: byterange is not a real URL parameter, this is a hack to pass
// the range to the net.c functions, which then parse it.
char rangeUrl[2048];
if (ci == NULL)
{
/* Closed range request on a socket that is not remembered (see note on ci above). */
safef(rangeUrl, sizeof(rangeUrl), "%s;byterange=%lld-%lld",
url, offset, (offset + size - 1));
sd = netUrlOpen(rangeUrl);
}
else
{
/* Open-ended range so later sequential requests can keep reading this socket. */
safef(rangeUrl, sizeof(rangeUrl), "%s;byterange=%lld-", url, offset);
sd = ci->socket = netUrlOpenSockets(rangeUrl, &(ci->ctrlSocket));
ci->offset = offset;
}
if (sd < 0)
return -1;
if (startsWith("http", url))
{
char *newUrl = NULL;
int newSd = 0;
if (!netSkipHttpHeaderLinesHandlingRedirect(sd, rangeUrl, &newSd, &newUrl))
return -1;
if (newUrl)
{
/* A redirect was followed: switch to the socket of the redirected connection. */
freeMem(newUrl);
sd = newSd;
if (ci != NULL)
ci->socket = newSd;
}
}
}
else
sd = ci->socket;
return sd;
}
/********* Section for local file protocol **********/
static char *assertLocalUrl(char *url)
/* Make sure that url is local and return bits past the protocol.
 * An optional leading "local:" is skipped.  What remains must start with '/' and must
 * not contain "..", "~", "//" or "/./", nor end in "/."; otherwise this errAborts.
 * The return value points into url; nothing is allocated. */
{
if (startsWith("local:", url))
    url += 6;
if (url[0] != '/')
    errAbort("Local urls must start at /");
/* NOTE(review): the previous code tested the "/." suffix with endsWith("/.", url).
 * The library's endsWith is believed to take (string, suffix), which would make that
 * test ineffective for paths such as "/a/." - the suffix is checked directly here so
 * the result does not depend on that argument order. */
size_t len = strlen(url);
int endsInSlashDot = (len >= 2 && url[len-2] == '/' && url[len-1] == '.');
if (stringIn("..", url) || stringIn("~", url) || stringIn("//", url) ||
    stringIn("/./", url) || endsInSlashDot)
    {
    errAbort("relative paths not allowed in local urls (%s)", url);
    }
return url;
}
static int udcDataViaLocal(char *url, bits64 offset, int size, void *buffer, struct udcFile *file)
/* Fetch a block of data of given size into buffer from a local file ("local:" protocol).
 * The file is opened, read and closed on every call.
 * Returns number of bytes actually read, which is less than size at end of file.
 * Does an errAbort on error.  Typically will be called with size in the 8k - 64k range. */
{
/* Need to check time stamp here. */
verbose(4, "reading remote data - %d bytes at %lld - on %s\n", size, offset, url);
url = assertLocalUrl(url);
FILE *f = mustOpen(url, "rb");
/* A failed seek would otherwise silently hand back data from the wrong offset. */
if (fseek(f, offset, SEEK_SET) != 0)
    {
    warn("udcDataViaLocal failed to seek to %lld", (long long)offset);
    errnoAbort("file %s", url);
    }
int sizeRead = ourFread(&file->ios.net, buffer, 1, size, f);
if (ferror(f))
    {
    warn("udcDataViaLocal failed to fetch %d bytes at %lld", size, offset);
    errnoAbort("file %s", url);
    }
carefulClose(&f);
return sizeRead;
}
static boolean udcInfoViaLocal(char *url, struct udcRemoteFileInfo *retInfo)
/* Fill in retInfo->updateTime and retInfo->size from a stat() of the local file
 * named by url.  Return FALSE (leaving *retInfo untouched) if the stat fails,
 * e.g. because the file does not exist. */
{
struct stat st;
verbose(4, "checking remote info on %s\n", url);
char *path = assertLocalUrl(url);
if (stat(path, &st) < 0)
    return FALSE;
retInfo->size = st.st_size;
retInfo->updateTime = st.st_mtime;
return TRUE;
}
/********* Section for transparent file protocol **********/
static int udcDataViaTransparent(char *url, bits64 offset, int size, void *buffer,
struct udcFile *file)
/* Data callback slot for the "transparent" protocol.  It is not meant to be called:
 * it reports an internal error.  The return exists only to satisfy the
 * UdcDataCallback signature. */
{
internalErr(); /* Should not get here. */
return size;
}
static boolean udcInfoViaTransparent(char *url, struct udcRemoteFileInfo *retInfo)
/* Info callback slot for the "transparent" protocol.  It is not meant to be called:
 * it reports an internal error.  *retInfo is never filled in. */
{
internalErr(); /* Should not get here. */
return FALSE;
}
/********* Section for slow local file protocol - simulates network... **********/
static int udcDataViaSlow(char *url, bits64 offset, int size, void *buffer, struct udcFile *file)
/* Fetch a block of data of given size into buffer from a local file, deliberately slowly
 * ("slow:" protocol): there is a pause up front and another before each 1024-byte read,
 * to simulate a network.
 * Returns number of bytes actually read, which is less than size at end of file.
 * Does an errAbort on error.  Typically will be called with size in the 8k - 64k range. */
{
verbose(4, "slow reading remote data - %d bytes at %lld - on %s\n", size, offset, url);
sleep1000(500);
char *fileName = url + 5;  /* skip over 'slow:' */
FILE *f = mustOpen(fileName, "rb");
/* A failed seek would otherwise silently hand back data from the wrong offset. */
if (fseek(f, offset, SEEK_SET) != 0)
    {
    warn("udcDataViaSlow failed to seek to %lld", (long long)offset);
    errnoAbort("file %s", fileName);
    }
char *pt = buffer;
int i, step=1024;
int sizeRead = 0;
for (i=0; i<size; i += step)
    {
    sleep1000(250);
    int readChunk = size - i;
    if (readChunk > step)
        readChunk = step;
    int oneReadSize = ourFread(&file->ios.net, pt, 1, readChunk, f);
    verbose(4, "slowly read %d bytes\n", oneReadSize);
    if (ferror(f))
        {
        warn("udcDataViaSlow failed to fetch %d bytes at %lld", size, offset);
        errnoAbort("file %s", fileName);
        }
    /* Advance by what was actually stored so the buffer stays contiguous. */
    pt += oneReadSize;
    sizeRead += oneReadSize;
    /* A short read without an error means end of file; further reads would
     * only sleep and return nothing. */
    if (oneReadSize < readChunk)
        break;
    }
carefulClose(&f);
return sizeRead;
}
static boolean udcInfoViaSlow(char *url, struct udcRemoteFileInfo *retInfo)
/* Fill in retInfo->updateTime and retInfo->size from a stat() of the file named after
 * the "slow:" prefix of url, after a pause that simulates network latency.
 * Return FALSE (leaving *retInfo untouched) if the stat fails. */
{
struct stat st;
char *path = url + 5;  /* the part after 'slow:' */
verbose(4, "slow checking remote info on %s\n", url);
sleep1000(500);
if (stat(path, &st) < 0)
    return FALSE;
retInfo->size = st.st_size;
retInfo->updateTime = st.st_mtime;
return TRUE;
}
/********* Section for http protocol **********/
/* Root directory of the cache.  Starts out as /tmp/udcCache. */
static char *defaultDir = "/tmp/udcCache";

char *udcDefaultDir()
/* Return the directory currently used as the cache root.  The string is owned
 * by this module; do not free it. */
{
char *currentDir = defaultDir;
return currentDir;
}
void udcSetDefaultDir(char *path)
/* Make a copy of path (via cloneString) the cache root directory.
 * The caller keeps ownership of path. */
{
char *ownCopy = cloneString(path);
defaultDir = ownCopy;
}
void udcDisableCache()
/* Switch off caching by clearing the cache root directory.
 * Re-enable with udcSetDefaultDir. */
{
defaultDir = NULL;
}
static bool udcCacheEnabled()
/* TRUE if caching is activated, i.e. a cache root directory is set. */
{
bool haveCacheDir = (defaultDir != NULL);
return haveCacheDir;
}
int udcDataViaHttpOrFtp( char *url, bits64 offset, int size, void *buffer, struct udcFile *file)
/* Read up to size bytes starting at offset of url into buffer.  The url must start with
 * http://, https:// or ftp://.  The socket is obtained from connInfoGetSocket(), and the
 * offset remembered in file->connInfo is advanced by the bytes read.
 * Returns number of bytes actually read, which is less than size if the stream ends early.
 * Does an errAbort on an unsupported protocol, on failure to get a socket, or on a
 * read error.  Typically will be called with size in the 8k-64k range. */
{
if (!(startsWith("http://",url) || startsWith("https://",url) || startsWith("ftp://",url)))
    errAbort("Invalid protocol in url [%s] in udcDataViaFtp, only http, https, or ftp supported",
             url);
else
    verbose(4, "reading http/https/ftp data - %d bytes at %lld - on %s\n", size, offset, url);

int sd = connInfoGetSocket(file, url, offset, size);
if (sd < 0)
    errAbort("Can't get data socket for %s", url);

/* Keep reading until the request is satisfied, the stream ends, or a read fails. */
char *dest = (char *)buffer;
int lastRead = 0, bytesSoFar = 0, bytesLeft = size;
while (bytesLeft > 0)
    {
    lastRead = ourRead(&file->ios.net, sd, dest, bytesLeft);
    if (lastRead <= 0)
        break;
    dest += lastRead;
    bytesSoFar += lastRead;
    bytesLeft -= lastRead;
    }
if (lastRead == -1)
    errnoAbort("udcDataViaHttpOrFtp: error reading socket");

struct connInfo *ci = &file->connInfo;
if (ci != NULL)
    ci->offset += bytesSoFar;
else
    mustCloseFd(&sd);
return bytesSoFar;
}
boolean udcInfoViaHttp(char *url, struct udcRemoteFileInfo *retInfo)
/* Gets size and last modified time of URL into retInfo->size and retInfo->updateTime.
 * Up to 5 redirects are followed; the last redirect target is stored in
 * retInfo->ci.redirUrl.
 * Returns FALSE on failure.  On success returns the (nonzero) HTTP status of the
 * HEAD or GET byterange 0-0 request, i.e. 200 or 206.
 * errAborts if neither Last-Modified: nor Date: is present or the date can't be parsed. */
{
verbose(4, "checking http remote info on %s\n", url);
// URLs passed into here should not have byterange clause.
int redirectCount = 0;
struct hash *hash;
int status;
char *sizeString = NULL;
/*
   For caching, sites should support byte-range and last-modified.
   However, several groups including ENCODE have made sites that use CGIs to
   dynamically generate hub text files such as hub.txt, genome.txt, trackDb.txt.
   Byte-range and last-modified are difficult to support for this case,
   so they do without them, effectively defeating caching. Every 5 minutes (udcTimeout),
   they get re-downloaded, even when the data has not changed.
*/
while (TRUE)
    {
    hash = newHash(0);
    status = netUrlHead(url, hash);
    sizeString = hashFindValUpperCase(hash, "Content-Length:");
    if (status == 200 && sizeString)
        break;
    /*
       Using HEAD with HIPAA-compliant signed AmazonS3 URLs generates 403.
       The signed URL generated for GET cannot be used with HEAD.
       Instead call GET with byterange=0-0 in netUrlFakeHeadByGet().
       This supplies both size via Content-Range response header,
       as well as Last-Modified header which is important for caching.
       There are also sites which support byte-ranges
       but they do not return Content-Length with HEAD.
    */
    if (status == 403 || (status==200 && !sizeString))
        {
        hashFree(&hash);
        hash = newHash(0);
        status = netUrlFakeHeadByGet(url, hash);
        if (status == 206)
            break;
        if (status == 200) // helps get more info to user
            break;
        }
    if (status != 301 && status != 302 && status != 307 && status != 308)
        {
        hashFree(&hash);   /* don't leak the header hash on failure */
        return FALSE;
        }
    ++redirectCount;
    if (redirectCount > 5)
        {
        warn("code %d redirects: exceeded limit of 5 redirects, %s", status, url);
        hashFree(&hash);
        return FALSE;
        }
    char *newUrl = hashFindValUpperCase(hash, "Location:");
    if (!newUrl)
        {
        warn("code %d redirects: redirect location missing, %s", status, url);
        hashFree(&hash);
        return FALSE;
        }
    // path may be relative
    if (hasProtocol(newUrl))
        {
        newUrl = cloneString(newUrl);
        }
    else
        {
        newUrl = expandUrlOnBase(url, newUrl);
        }
    retInfo->ci.redirUrl = newUrl;
    url = transferParamsToRedirectedUrl(url, newUrl);
    hashFree(&hash);
    }

/* The loop is only left by break with status 200 or 206 and a live hash. */
char *sizeHeader = NULL;
if (status == 200)
    {
    sizeHeader = "Content-Length:";
    // input pattern: Content-Length: 2738262
    }
if (status == 206)
    {
    sizeHeader = "Content-Range:";
    // input pattern: Content-Range: bytes 0-99/2738262
    }

sizeString = hashFindValUpperCase(hash, sizeHeader);
if (sizeString)
    {
    char *parseString = sizeString;
    if (status == 206)
        {
        parseString = strchr(sizeString, '/');
        if (!parseString)
            {
            warn("Header value %s is missing '/' in %s in response for url %s",
                 sizeString, sizeHeader, url);
            hashFree(&hash);
            return FALSE;
            }
        ++parseString; // skip past slash
        }
    if (parseString)
        {
        retInfo->size = atoll(parseString);
        }
    else
        {
        warn("Header value %s is missing or invalid in %s in response for url %s",
             sizeString, sizeHeader, url);
        hashFree(&hash);
        return FALSE;
        }
    }
else
    {
    warn("Response is missing required header %s for url %s", sizeHeader, url);
    hashFree(&hash);
    return FALSE;
    }

char *lastModString = hashFindValUpperCase(hash, "Last-Modified:");
if (lastModString == NULL)
    {
    // Date is a poor substitute!  It will always appear that the cache is stale.
    // But at least we can read files from dropbox.com.
    lastModString = hashFindValUpperCase(hash, "Date:");
    if (lastModString == NULL)
        {
        hashFree(&hash);
        errAbort("No Last-Modified: or Date: returned in header for %s, can't proceed, sorry", url);
        }
    }

/* strptime only stores the fields named in the format, so start from an all-zero
 * struct instead of handing uninitialized fields on to mktimeFromUtc. */
struct tm tm = {0};
time_t t;
// Last-Modified: Wed, 15 Nov 1995 04:58:08 GMT
// This will always be GMT
if (strptime(lastModString, "%a, %d %b %Y %H:%M:%S %Z", &tm) == NULL)
    { /* Handle error */;
    hashFree(&hash);
    errAbort("unable to parse last-modified string [%s]", lastModString);
    }
t = mktimeFromUtc(&tm);
if (t == -1)
    { /* Handle error */;
    hashFree(&hash);
    errAbort("mktimeFromUtc failed while converting last-modified string [%s] from UTC time", lastModString);
    }
retInfo->updateTime = t;

hashFree(&hash);
return status;
}
/********* Section for ftp protocol **********/
// fetchData method: See udcDataViaHttpOrFtp above.
boolean udcInfoViaFtp(char *url, struct udcRemoteFileInfo *retInfo)
/* Gets size and last modified time of FTP URL into retInfo->size and
 * retInfo->updateTime.  Returns FALSE if netGetFtpInfo can't supply them.
 * errAborts if the time conversion fails. */
{
long long size = 0;
time_t tUtc;
verbose(4, "checking ftp remote info on %s\n", url);
// TODO: would be nice to add int *retCtrlSocket to netGetFtpInfo so we can stash
// in retInfo->connInfo and keep socket open.
if (!netGetFtpInfo(url, &size, &tUtc))
    return FALSE;

// Convert UTC to localtime
struct tm *tm = localtime(&tUtc);
time_t t = mktimeFromUtc(tm);
if (t == -1)
    { /* Handle error */;
    errAbort("mktimeFromUtc failed while converting FTP UTC last-modified time %ld to local time", (long) tUtc);
    }
retInfo->updateTime = t;
retInfo->size = size;
return TRUE;
}
/********* Non-protocol-specific bits **********/
boolean udcFastReadString(struct udcFile *f, char buf[256])
/* Read a string in 'writeString' format - one length byte followed by that many
 * characters - into buf and zero-terminate it.  buf must be long enough to hold it.
 * Returns FALSE if the length byte can't be read. */
{
UBYTE lenByte;
if (!udcReadOne(f, lenByte))
    return FALSE;
int len = lenByte;
if (len > 0)
    udcMustRead(f, buf, len);
buf[len] = 0;
return TRUE;
}
void msbFirstWriteBits64(FILE *f, bits64 x);
static char *fileNameInCacheDir(struct udcFile *file, char *fileName)
/* Return "<file->cacheDir>/<fileName>" in freshly allocated memory.
 * Do a freeMem on this when done. */
{
int dirLen = strlen(file->cacheDir);
int nameLen = strlen(fileName);
/* Room for directory, '/', name and terminator.  NOTE(review): no terminator is written
 * explicitly; this relies on needMem returning zeroed memory - confirm. */
char *path = needMem(dirLen + nameLen + 2);
char *dest = path;
memcpy(dest, file->cacheDir, dirLen);
dest += dirLen;
*dest++ = '/';
memcpy(dest, fileName, nameLen);
return path;
}
static void udcNewCreateBitmapAndSparse(struct udcFile *file,
bits64 remoteUpdate, bits64 remoteSize, bits32 version)
/* Create a new bitmap file around the given remoteUpdate time, plus an empty sparse
 * data file.  Both file->bitmapFileName and file->sparseFileName are created or
 * truncated.  The bitmap file gets a fixed header followed by an all-zero bitmap with
 * one bit per udcBlockSize block of remoteSize.  Aborts if the header does not come
 * out at udcBitmapHeaderSize bytes. */
{
int fd = mustOpenFd(file->bitmapFileName, O_WRONLY | O_CREAT | O_TRUNC);
bits32 sig = udcBitmapSig;
bits32 blockSize = udcBlockSize;
bits64 reserved64 = 0;
bits32 reserved32 = 0;
/* Number of blocks needed to cover remoteSize, rounding up. */
int blockCount = (remoteSize + udcBlockSize - 1)/udcBlockSize;
int bitmapSize = bitToByteSize(blockCount);
/* Write out fixed part of header.  udcBitmapOpen() reads the fields back in this order. */
writeOneFd(fd, sig);
writeOneFd(fd, blockSize);
writeOneFd(fd, remoteUpdate);
writeOneFd(fd, remoteSize);
writeOneFd(fd, version);
writeOneFd(fd, reserved32);
writeOneFd(fd, reserved64);
writeOneFd(fd, reserved64);
writeOneFd(fd, reserved64);
writeOneFd(fd, reserved64);
/* Sanity check: the current position must be exactly the header size. */
long long offset = ourMustLseek(&file->ios.bit, fd, 0, SEEK_CUR);
if (offset != udcBitmapHeaderSize)
errAbort("offset in fd=%d, f=%s is %lld, not expected udcBitmapHeaderSize %d",
fd, file->bitmapFileName, offset, udcBitmapHeaderSize);
/* Write out initial all-zero bitmap, using sparse-file method: write 0 to final address. */
/* NOTE(review): when remoteSize is 0, bitmapSize-1 is presumably -1, so the byte lands
 * on the last reserved header byte rather than after the header - confirm. */
unsigned char zero = 0;
ourMustLseek(&file->ios.bit, fd, bitmapSize-1, SEEK_CUR);
ourMustWrite(&file->ios.bit, fd, &zero, 1);
/* Clean up bitmap file and name. */
mustCloseFd(&fd);
/* Create an empty data file. */
fd = mustOpenFd(file->sparseFileName, O_WRONLY | O_CREAT | O_TRUNC);
mustCloseFd(&fd);
}
static struct udcBitmap *udcBitmapOpen(char *fileName)
/* Open up a bitmap file and read and verify header.  Return NULL if file doesn't
 * exist or is too short to hold a header, abort on error.
 * On success the returned structure owns the open file descriptor, which is left
 * positioned just past the header; release it with udcBitmapClose(). */
{
/* Open file, returning NULL if can't. */
int fd = open(fileName, O_RDWR);
if (fd < 0)
    {
    if (errno == ENOENT)
        return NULL;
    else
        errnoAbort("Can't open(%s, O_RDWR)", fileName);
    }

/* Get status info from file.  If fstat fails, status holds garbage, so check it. */
struct stat status;
if (fstat(fd, &status) < 0)
    errnoAbort("Can't fstat(%s)", fileName);
if (status.st_size < udcBitmapHeaderSize) // check for truncated invalid bitmap files.
    {
    close(fd);
    return NULL;  // returning NULL will cause the fresh creation of bitmap and sparseData files.
    }

/* Read signature and decide if byte-swapping is needed. */
// TODO: maybe buffer the I/O for performance?  Don't read past header -
// fd offset needs to point to first data block when we return.
bits32 magic;
boolean isSwapped = FALSE;
mustReadOneFd(fd, magic);
if (magic != udcBitmapSig)
    {
    magic = byteSwap32(magic);
    isSwapped = TRUE;
    if (magic != udcBitmapSig)
        errAbort("%s is not a udcBitmap file", fileName);
    }

/* Allocate bitmap object, fill it in, and return it. */
struct udcBitmap *bits;
AllocVar(bits);
bits->blockSize = fdReadBits32(fd, isSwapped);
bits->remoteUpdate = fdReadBits64(fd, isSwapped);
bits->fileSize = fdReadBits64(fd, isSwapped);
bits->version = fdReadBits32(fd, isSwapped);
/* Skip the reserved part of the header so fd ends up just past it. */
fdReadBits32(fd, isSwapped); // ignore result
fdReadBits64(fd, isSwapped); // ignore result
fdReadBits64(fd, isSwapped); // ignore result
fdReadBits64(fd, isSwapped); // ignore result
fdReadBits64(fd, isSwapped); // ignore result
bits->localUpdate = status.st_mtime;
bits->localAccess = status.st_atime;
bits->isSwapped = isSwapped;
bits->fd = fd;

return bits;
}
static void udcBitmapClose(struct udcBitmap **pBits)
/* Close the bitmap's file descriptor, free the structure and clear *pBits.
 * Does nothing if *pBits is already NULL. */
{
struct udcBitmap *bits = *pBits;
if (bits == NULL)
    return;
mustCloseFd(&(bits->fd));
freez(pBits);
}
static struct udcProtocol *udcProtocolNew(char *upToColon)
/* Build up a new protocol handler around a string such as "http" or "local".
 * Recognized names are local, slow, http, https, ftp and transparent; http and https
 * share one handler whose type is "http".  Anything else is an errAbort.
 * Free the result with udcProtocolFree(). */
{
struct udcProtocol *prot;
AllocVar(prot);

/* http, https and ftp all fetch data the same way and differ only in how info is had. */
int isHttp = sameString(upToColon, "http") || sameString(upToColon, "https");
int isFtp = !isHttp && sameString(upToColon, "ftp");

if (isHttp || isFtp)
    {
    prot->fetchData = udcDataViaHttpOrFtp;
    if (isHttp)
        {
        prot->fetchInfo = udcInfoViaHttp;
        prot->type = "http";
        }
    else
        {
        prot->fetchInfo = udcInfoViaFtp;
        prot->type = "ftp";
        }
    }
else if (sameString(upToColon, "local"))
    {
    prot->fetchData = udcDataViaLocal;
    prot->fetchInfo = udcInfoViaLocal;
    prot->type = "local";
    }
else if (sameString(upToColon, "slow"))
    {
    prot->fetchData = udcDataViaSlow;
    prot->fetchInfo = udcInfoViaSlow;
    prot->type = "slow";
    }
else if (sameString(upToColon, "transparent"))
    {
    prot->fetchData = udcDataViaTransparent;
    prot->fetchInfo = udcInfoViaTransparent;
    prot->type = "transparent";
    }
else
    {
    errAbort("Unrecognized protocol %s in udcProtNew", upToColon);
    }
return prot;
}
static void udcProtocolFree(struct udcProtocol **pProt)
/* Free up protocol resources.  Only the structure itself is released via freez;
 * its type string is a constant set by udcProtocolNew and is not freed. */
{
freez(pProt);
}
static void setInitialCachedDataBounds(struct udcFile *file, boolean useCacheInfo)
/* Open up bitmap file and read a little bit of it to see if cache is stale,
 * and if not to see if the initial part is cached.  If the cache is stale, or there is
 * no usable bitmap, it makes fresh empty cacheDir/sparseData and cacheDir/bitmap files.
 * If useCacheInfo is TRUE, file->size and file->updateTime are taken from the bitmap
 * header instead of being compared against it.
 * Sets file->bitmapVersion, file->bits and (for non-empty files) file->endData.
 * NOTE(review): startData is not assigned in this function - presumably it is
 * expected to already be 0. */
{
bits32 version = 0;
/* Get existing bitmap, and if it's stale clean up. */
struct udcBitmap *bits = udcBitmapOpen(file->bitmapFileName);
if (bits != NULL)
{
if (useCacheInfo)
{
file->size = bits->fileSize;
file->updateTime = bits->remoteUpdate;
}
version = bits->version;
/* Stale if the remote timestamp or size changed, the sparse data file is missing,
 * or the sparse file is empty although the bitmap file has a bitmap after its header. */
if (bits->remoteUpdate != file->updateTime || bits->fileSize != file->size ||
!fileExists(file->sparseFileName) ||
(fileSize(file->sparseFileName) == 0 && file->size > 0 && fileSize(file->bitmapFileName) > udcBitmapHeaderSize))
{
verbose(4, "removing stale version (%lld! = %lld or %lld! = %lld or %s doesn't exist or should not be size 0), "
"new version %d\n",
bits->remoteUpdate, (long long)file->updateTime, bits->fileSize, file->size,
file->sparseFileName, version);
/* udcBitmapClose leaves bits NULL, which triggers re-creation below. */
udcBitmapClose(&bits);
remove(file->bitmapFileName);
remove(file->sparseFileName);
if (fileExists(file->redirFileName))
remove(file->redirFileName);
++version;
}
}
else
verbose(4, "bitmap file %s does not already exist, creating.\n", file->bitmapFileName);
/* If no bitmap, then create one, and also an empty sparse data file. */
if (bits == NULL)
{
udcNewCreateBitmapAndSparse(file, file->updateTime, file->size, version);
bits = udcBitmapOpen(file->bitmapFileName);
if (bits == NULL)
errAbort("Unable to open bitmap file %s", file->bitmapFileName);
}
file->bitmapVersion = bits->version;
/* Read in a little bit from bitmap while we have it open to see if we have anything cached. */
if (file->size > 0)
{
/* Only the first Bits item of the bitmap is examined, so at most the first 8 blocks. */
Bits b;
off_t wasAt = lseek(bits->fd, 0, SEEK_CUR);
mustReadOneFd(bits->fd, b);
int endBlock = (file->size + udcBlockSize - 1)/udcBlockSize;
if (endBlock > 8)
endBlock = 8;
/* presumably bitFindClear returns the index of the first clear bit, i.e. the number
 * of leading blocks already cached - confirm against bits.h */
int initialCachedBlocks = bitFindClear(&b, 0, endBlock);
file->endData = initialCachedBlocks * udcBlockSize;
/* Put the descriptor back where it was before the peek. */
ourMustLseek(&file->ios.bit, bits->fd, wasAt, SEEK_SET);
}
file->bits = bits;
}
static boolean qEscaped(char c)
/* Returns TRUE if character needs to be escaped in q-encoding.
 * Letters and digits pass through except 'Q' itself (it is the escape marker);
 * of the other characters only '_', '-', '/' and '.' pass through. */
{
/* <ctype.h> functions require a value representable as unsigned char (or EOF);
 * a negative char, such as a byte of a UTF-8 sequence, would be undefined behavior. */
if (isalnum((unsigned char)c))
    return c == 'Q';
else
    return c != '_' && c != '-' && c != '/' && c != '.';
}
static char *qEncode(char *input)
/* Do a simple encoding to convert input string into "normal" characters.
 * Every character for which qEscaped() is TRUE (including 'Q' itself) gets converted
 * into Q followed by two uppercase hexadecimal digits; all others are copied as is.
 * Returns a newly allocated string; free it when done. */
{
/* First go through and figure out encoded size. */
int size = 0;
char *s, *d, c;
for (s = input; (c = *s) != 0; ++s)
    size += (qEscaped(c) ? 3 : 1);

/* Allocate and fill in output. */
char *output = needMem(size+1);
d = output;
for (s = input; (c = *s) != 0; ++s)
    {
    if (qEscaped(c))
        {
        /* Go through unsigned char first: a negative char converted straight to
         * unsigned would print as 8 hex digits and overrun the 3 bytes budgeted above. */
        sprintf(d, "Q%02X", (unsigned)(unsigned char)c);
        d += 3;
        }
    else
        *d++ = c;
    }
*d = 0;   /* terminate explicitly rather than relying on the allocator */
return output;
}
void udcParseUrlFull(char *url, char **retProtocol, char **retAfterProtocol, char **retColon,
char **retAuth)
/* Parse the URL into components that udc treats separately.
* *retAfterProtocol is Q-encoded to keep special chars out of filenames.
* Free all *ret's except *retColon when done. */
{
char *protocol, *afterProtocol;
char *colon = strchr(url, ':');
if (!colon)
{
*retColon = NULL;
return;
}
int colonPos = colon - url;
protocol = cloneStringZ(url, colonPos);
afterProtocol = url + colonPos + 1;
while (afterProtocol[0] == '/')
afterProtocol += 1;
char *userPwd = strchr(afterProtocol, '@');
if (userPwd)
{
if (retAuth)
{
char auth[1024];
safencpy(auth, sizeof(auth), afterProtocol, userPwd+1-afterProtocol);
*retAuth = qEncode(auth);
}
char *afterHost = strchr(afterProtocol, '/');
if (!afterHost)
{
afterHost = afterProtocol+strlen(afterProtocol);