-
Notifications
You must be signed in to change notification settings - Fork 1
/
caches.py
1298 lines (1049 loc) · 50 KB
/
caches.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import pathlib
import warnings
from augpathlib import exceptions as exc
from augpathlib.meta import PathMeta
from augpathlib.core import AugmentedPath, EatPath
from augpathlib.utils import log, default_cypher
from augpathlib.utils import LOCAL_DATA_DIR, SPARSE_MARKER
from augpathlib import remotes
class CachePath(AugmentedPath):
    # CachePaths: this needs to be a real path so that it can navigate the local path structure
    # FIXME Not sure I believe that, given the tradeoff
    """ Local data about remote objects.

    This is where the mapping between the local id (aka path)
    and the remote id lives. In a git-like world this is the
    cache/index/whatever we call it these days

    This is the bridge class that holds the mappings.
    Always start bootstrapping from one of these classes
    since it has both the local and remote identifiers,
    and therefore can be called and used before specifying
    the exact implementations for the local and remote objects.
    """

    # names shared by all cache flavours; values come from augpathlib.utils
    _local_data_dir = LOCAL_DATA_DIR
    _sparse_marker = SPARSE_MARKER
    # entries that must never be treated as cached content
    cache_ignore = _local_data_dir, _sparse_marker, '.git',  # TODO

    # bound later by setup()/_new; None until the local/remote classes are wired up
    _local_class = None
    _remote_class_factory = None

    _backup_cache = None
    _not_exists_cache = None
def __enter__(self):
if self.is_dir():
self._entered_from = self.local.cwd() # caches can't exist outside their anchor anymore
self.chdir()
return self
else:
super().__enter__(self)
@classmethod
def setup(cls, local_class, remote_class_factory):
""" call this once to bind everything together """
cn = self.__class__.__name__
warnings.warn(f'{cn}.setup is deprecated please switch to RemotePath._new',
DeprecationWarning,
stacklevel=2)
cls._local_class = local_class
cls._remote_class_factory = remote_class_factory
local_class._cache_class = cls
remote_class_factory._cache_class = cls
# a nice side effect of weighing anchor here is that it
# enforces order of operations for setup then init etc.
if hasattr(cls, '_anchor'):
cls.weighAnchor()
@classmethod
def weighAnchor(cls):
# return a value to indicate that there was an anchor since
# we no longer error when already underway
acls = cls._abstract_class()
if hasattr(acls, '_anchor'):
return delattr(acls, '_anchor')
def anchorClassHere(self, remote_init=True):
""" Use this to initialize the class level anchor from an instance. """
# FIXME WARNING you can shoot yourself in the foot with this if
# there is another anchor further up the tree from this one
# FIXME further, this means that there has to have been a way
# to construct a live CachePath by sideloading the remote id
# which is OK for the path where local/remote binding has already
# been completed
if not hasattr(self.__class__, '_anchor'):
self.__class__._abstract_class()._anchor = self
self.local_data_dir_init() # call every time for insurance
self._remote_class.anchorToCache(self, init=remote_init)
else:
raise ValueError(f'{self.__class__} already anchored to {self.__class__._anchor}')
@property
def local_class(self):
if self.is_helper_cache:
return self._cache_parent.local_class
return self._local_class
    def __new__(cls, *args, meta=None, remote=None, **kwargs):
        """Construct the path and clone bookkeeping state from a CachePath arg.

        meta/remote are accepted here only so the signature matches
        __init__, which consumes them; __new__ inspects args[0] alone.
        """
        # TODO do we need a version of this for the locals
        # and remotes? I don't think we create 'alternate' remotes or locals ...
        self = super().__new__(cls, *args, **kwargs)

        # clone any existing locals and remotes
        if args:
            path = args[0]
            if isinstance(path, CachePath):
                self._cache_parent = path
                if hasattr(self._cache_parent, '_in_bootstrap'):
                    # it is ok to do this and not clean up because
                    # child caches are ephemeral
                    self._in_bootstrap = self._cache_parent._in_bootstrap

                if path.local is not None:  # this might be the very first time local is called
                    pass  # we don't use self._local anymore, too many issues

            elif isinstance(path, remotes.RemotePath):
                #self._remote = path
                #self.meta = path.meta
                # in order for this to work the remote has to already
                # know where the cache should live, which it doesn't
                # use move instead for cases where the semantics are well defined
                raise TypeError('Not entirely sure what to do in this case ...')

        return self
    def __init__(self, *args, meta=None, remote=None, **kwargs):
        """Bind remote/meta state for this cache.

        Priority: an explicit remote wins, then explicit meta; otherwise the
        path must already have cached metadata or NoCachedMetadataError is
        raised.
        """
        if remote:
            self._remote = remote
            self._meta_setter(remote.meta)
        elif meta:
            self._meta_updater(meta)
        else:
            path = args[0]
            if self.meta is None:
                raise exc.NoCachedMetadataError(self.local)
            elif isinstance(path, LocalPath):
                # NOTE(review): LocalPath is not among this module's visible
                # imports — this branch looks like it would NameError; confirm
                # where LocalPath is expected to come from.
                path._cache = self

        super().__init__()
@property
def anchor(self):
raise NotImplementedError('You need to define the rule for determining '
'the local cache root for \'remote\' paths. '
'These are sort of like pseudo mount points.')
@property
def trash(self):
raise NotImplementedError('implement in subclasses')
# FIXME mkdir and put it in a more conventional location
@property
def _trashed_path(self):
return self.trash / f'{self.parent.id}-{self.id}-{self.name}'
@property
def _trashed_path_short(self):
return self.trash / self.name # FIXME SIGH
def crumple(self): # FIXME change name to something more obvious ...
try:
self.rename(self._trashed_path)
except OSError as e:
if e.errno == 36: # File name too long # SIGH
log.critical(f'Had to rename trash {self._trashed_path} -> {self._trashed_path_short}')
self.rename(self._trashed_path_short)
else:
raise e
return trashed
@property
def local_data_dir(self):
return self.anchor.local / self._local_data_dir
@property
def local_objects_dir(self):
""" sort of like .git/objects """
return self.local_data_dir / 'objects'
@property
def local_object_cache_path(self):
# FIXME probably need the 2 char directory convention
# to limit directory size
return self.local_objects_dir / self.cache_key
    def local_data_dir_init(self, exist_ok=True, symlink_objects_to=None):
        """Create the local data dir, objects dir (or symlink), and trash dir.

        symlink_objects_to: when given, the objects dir becomes a symlink to
        that location instead of a real directory.
        Raises NotADirectoryError if the objects dir is already a symlink.
        """
        # FIXME shouldn't this always run once the
        # first time a class is initialized and then
        # modify new to never call it again?
        self.local_data_dir.mkdir(exist_ok=exist_ok)
        lod = self.local_objects_dir
        if symlink_objects_to is not None:
            # NOTE this interacts with anchorClassHere
            # we do want to always run this local_data_dir_init
            # to avoid hard to debug errors, which means that if
            # a user wants to override, we have to rmdir first
            # NOTE we do NOT rmtree here, the user needs to do that
            # explicitly so we don't accidentally remove everything
            # NOTE this whole design is dumb, and in cases where
            # there are just remote objects we should be stashing
            # them in ~/.cache or similar
            if self.local_objects_dir.is_symlink():
                msg = f'{lod} is already symlinked to {lod.readlink()}'
                raise NotADirectoryError(msg)
            elif lod.exists():
                # only an empty directory can be replaced by the symlink
                lod.rmdir()

            lod.symlink_to(symlink_objects_to)
        else:
            lod.mkdir(exist_ok=exist_ok)

        try:
            self.trash.mkdir(exist_ok=exist_ok)
        except NotImplementedError:
            # if there's no trash, there's no trash
            pass
@property
def is_helper_cache(self):
return hasattr(self, '_cache_parent')
    def __truediv__(self, key, update_meta=True):
        """cache / RemotePath -> new child cache bound to that remote;
        cache / str -> the cache of an already-existing local child.

        Raises FileNotFoundError for an unknown name, TypeError for any
        other key type.
        """
        # basically RemotePaths are like relative CachePaths ... HRM
        # they are just a name and an id ... the id of their parent
        # root needs to match the id of the cache ... which it usually
        # does by construction
        parent = self.parent if self.parent.meta is not None else self  # FIXME should be in def parent ???
        if isinstance(key, remotes.RemotePath):
            # FIXME not just names but relative paths???
            remote = key
            try:
                child = self._make_child(
                    remote._parts_relative_to(self.remote, parent),
                    remote, update_meta=update_meta)
            except AttributeError as e:
                raise exc.AugPathlibError('aaaaaaaaaaaaaaaaaaaaaa') from e

            return child

        elif isinstance(key, str):
            child = self.local / key
            if child.exists() or child.is_broken_symlink():
                return child.cache
            else:
                raise FileNotFoundError('There is no local cached file with that name. Cannot construct cache.')
        else:
            pass  # error incoming

        raise TypeError('Cannot construct a new CacheClass from an object '
                        f'without an id and a name! {key}')
    def __rtruediv__(self, cache):
        """ key is a subclass of self.__class__ """
        # I assume that this happens when a cache is constructed from
        # a relative cache?
        # prepend cache.name to our own parts and rebuild the path
        out = self._from_parts([cache.name] + self._parts, init=False)
        out._init()
        cache.remote._cache_setter(out)  # this seems more correct?
        #out._meta_setter(cache.meta)
        return out
    def _make_child(self, args, remote, update_meta=True):
        """Build a child cache from parsed path parts and bind it to *remote*.

        Uses pathlib's internal parsed-parts machinery so the child shares
        this path's drive/root. Raises ValueError if *remote* is not a
        RemotePath.
        """
        drv, root, parts = self._parse_args(args)
        drv, root, parts = self._flavour.join_parsed_parts(
            self._drv, self._root, self._parts, drv, root, parts)
        child = self._from_parsed_parts(drv, root, parts, init=False)  # short circuits
        child._init()
        if isinstance(remote, remotes.RemotePath):
            remote._cache_setter(child, update_meta=update_meta)
        else:
            raise ValueError('should not happen')

        return child
    def bootstrap(self, meta, *,
                  parents=False,
                  recursive=False,
                  fetch_data=False,
                  size_limit_mb=2,
                  only=tuple(),
                  skip=tuple(),
                  sparse=tuple(),):
        """Public wrapper for _bootstrap.

        Sets the transient _in_bootstrap flag for the duration, forces the
        generator, and cleans up in-memory meta and the sparse-root flag
        (marking the folder sparse on disk) afterwards.
        """
        try:
            self._in_bootstrap = True
            return list(self._bootstrap(meta,
                                        parents=parents,
                                        recursive=recursive,
                                        fetch_data=fetch_data,
                                        size_limit_mb=size_limit_mb,
                                        only=only,
                                        skip=skip,
                                        sparse=sparse,))
        finally:
            delattr(self, '_in_bootstrap')
            if hasattr(self, '_meta'):
                delattr(self, '_meta')
            if hasattr(self, '_is_sparse_root'):
                # only now persist the sparse marker set during _bootstrap
                self._mark_sparse()
                delattr(self, '_is_sparse_root')
    def _sparse_root(self):  # TODO consider SparseHelper
        # NOTE(review): despite the name this returns a bool here, not a
        # root path — True when this non-root folder contains the marker
        return self != self.parent and (self / self._sparse_marker).exists()
def is_sparse(self):
return self._sparse_root() is not None
def _clear_sparse(self):
mark = self.local / self._sparse_marker
mark.unlink()
def _mark_sparse(self):
""" default implementation for marking folders as sparse
this uses a file in the folder, but a better implementation
if one has access to xattrs is to use those instead
as such an implementation of _mark_sparse is also provided
on EatPath """
mark = self.local / self._sparse_marker
mark.touch()
def _meta_is_root(self, meta):
""" used to identify the root during bootstrap """
raise NotImplementedError('implement in subclass')
    def _bootstrap(self, meta, *,
                   parents=False,
                   fetch_data=False,
                   size_limit_mb=2,
                   recursive=False,
                   only=tuple(),
                   skip=tuple(),
                   sparse=tuple(),):
        """ The actual bootstrap implementation

        Generator: yields bootstrapped caches (children first when
        recursive, then self). only/skip/sparse are id collections used to
        filter the tree.
        """
        # figure out if we are actually bootstrapping this class or skipping it
        if not meta or meta.id is None:
            raise exc.BootstrappingError(f'PathMeta to bootstrap from has no id! {meta}')

        if only or skip or sparse:
            if self._meta_is_root(meta):
                # since we only go one organization at a time right now
                # we never want to skip the top level id
                log.info(f'Bootstrapping {meta.id} -> {self.local!r}')
            elif meta.id in skip:
                log.info(f'Skipped {meta.id} since it is in skip')
                return
            elif only and meta.id not in only:
                log.info(f'Skipped {meta.id} since it is not in only')
                return
            else:
                if sparse and meta.id in sparse:
                    log.info(f'Sparse strap {meta.id} -> {self.local!r}')
                    self._is_sparse_root = True
                    # collapse the id collection into a plain flag for children
                    sparse = True
                else:
                    # if you pass the only mask so do your children
                    log.info(f'Bootstrapping {meta.id} -> {self.local!r}')
                    only = tuple()

        if self.meta is not None and not recursive:
            msg = f'{self} already has meta!\n{self.meta.as_pretty()}'
            raise exc.BootstrappingError(msg)

        if self.exists() and self.meta and self.meta.id == meta.id:
            self._meta_updater(meta)
        else:
            # set single use bootstrapping id
            self._bootstrapping_id = meta.id

            # directory, file, or fake file as symlink?
            is_file_and_fetch_data = self._bootstrap_prepare_filesystem(parents,
                                                                        fetch_data,
                                                                        size_limit_mb,
                                                                        sparse,)

            is_file_and_fetch_data = False  # XXX NOTE _bootstrap_prepare_filesystem always returns None
            # remove this static assignment to False if there is a use case for bootstrapping the data
            self._bootstrap_data(is_file_and_fetch_data)

        if recursive:  # ah the irony of using loops to do this
            yield from self._bootstrap_recursive(only, skip, sparse)

        yield self
    def _bootstrap_recursive(self, only=tuple(), skip=tuple(), sparse=False):
        """Bootstrap all remote children, reconciling with existing local state.

        Generator: yields the cache for every remote child, reusing an
        existing cache when a local file with the same id is found, and
        refreshing local directories that no longer exist remotely.
        """
        # TODO if rchildren looks like it could be bad
        # go back up to dataset level?
        #sname = lambda gen: sorted(gen, key=lambda c: c.name)  # c.name doesn't work for remotes
        #rcs = sname(self.remote._rchildren(create_cache=False, sparse=sparse))
        rcs = self.remote._rchildren(create_cache=False, sparse=sparse)

        local_paths = list(self.local.rchildren)
        local_files = set(p for p in local_paths if p.is_file() or p.is_broken_symlink())
        file_index = {f.cache_id:f for f in local_files}  # FIXME WARNING can get big
        # FIXME have to compute file_index here because for some reason
        # computing local_dirs will remove folders entirely !??
        local_dirs = set(p.relative_to(self.anchor) for p in local_paths if p.is_dir())
        if local_dirs:
            rcs = list(rcs)  # sigh
            remote_dirs = set(c for c in rcs if c.is_dir())
            rd = set(d.as_path() for d in remote_dirs)  # FIXME as_path => lots of network calls
            # directories that exist locally but not remotely any more
            old_local = local_dirs - rd
            while old_local:
                # refresh shallowest first so children vanish with their parents
                thisl = sorted(old_local, key=lambda d: len(d.as_posix()))
                for d in thisl:
                    ad = self.anchor.local / d
                    if ad.cache is None:
                        log.critical(f'would you fix the nullability already? {d}')
                        continue

                    new = ad.cache.refresh()
                    #log.info(f'{new}')
                    # drop d and everything under it from further consideration
                    local_dirs = set(ld for ld in local_dirs
                                     if not ld.as_posix().startswith(d.as_posix()))
                    old_local = local_dirs - rd

        if sparse:
            #if local_dirs:
                #gen = (c for c in _local_remotes if c.is_dir() or (c.is_file() and c._sparse_include()))
            #else:
            gen = (c for c in rcs if c.is_dir() or (c.is_file() and c._sparse_include()))
            # FIXME rcs still takes too long, though using the generator
            # does get some useful work done first
        else:
            # FIXME horrid performance on remotes with loads of files
            gen = sorted(rcs, key=lambda c: len(c.as_path().as_posix()))

        for child in gen:
            # use the remote's recursive implementation
            # not the local implementation, since the
            # remote may have additional requirements
            #child.bootstrap(only=only, skip=skip)
            # because of how remote works now we don't even have to
            # bootstrap this
            cc = child.cache
            if cc is None:
                if child.is_file() and child.id in file_index:
                    _cache = file_index[child.id].cache
                    cmeta = _cache.meta
                    rmeta = child.meta
                    file_is_different, nmeta = self._update_meta(cmeta, rmeta)
                    if file_is_different:
                        log.critical(f'WAT {_cache}')
                    else:
                        yield _cache
                        # yield the old cache if it exists
                        # otherwise consumers of bootstrap will
                        # think the file may have been deleted
                        continue

                cc = child.cache_init()
                log.debug(cc)

            yield cc
    def _bootstrap_prepare_filesystem(self, parents, fetch_data, size_limit_mb, sparse=False):
        """Create the local directory or touch the file for this remote.

        Always returns None (callers note this); touching a file means
        meta will later be written as xattrs rather than as a symlink.
        """
        # we could use bootstrapping id here and introspect the id, but that is cheating
        if self.remote.is_dir():
            if not sparse and not self.exists():
                # the bug where this if statement put in as an and is a really
                # good example of how case/cond etc help you reason about what
                # a block of branches is really doing -- this one was implementing
                # a covering set which is not obvious if implemented this way
                # you could do this with a dict or something else in python
                # but it is awkward (see also my crazy case implementation in interlex)
                self.mkdir(parents=parents)

        elif self.remote.is_file():
            if sparse and not self._sparse_include():
                return

            if not self.parent.exists():
                self.parent.mkdir(parents=parents)

            toucha_da_filey = (fetch_data and
                               self.meta.size is not None and
                               self.meta.size.mb < size_limit_mb)

            if toucha_da_filey:
                self.touch()
                # running this first means that we will use xattrs instead of symlinks
                # this is a bit opaque, but since meta uses a setter we can't pass a
                # param to make it clear (oh look, python being dumb again!)
            else:
                pass  # we are using symlinks

        else:
            raise BaseException(f'Remote is not a file or directory {self}')
def _bootstrap_data(self, is_file_and_fetch_data=False):
""" XXX UNUSED """
if is_file_and_fetch_data:
if self.remote.meta.size is None:
self.remote.refresh(update_cache=True)
self.local.data = self.remote.data
# with open -> write should not cause the inode to change
self.validate_file()
def _sparse_include(self):
raise NotImplementedError('implement in subclass')
def validate_file(self):
meta = self.meta
if meta.etag:
local_checksum, local_count = self.local.etag(meta.chunksize)
cache_checksum, cache_count = meta.etag
if local_checksum != cache_checksum or local_count != cache_count:
msg = (f'etags do not match!\n(!='
f'\n{local_checksum}-{local_count}'
f'\n{cache_checksum}-{cache_count}\n)')
log.critical(msg)
elif meta.checksum:
lc = self.local.meta.checksum
cc = self.meta.checksum
if lc != cc:
msg = f'Checksums do not match!\n(!=\n{lc}\n{cc}\n)'
log.critical(msg) # haven't figured out how to comput the bf checksums yet
#raise exc.ChecksumError(msg)
elif meta.size is not None:
log.warning(f'No checksum! Your data is at risk!\n'
f'{self.remote!r} -> {self.local!r}! ')
ls = self.local.meta.size
cs = self.meta.size
if ls != cs:
raise exc.SizeError(f'Sizes do not match!\n(!=\n{ls}\n{cs}\n)')
else:
log.warning(f'No checksum and no size! Your data is at risk!\n'
'{self.remote!r} -> {self.local!r}! ')
    @property
    def remote(self):
        """The RemotePath for this cache, constructed lazily on first access.

        Resolution order: memoized _remote, helper-cache parent's remote,
        otherwise build one from the remote class factory using this
        cache's id and anchor. Returns None when there is no id.
        """
        if hasattr(self, '_remote'):
            return self._remote

        if hasattr(self, '_cache_parent'):
            return self._cache_parent.remote

        id = self.id  # bootstrapping id is a one time use so keep it safe
        if id is None:  # zero is a legitimate identifier
            return

        anchor = self.anchor
        if anchor is None:  # the very first ...
            # in which case we need the id for factory AND class
            self._bootstrapping_id = id  # so we set it again
            anchor = self  # could double check if the id has the info too ...
            # FIXME remove?

        if self._remote_class_factory is not None or (hasattr(self, '_remote_class') and
                                                      self._remote_class is not None):
            # we don't have to have a remote configured to check the cache
            if not hasattr(self, '_remote_class'):
                #log.debug('rc')
                # NOTE there are many possible ways to set the anchor
                # we need to pick _one_ of them
                self._remote_class = self._remote_class_factory(anchor,
                                                                self.local_class)

            if (hasattr(self._remote_class, '_api_class') and
                not hasattr(self._remote_class, '_api')):
                # FIXME SIGH this should be one shot not a hasattr check
                # every first time we have to get the remote for a cache
                self._remote_class.anchorToCache(anchor)

            if not hasattr(self, '_remote'):
                self._remote = self._remote_class(id, cache=self)

            return self._remote
@property
def local(self):
local = self.local_class(self)
if self.is_helper_cache:
cache = self._cache_parent
else:
cache = self
local._cache = cache
return local
def dedupe(self, other, pretend=False):
# FIXME blackfynn doesn't set update when a folder name changes ??!
if self.id != other.id:
raise ValueError('Can only dedupe when ids match, {self.id} != {other.id}')
su, ou = self.meta.updated, other.meta.updated
lsu, lou = self.local.meta.updated, other.local.meta.updated
if su < ou:
old, new = self, other
elif su > ou:
new, old = self, other
elif lsu is None and lou is None:
new, old = self, other
elif lsu is None:
old, new = self, other
elif lou is None:
new, old = self, other
elif lsu < lou:
old, new = self, other
elif lsu > lou:
new, old = self, other
else: # ==
ss, os = self.meta.size, other.meta.size
if ss is not None and os is not None:
new, old = self, other
elif ss is None:
old, new = self, other
elif os is None:
new, old = self, other
else:
raise BaseException('how did we get here!?')
file_is_different, meta = self._update_meta(old.meta, new.meta)
if file_is_different:
log.info(f'{self}\n!=\n{other}\n{meta}')
if not pretend:
#old.rename('/dev/null') # hah
pass
return new
# TODO go look in meta for this
# check updated ... etc.
# missing size
# missing file_id
    @property
    def id(self):
        """Remote identifier: memoized from meta, or the one-shot bootstrapping id.

        Returns None when neither is available.
        """
        if not hasattr(self, '_id'):  # calls to self.exists() are too expensive for this nonsense
            if self.meta:
                self._id = self.meta.id
                return self._id
            elif hasattr(self, '_bootstrapping_id'):
                id = self._bootstrapping_id
                delattr(self, '_bootstrapping_id')  # single use only
                return id
            else:
                return

        return self._id
@property
def cache_key(self):
""" since some systems have compound ids ... """
raise NotImplementedError
# TODO how to toggle fetch from remote to heal?
    @property
    def meta(self):
        # abstract on the base class: concrete storage is subclass-specific
        raise NotImplementedError
        # NOTE(review): the lines below are unreachable — apparently kept
        # from an earlier in-memory bootstrap path; confirm before removing
        if hasattr(self, '_meta'):
            return self._meta  # for bootstrap
    def _meta_setter(self, pathmeta, memory_only=False):
        """ so much for the pythonic way when the language won't even let you """
        # the base class can only hold meta in memory, so force callers to
        # acknowledge that nothing will be persisted
        if not memory_only:
            raise TypeError('You must explicitly set memory_only=True to use this '
                            'otherwise you risk dataloss.')

        # refuse to overwrite meta belonging to a different remote id
        if self.meta and self.id != pathmeta.id:
            raise exc.MetadataIdMismatchError('Cache id does not match meta id! '
                                              f'{self.id} != {pathmeta.id}\n{pathmeta}')

        self._meta = pathmeta
    def recover_meta(self):
        """ rebuild restore reconnect

        Attempt to re-derive this cache's meta by matching against the
        remote children of its parent.

        NOTE(review): currently disabled — the unconditional
        `raise NotImplementedError()` below makes everything after it
        unreachable; the remainder is the sketched implementation.
        """
        root = self.parent.local.find_cache_root()
        if root is None:
            #breakpoint()
            raise exc.NotInProjectError(f'{self.parent.local} is not in a project!')

        breakpoint()  # FIXME(review): debug residue
        raise NotImplementedError()

        children = list(self.parent.remote.children)  # if this is run from dismatch meta we have issues
        isf = self.is_file()
        isd = self.is_dir()
        candidates = []
        def inner(child):
            # match by exact name first, then by stem/suffix permutations
            if child.is_dir() and isd:
                if child.name == self.name:
                    self.meta = child.meta
                    return

            elif child.is_file() and isf:
                log.debug(f'{child.name} {child.stem}, {child.suffix!r}')
                log.debug(f'{self.name} {self.stem}, {self.suffix!r}')
                if child.name == self.name:
                    self.meta = child.meta
                elif child.name == self.stem:
                    candidates.append(child)
                elif child.stem == self.name:
                    candidates.append(child)
                elif child.stem == self.stem:
                    # worst cases
                    candidates.append(child)

            else:
                #log.critical('file type mismatch')
                pass

        for child in children:
            inner(child)

        # it looks like if we do fail over to retrieving a package it does go to files
        # so this is an ok approach and we don't have to deal with that at this level
        if not candidates:
            wat = '\n'.join(c.name for c in children)
            message = (f'We seem to have lost {self.parent} -/-> {self.name}'
                       f'\n{self.parent.uri_human}\n{wat}\n{self.name}')
            log.critical(message)
            dataset = self.dataset
            maybe = []
            for c in self.dataset.remote.rchildren:
                if c.parent and c.parent.id == self.parent.id or c.stem == self.stem:
                    maybe.append(c)

            [inner(m) for m in maybe]
            #candidates
            #dataset.bootstrap(dataset.meta, recursive=True)
            #raise exc.NoRemoteMappingError

        elif len(candidates) == 1:
            remote = candidates[0]
            log.critical('How did we write this file without moving it beforhand?!?\n'
                         f'{self.local} -/-> {remote.name}')
            self.meta = remote.meta  # go ahead and set this even though we change?
            self.move(remote=remote)
        else:
            raise BaseException('multiple candidates!')
def refresh(self, update_data=False, size_limit_mb=2, force=False):
if self.meta is None:
breakpoint()
limit = (size_limit_mb if
not self.meta.size or (size_limit_mb > self.meta.size.mb)
else self.meta.size.mb + 1)
new = self.remote.refresh(update_cache=True,
update_data=update_data,
update_data_on_cache=(self.is_file() and self.exists()),
size_limit_mb=size_limit_mb,
force=force)
if new is not None:
return new
else:
log.info(f'Remote for {self} has been deleted. Moving to trash.')
try:
self.rename(self.trash / f'{self.parent.id}-{self.id}-{self.name}')
except FileNotFoundError as e:
if not self.trash.exists():
self.trash.mkdir()
log.info('created {self.trash}')
else:
raise e
    def fetch(self, size_limit_mb=2):
        """ bypass remote to fetch directly based on stored meta """
        meta = self.meta
        if self.is_dir():
            raise NotImplementedError('not going to fetch all data in a dir at the moment')

        if meta.file_id is None:
            self.refresh(update_data=True, force=True)
            # the file name could be different so we have to return here
            return

        size_ok = size_limit_mb is not None and meta.size is not None and meta.size.mb < size_limit_mb
        size_not_ok = size_limit_mb is not None and meta.size is not None and meta.size.mb > size_limit_mb
        # NOTE(review): a file whose size equals size_limit_mb exactly is
        # neither ok nor not-ok: it is skipped silently — confirm intent

        if size_ok or size_limit_mb is None:  # FIXME should we force fetch here by default if the file exists?
            if self.is_broken_symlink():
                # FIXME touch a temporary file and set the meta first!
                self.unlink()
                self.touch()
                self._meta_setter(meta)

            log.info(f'Fetching remote via cache id {self.id} -> {self.local}')
            self.local.data = self.data  # note that this should trigger storage to .ops/objects

        if size_not_ok:
            log.warning(f'File is over the size limit {meta.size.mb} > {size_limit_mb}')
    def move(self, *, remote=None, target=None, meta=None):
        """ instantiate a new cache and cleanup self because we are moving

        Requires either a remote, or both target and meta. Returns the
        target cache (possibly self when moving onto itself).
        """
        # FIXME what to do if we have data
        if remote is None and (target is None or meta is None):
            raise TypeError('either remote or meta and target are required arguments')

        # deal with moving to a different directory that might not even exist yet
        if target is None:
            if not isinstance(self.anchor, self.__class__):
                raise TypeError(f'mismatched anchor types {self!r} {self.anchor!r}')

            target = self.anchor / remote  # FIXME why does this not try to instantiate the caches? or does it?

        if target.absolute() == self.absolute():
            log.warning(f'trying to move a file onto itself {self.absolute()}')
            return target

        common = self.commonpath(target).absolute()
        target_parent = target.parent.absolute()
        parent = self.parent.absolute()

        assert target.name != self.name or target_parent != parent

        if target_parent != parent:
            _id = remote.id if remote else meta.id
            log.warning('A parent of current file has changed location!\n'
                        f'{common}\n{self.relative_to(common)}\n'
                        f'{target.relative_to(common)}\n{_id}')

        if not target_parent.exists():
            if remote is None:  # we have to have a remote to pull parent structure
                remote = self._remote_class(meta)

            target_parent.mkdir_cache(remote)

        # cast a plain path into our cache class, seeding it with meta
        do_cast = not isinstance(target, self.__class__)
        if do_cast:
            target = self.__class__(target, meta=meta)

        if target.exists() or target.is_broken_symlink():
            if target.id == self.id:  #(remote.id if remote else meta.id):
                if self.is_broken_symlink():
                    # we may be a package with extra metadata that needs to
                    # be merged with the target before we are unlinked
                    file_is_different = target._meta_updater(self.meta)
                    # FIXME ... if file is different then this causes staleness
                    # and we need to fetch
                    if file_is_different:
                        log.critical('DO SOMETHING ABOUT THIS STALE DATA'
                                     f'\n{target}\n{target.meta.as_pretty()}')

                elif do_cast:
                    # the target meta was just put there, if the ids match it should be ok
                    # however since arbitrary meta can be passed in, best to double check
                    file_is_different = target._meta_updater(self.meta)
                    if file_is_different:
                        log.critical('Something has gone wrong'
                                     f'\n{target}\n{target.meta.as_pretty()}')

                else:
                    # directory moves that are resolved during pull
                    log.warning(f'what is this!?\n{target}\n{self}')

            elif target.is_broken_symlink():
                # NOTE(review): remote may be None on this branch when only
                # target+meta were supplied — confirm
                remote._cache = self  # restore the mapping for remote -> self
                raise exc.WhyDidntThisGetMovedBeforeError(f'\n{target}\n{self}')

            else:
                raise exc.PathExistsError(f'Target {target} already exists!')

        if self.exists():
            # park any broken-symlink target aside so the rename cannot
            # clobber it, restoring or removing it afterwards
            safe_unlink = target.local.parent / f'.unlink-{target.name}'
            try:
                if target.is_broken_symlink():
                    target.rename(safe_unlink)

                self.rename(target)  # if target is_dir then this will fail, which is ok
            except BaseException as e:
                log.exception(e)
                if safe_unlink.is_broken_symlink():
                    safe_unlink.rename(target)
            finally:
                if safe_unlink.is_broken_symlink():
                    safe_unlink.unlink()

        elif self.is_broken_symlink():
            # we don't move to trash here because this was just a file rename
            self.unlink()  # don't move the meta since it will break the naming insurance measure

        return target
def __repr__(self):
local = repr(self.local) if self.local else 'No local??' + str(self)
remote = (f'{self.remote.__class__.__name__}({self.id!r})'
if self.remote else str(self.id))
return self.__class__.__name__ + ' <' + local + ' -> ' + remote + '>'
@property
def data(self):
raise NotImplementedError('implement in subclass')
# register the posix/windows flavour subclasses for CachePath
CachePath._bind_flavours()
class ReflectiveCache(CachePath):
    """ Oh, it's me. """

    @property
    def meta(self):
        # no separate store: the cache's meta IS the local path's meta
        return self.local.meta


# register the posix/windows flavour subclasses
ReflectiveCache._bind_flavours()
class EatCache(EatPath, CachePath):
    """Cache that persists PathMeta in extended attributes on the path itself."""

    # subclasses set this to namespace their xattr keys
    xattr_prefix = None

    @property
    def meta(self):
        """PathMeta read back from xattrs; None when the path does not exist."""
        if self.exists():
            xattrs = self.xattrs()
            pathmeta = PathMeta.from_xattrs(xattrs, self.xattr_prefix, self)
            return pathmeta

    def _meta_setter(self, pathmeta, memory_only=False):
        """Write meta as xattrs on an existing path, else fall back to memory."""
        #log.warning(f'!!!!!!!!!!!!!!!!!!!!!!!!2 {self}')
        # TODO cooperatively setting multiple different cache types?
        # do we need to use super() or something?
        if self.exists():
            if self.is_symlink():
                # BUG FIX: message was missing its f-prefix so '{self}'
                # appeared literally in the error
                raise TypeError(f'will not write meta on symlinks! {self}')
            # FIXME FIXME FIXME this needs to be written to absolutely
            # prevent the writing of new metadata onto an existing file
            # where the checksum differs, the old version needs to be
            # trashed before any of this is written, otherwise the old
            # metadata is lost >_<
            self.setxattrs(pathmeta.as_xattrs(self.xattr_prefix))
            if hasattr(self, '_meta'):  # prevent reading from in-memory store
                delattr(self, '_meta')

        else:
            # the glories of the inconsistencies and irregularities of python
            # you can't setattr using super() so yes you _do_ actually have to
            # implement a setter sometimes >_<
            super()._meta_setter(pathmeta, memory_only=memory_only)


EatCache._bind_flavours()
class SqliteCache(CachePath):
    """ a persistent store to back up the xattrs if they get wiped """

    def __init__(self, *args, meta=None, **kwargs):
        # NOTE(review): does not call super().__init__, presumably to skip
        # CachePath's meta requirements — confirm
        if meta is not None:
            self.meta = meta

    @property
    def meta(self):
        # in-memory only; the persistent sqlite getter is not implemented yet
        if hasattr(self, '_meta'):
            return self._meta

        #log.error('SqliteCache getter not implemented yet.')

    @meta.setter
    def meta(self, value):
        """ set meta """
        # NOTE(review): no-op stub — the assignment in __init__ therefore
        # never stores anything; confirm this is intentional
        #log.error('SqliteCache setter not implemented yet. Should probably be done in bulk anyway ...')


SqliteCache._bind_flavours()
class SymlinkCache(CachePath):
def __init__(self, *args, meta=None, **kwargs):
if meta is not None:
self.meta = meta
@property
def meta(self):
if hasattr(self, '_meta'):
return self._meta
if self.is_symlink():
if not self.exists(): # if a symlink exists it is something other than what we want
#assert pathlib.PurePosixPath(self.name) == self.readlink().parent.parent
return PathMeta.from_symlink(self)
else:
raise exc.PathExistsError(f'Target of symlink exists!\n{self} -> {self.resolve()}')
else:
return super().meta
@meta.setter