Skip to content

Commit b12a6b0

Browse files
authored
CA-399757: Add CAS style check for SR scan (#6113)
This is to reduce the chance of race between `SR.scan` and `VDI.db_introduce`. More details in the comment, but this won't fix the problem, but just makes it less likely to happen... I have seen a few instances of this happening now in SXM tests, so need to prioritise fixing this Trying to run some tests on this, but it's hard to know effective this is as storage migration tends to require lots of resources, and does not get run very often. Plus this issue is not very reproducible.
2 parents b782202 + 87ca90e commit b12a6b0

File tree

1 file changed

+44
-19
lines changed

1 file changed

+44
-19
lines changed

ocaml/xapi/xapi_sr.ml

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -786,26 +786,51 @@ let scan ~__context ~sr =
786786
SRScanThrottle.execute (fun () ->
787787
transform_storage_exn (fun () ->
788788
let sr_uuid = Db.SR.get_uuid ~__context ~self:sr in
789-
let vs, sr_info =
790-
C.SR.scan2 (Ref.string_of task)
791-
(Storage_interface.Sr.of_string sr_uuid)
792-
in
793-
let db_vdis =
794-
Db.VDI.get_records_where ~__context
795-
~expr:(Eq (Field "SR", Literal sr'))
796-
in
797-
update_vdis ~__context ~sr db_vdis vs ;
798-
let virtual_allocation =
799-
List.fold_left Int64.add 0L
800-
(List.map (fun v -> v.Storage_interface.virtual_size) vs)
789+
(* CA-399757: Do not update_vdis unless we are sure that the db was not
790+
changed during the scan. If it was, retry the scan operation. This
791+
change might be a result of the SMAPIv1 call back into xapi with
792+
the db_introduce call, for example.
793+
794+
Note this still suffers TOCTOU problem, but a complete operation is not easily
795+
implementable without rearchitecting the storage apis *)
796+
let rec scan_rec limit =
797+
let find_vdis () =
798+
Db.VDI.get_records_where ~__context
799+
~expr:(Eq (Field "SR", Literal sr'))
800+
in
801+
let db_vdis_before = find_vdis () in
802+
let vs, sr_info =
803+
C.SR.scan2 (Ref.string_of task)
804+
(Storage_interface.Sr.of_string sr_uuid)
805+
in
806+
let db_vdis_after = find_vdis () in
807+
if limit > 0 && db_vdis_after <> db_vdis_before then
808+
(scan_rec [@tailcall]) (limit - 1)
809+
else if limit = 0 then
810+
raise
811+
(Api_errors.Server_error
812+
(Api_errors.internal_error, ["SR.scan retry limit exceeded"])
813+
)
814+
else (
815+
update_vdis ~__context ~sr db_vdis_after vs ;
816+
let virtual_allocation =
817+
List.fold_left
818+
(fun acc v -> Int64.add v.Storage_interface.virtual_size acc)
819+
0L vs
820+
in
821+
Db.SR.set_virtual_allocation ~__context ~self:sr
822+
~value:virtual_allocation ;
823+
Db.SR.set_physical_size ~__context ~self:sr
824+
~value:sr_info.total_space ;
825+
Db.SR.set_physical_utilisation ~__context ~self:sr
826+
~value:(Int64.sub sr_info.total_space sr_info.free_space) ;
827+
Db.SR.remove_from_other_config ~__context ~self:sr ~key:"dirty" ;
828+
Db.SR.set_clustered ~__context ~self:sr ~value:sr_info.clustered
829+
)
801830
in
802-
Db.SR.set_virtual_allocation ~__context ~self:sr
803-
~value:virtual_allocation ;
804-
Db.SR.set_physical_size ~__context ~self:sr ~value:sr_info.total_space ;
805-
Db.SR.set_physical_utilisation ~__context ~self:sr
806-
~value:(Int64.sub sr_info.total_space sr_info.free_space) ;
807-
Db.SR.remove_from_other_config ~__context ~self:sr ~key:"dirty" ;
808-
Db.SR.set_clustered ~__context ~self:sr ~value:sr_info.clustered
831+
(* XXX Retry 10 times, and then give up. We should really expect to
832+
reach this retry limit though, unless something really bad has happened.*)
833+
scan_rec 10
809834
)
810835
)
811836

0 commit comments

Comments
 (0)