diff --git a/.github/fastfreeze-logo.png b/.github/fastfreeze-logo.png new file mode 100644 index 0000000..027106c Binary files /dev/null and b/.github/fastfreeze-logo.png differ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..07fd5ba --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,25 @@ +name: Build + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Build fastfreeze.tar.xz + run: scripts/build.sh + - name: Run tests + run: scripts/run_tests.sh + - name: Upload fastfreeze.tar.xz + uses: actions/upload-artifact@v2 + with: + name: fastfreeze.tar.xz + path: | + fastfreeze.tar.xz diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b9b3453 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +target/ +dist/ +*.tar.* diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a975ad5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,15 @@ +[submodule "deps/criu-image-streamer"] + path = deps/criu-image-streamer + url = https://github.com/twosigma/criu-image-streamer.git +[submodule "deps/libvirttime"] + path = deps/libvirttime + url = https://github.com/twosigma/libvirttime.git +[submodule "deps/libvirtcpuid"] + path = deps/libvirtcpuid + url = https://github.com/twosigma/libvirtcpuid.git +[submodule "deps/criu"] + path = deps/criu + url = https://github.com/twosigma/criu +[submodule "deps/set_ns_last_pid"] + path = deps/set_ns_last_pid + url = https://github.com/twosigma/set_ns_last_pid.git diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..e599032 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,503 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+[[package]] +name = "anyhow" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9a60d744a80c30fcb657dfe2c1b22bcb3e814c1a1e3674f32bf5820b570fbff" + +[[package]] +name = "arc-swap" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b585a98a234c46fc563103e9278c9391fde1f4e6850334da895d27edb9580f62" + +[[package]] +name = "autocfg" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + +[[package]] +name = "cc" +version = "1.0.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d87b23d6a92cd03af510a5ade527033f6aa6fa92161e2d5863a907d4c5e31d" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "chrono" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80094f509cf8b5ae86a4966a39b3ff66cd7e2a3e594accec3743ff3fabeab5b2" +dependencies = [ + "num-integer", + "num-traits", + "time", +] + +[[package]] +name = "clap" +version = "2.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" +dependencies = [ + "bitflags", + "textwrap", + "unicode-width", +] + +[[package]] +name = "fastfreeze" +version = "1.0.0" +dependencies = [ + "anyhow", + "chrono", + "hostname", + "lazy_static", + "libc", + "log", + "nix", + "rand", + "serde", + "serde_json", + "signal-hook", + "structopt", + "url", +] + +[[package]] +name = "getrandom" +version = "0.1.14" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hostname" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" +dependencies = [ + "libc", + "match_cfg", + "winapi", +] + +[[package]] +name = "idna" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "itoa" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99e85c08494b21a9054e7fe1374a732aeadaff3980b6990b94bfd3a70f690005" + +[[package]] +name = "log" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "match_cfg" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" + +[[package]] 
+name = "matches" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" + +[[package]] +name = "nix" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50e4785f2c3b7589a0d0c1dd60285e1188adac4006e8abd6dd578e1567027363" +dependencies = [ + "bitflags", + "cc", + "cfg-if", + "libc", + "void", +] + +[[package]] +name = "num-integer" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6ea62e9d81a77cd3ee9a2a5b9b609447857f3d358704331e4ef39eb247fcba" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096" +dependencies = [ + "autocfg", +] + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "ppv-lite86" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74490b50b9fbe561ac330df47c08f3f33073d2d00c150f719147d7c54522fa1b" + +[[package]] +name = "proc-macro-error" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "syn-mid", + "version_check", +] + +[[package]] +name = "proc-macro2" 
+version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8872cf6f48eee44265156c111456a700ab3483686b3f96df4cf5481c89157319" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c1f4b0efa5fc5e8ceb705136bfee52cfdb6a4e3509f770b478cd6ed434232a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom", + "libc", + "rand_chacha", + "rand_core", + "rand_hc", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core", +] + +[[package]] +name = "ryu" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3d612bc64430efeb3f7ee6ef26d590dce0c43249217bddc62112540c7941e1" + +[[package]] +name = "serde" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36df6ac6412072f67cf767ebbde4133a5b2e88e76dc6187fa7104cd16f783399" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "9e549e3abf4fb8621bd1609f11dfc9f5e50320802273b12f3811a67e6716ea6c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7894c8ed05b7a3a279aeb79025fdec1d3158080b75b98a08faf2806bb799edd" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "signal-hook" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c0893246f276ba1aac4983fb8711dad108e2886fd76bf618a382ab4e30e5bec" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f478ede9f64724c5d173d7bb56099ec3e2d9fc2774aac65d34b8b890405f41" +dependencies = [ + "arc-swap", + "libc", +] + +[[package]] +name = "smallvec" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4" + +[[package]] +name = "structopt" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef" +dependencies = [ + "clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "410a7488c0a728c7ceb4ad59b9567eb4053d02e8cc7f5c0e0eeeb39518369213" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "syn-mid" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "time" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +dependencies = [ + "matches", +] + +[[package]] +name = "unicode-normalization" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5479532badd04e128284890390c1e876ef7a993d0570b3597ae43dfa1d59afa4" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" + +[[package]] +name = "unicode-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" + +[[package]] +name = "unicode-xid" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" + +[[package]] +name = "url" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d4a8476c35c9bf0bbce5a3b23f4106f79728039b726d292bb93bc106787cb" +dependencies = [ + "idna", + 
"matches", + "percent-encoding", +] + +[[package]] +name = "version_check" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "078775d0255232fb988e6fccf26ddc9d1ac274299aaedcedce21c6f72cc533ce" + +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "winapi" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..26ff669 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "fastfreeze" +version = "1.0.0" +authors = ["Nicolas Viennot "] +edition = "2018" +description = "Turn-key solution for checkpoint/restore" +license = "Apache-2.0" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +structopt = { version = "0.3", default-features = false } +anyhow = "1.0" +log = { version = "0.4", features = ["std"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +libc = 
"0.2" +nix = "0.17" +signal-hook = "0.1" +lazy_static = "1.4" +rand = "0.7" +url = "2.1" +chrono = "0.4" +hostname = "0.3" + +[profile.release] +lto = true +codegen-units = 1 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..bad1cff --- /dev/null +++ b/Makefile @@ -0,0 +1,113 @@ +# Copyright 2020 Two Sigma Investments, LP. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#BUILD := debug +BUILD := release + +BUILD_FLAGS := +ifeq ($(BUILD),release) + BUILD_FLAGS += --release +endif + +CARGO := $(HOME)/.cargo/bin/cargo +ifeq (,$(wildcard $(CARGO))) + CARGO := cargo +endif + +SRCS := $(wildcard src/*.rs) Cargo.toml + +all: fastfreeze.tar.xz + +deps/%: + $(MAKE) -C deps + +DIST_DIR = dist +DIST_LIB_DIR = $(DIST_DIR)/lib + +$(DIST_DIR): + mkdir -p $@ + +$(DIST_LIB_DIR): + mkdir -p $@ + +DIST_BINS := \ + deps/criu/criu/criu \ + deps/criu-image-streamer/criu-image-streamer \ + deps/set_ns_last_pid/set_ns_last_pid \ + target/$(BUILD)/fastfreeze \ + $(shell which pv) \ + $(shell which lz4) \ + $(shell which zstd) \ + +DIST_LIBS := \ + deps/libvirtcpuid/ld-virtcpuid.so \ + deps/libvirtcpuid/libvirtcpuid.so \ + deps/libvirttime/libvirttime.so \ + +DIST_MISC := fastfreeze_wrapper.sh \ + +define add_dist_file +$(eval SRC_FILE := $(1)) +$(eval DST_DIR := $(2)) +$(eval DST_FILE := $(DST_DIR)/$(notdir $(SRC_FILE))) + +DIST_FILES += $(DST_FILE) +$(DST_FILE): $(SRC_FILE) | $(DST_DIR) + cp -aL $$< $$@ +endef + +$(foreach path,$(DIST_BINS),$(eval \ + $(call add_dist_file,$(path),$(DIST_DIR)) \ + $(eval DIST_ELF_FILES += $(DST_FILE)) \ +)) + +$(foreach path,$(DIST_LIBS),$(eval \ + $(call add_dist_file,$(path),$(DIST_LIB_DIR)) \ + $(eval DIST_ELF_FILES += $(DST_FILE)) \ +)) + +$(foreach path,$(DIST_MISC),$(eval \ + $(call add_dist_file,$(path),$(DIST_DIR)) \ +)) + +target/$(BUILD)/fastfreeze: $(SRCS) + $(CARGO) build $(BUILD_FLAGS) + +.PHONY: test clean extract-libs + +clean: + rm -rf target $(DIST_DIR) + @echo Dependencies are not cleaned. 
You may do so with: make -C deps clean + +extract-libs: $(DIST_ELF_FILES) | $(DIST_LIB_DIR) + ldd $(DIST_ELF_FILES) | sed 's/.*=> \(.*\) .*/\1/;t;d' | \ + sort -u | \ + xargs realpath -s | \ + grep -v $(DIST_LIB_DIR)/ | \ + xargs -I'{}' cp -L '{}' $(DIST_LIB_DIR)/ + for file in $$(echo $(DIST_ELF_FILES) $(DIST_LIB_DIR)/* | \ + tr " " "\n" | sort -u | grep -v 'ld-.*.so'); do \ + RPATH=`echo $$file | sed -E 's|^$(DIST_DIR)|| ; s|[^/]+/|../|g ; s|[^/]+$$|lib| ; s|^|$$ORIGIN|'`; \ + : " \ + We are are doing setcap on criu's binary, and that makes it \ + secure. Some distro don't interpret $ORIGIN on secure binaries \ + So we'll hard code /opt/fastfreeze/lib as RPATH \ + "; \ + RPATH=/opt/fastfreeze/lib; \ + echo "Patching rpath=$$RPATH of $$file"; \ + patchelf --set-rpath $$RPATH $$file ;\ + done + +fastfreeze.tar.xz: $(DIST_FILES) extract-libs Makefile + tar --transform 's|^$(DIST_DIR)|fastfreeze|' -cJf $@ $(DIST_DIR) diff --git a/README.md b/README.md new file mode 100644 index 0000000..817b79e --- /dev/null +++ b/README.md @@ -0,0 +1,440 @@ +![Build](https://github.com/twosigma/fastfreeze/workflows/Build/badge.svg) + +

+ +

+ +## Introduction + +_FastFreeze_ enables checkpoint/restore for applications running in Linux +containers. It uploads/downloads checkpoint images to AWS S3 and Google Storage, +provides a friendly CLI to job systems, and does not require elevated privileges +(such as `CAP_SYS_ADMIN`). + +The primary use-case of FastFreeze is to make long running and resource +intensive applications resilient to failure. This is useful for a variety of +reasons such as reducing compute waste, or lowering application completion time. +This makes Google's preemptible VM and Amazon Spot VM offerings more attractive. +We are exploring other use-cases such as JVM memory ballooning, warm-boots, and +jupyter integration. + +FastFreeze is powered by the [CRIU](https://criu.org/) engine. + +### Usage in a nutshell + +1. **Start** the application via FastFreeze with the `run` command in an empty + Linux container (e.g., Kubernetes, Docker). + + ``` + fastfreeze run --image-url s3://fastfreeze-images/job-1234.ff -- app.sh [args...] + ``` + +2. **Checkpoint** the application with the `checkpoint` command. + This persists the state of the application into the AWS S3 location we + provided at step 1. The application is terminated upon successful checkpoint. + + ``` + fastfreeze checkpoint + ``` + +3. **Restore** the application by running the same command as step 1, + possibly on another machine. The `run` command checks if the image is + present. If so, it restores the application. If not, it runs the application + from scratch. This makes FastFreeze ideal to integrate with existing job + systems that retry commands until the job succeeds. + + + ``` + # same as step 1 + ``` + +## Features + +FastFreeze includes the following high-level features: + +* **Unprivileged**: FastFreeze does not need privileges like `CAP_SYS_ADMIN` to + operate. We use a modified version of CRIU to accomplish this. 
In addition, we + use [set_ns_last_pid](https://github.com/twosigma/set_ns_last_pid) to control + PIDs by cycling through PIDs at a rate of 100,000/s by essentially doing a + fork bomb, until we reach the PID that we desire. + +* **Fast**: FastFreeze uses + [criu-image-streamer](https://github.com/checkpoint-restore/criu-image-streamer) + to perform fast checkpointing at speeds of up to 15GB/s, given enough CPU and + network bandwidth. This makes Google's preemptible VM and Amazon Spot VM + offerings more attractive. FastFreeze can checkpoint and evacuate large + applications (e.g., using 30GB of memory) within the tight eviction deadlines + (~30secs). + +* **Low overhead**: FastFreeze needs less than 100MB of memory to perform a + checkpoint or a restore. This memory headroom must be reserved in the + container in addition to what the application uses. Note that the standard S3 + and GCS uploaders (`aws s3` and `gsutil`) tend to use a lot of memory (500MB) + due to the fact that they are written in Python and use large buffers. In the + future, we plan to open-source our custom uploaders that can be used with + FastFreeze. + +* **Compression**: Checkpoint images can be compressed on the fly with lz4 or + zstd. Setting the `--cpu-budget` option when checkpointing provides ways to + control the compression algorithm. Compression is parallelized for optimal + performance. In the future, we plan to add encryption to images. + +* **CPUID virtualization**: FastFreeze enables CPU virtualization with + [libvirtcpuid](https://github.com/twosigma/libvirtcpuid). This enables the + migration of applications within a heterogeneous datacenter. For example, + an application started on a machine that supports transactional memory can + be migrated to a host that does not. 
+ +
* **Time virtualization**: FastFreeze implements time virtualization in + userspace to offset the `CLOCK_MONOTONIC` when migrating to other machines + with [libvirttime](https://github.com/twosigma/libvirttime). + This feature is crucial for Java programs. Note there is a time namespace + available in the kernel, but FastFreeze does not use it as it requires + `CAP_SYS_ADMIN`. + +* **File system**: FastFreeze checkpoints and restores the files used by the + application such as logs, and other temporary files. These files are not + automatically detected, but rather, the user must specify the paths (files or + directories) that must be preserved via the `--preserve-path` option. + +* **Metrics**: FastFreeze can be configured to emit metrics to an external + service to collect checkpoint/restore stats. This is helpful to track the SLA + of FastFreeze. + +### Non-root limitations + +FastFreeze does not use privileged operations. This creates the following drawbacks: + +* FastFreeze must run within a Linux container (e.g., Kubernetes, Docker). This + guarantees that there are no PID conflicts. The container image must remain + unchanged when migrating an application to a different container. + +* The network connections are dropped upon restore. We rely on the application + to be tolerant to network failures and reconnect to needed services. + +* The `/proc/self/exe` symlink is not restored and will point to the criu binary. + When using gdb to attach to a restored program, one must pass the real + executable path to gdb as such: `gdb -p PID /path/to/exe`. + +* Controlling PIDs without `CAP_SYS_ADMIN` can be slow if + `/proc/sys/kernel/pid_max` is high. We recommend setting a value lower than + 100,000. + +* Memory mapped files that have been deleted are not supported. + +* As FastFreeze assumes operating within a Linux container, it does not + checkpoint/restore cgroups, seccomp, and user capabilities. We also do not + support System V IPC. 
+ Create an [issue](https://github.com/twosigma/fastfreeze/issues/new) if you + need IPC support. + +### Supported Applications + +FastFreeze supports most Linux applications, with some restrictions: + +* GPUs and external devices are not supported. + +* Applications that rely on host-dependent environment variables (like hostname, + or job id) may have issues when migrated to a new host. Avoid relying on such + variables, or caching host-dependent information. + +* Applications that use ptrace cannot be checkpointed (e.g., running under + `strace`). + +* Due to CPUID virtualization, Only x86 64-bits applications running with GNU + libc are supported. In practice, that means no musl libc, so no alpine docker. + +* Secure binaries are not supported. For example, an application that runs a + script with `sudo` is a problem. + +* On some systems, apparmor can prevent the execution of certain application + such as `man` because we relocate the system ld.so at `/var/fastfreeze/run` + which may not be in the white-listed path of executable mmap files. This is + not an issue in practice. + +* FastFreeze only supports a single application execution at a time within a + container. An application can nevertheless be comprised of many processes and + threads. To run two instances of FastFreeze, one must use two separate + containers. + +### Non-features + +* Checkpoint images are not managed by FastFreeze. Pruning old images is not in + the scope of FastFreeze. + +## Usage + +### Installation + +FastFreeze is distributed in a self-contained 4MB package that needs to be +extracted in `/opt/fastfreeze`. + +The following shows an example of the installation of FastFreeze in a Debian +Docker image. 
+ +
```dockerfile +FROM debian:9 + +RUN apt-get update +RUN apt-get install -y curl xz-utils libcap2-bin + +RUN set -ex; \ + curl -SL https://github.com/twosigma/fastfreeze/releases/download/v1.0.0/fastfreeze-1.0.0.tar.xz | \ + tar xJf - -C /opt; \ + ln -s /opt/fastfreeze/fastfreeze_wrapper.sh /usr/local/bin/fastfreeze; \ + fastfreeze install; \ + setcap cap_sys_ptrace+eip /opt/fastfreeze/criu +``` + +The `install` command overrides the system loader `/lib64/ld-linux-x86-64.so.2`, +and creates `/var/fastfreeze` where files such as logs are kept. Note that +replacing the system loader is useful even when not doing CPUID virtualization. +It facilitates the injection of the time virtualization library into all processes. + +The `setcap` command adds the `CAP_SYS_PTRACE` capability to CRIU. +This may or may not be needed depending on the yama configuration +`/proc/sys/kernel/yama/ptrace_scope` (see `man ptrace(2)`), or if Kubernetes is +configured with `CAP_SYS_PTRACE` as an ambient capability. + +### Tutorial + +You may try out FastFreeze with the following: + +```bash +# First, save the previously suggested Dockerfile from the Installation section +# in the current directory +$ cat > Dockerfile + +# Then, build the docker image +$ docker build . -t fastfreeze + +# 1) Run the application for the first time +$ docker run \ + --rm -it \ + --user nobody \ + --cap-add=cap_sys_ptrace \ + --name ff \ + --mount type=bind,source=/tmp,target=/tmp \ + fastfreeze:latest \ + fastfreeze run --image-url file:/tmp/ff-test -- \ + bash -c 'for i in $(seq 100); do echo $i; sleep 1; done' + +# The application is running. 
We should see on the terminal: +# [ff.run] (0.001s) Time is Sat, 15 Aug 2020 05:21:41 +0000 +# [ff.run] (0.001s) Host is 44f6ce3d5b4a +# [ff.run] (0.001s) Invocation ID is Jg9qyV +# [ff.run] (0.012s) Fetching image manifest for file:/tmp/ff-test +# [ff.run] (0.014s) Image manifest not found, running application from scratch +# [ff.run] (0.030s) Application is ready, started from scratch +# 1 +# 2 +# 3 +# 4 + +# 2) In another terminal, we invoke the checkpoint command +$ docker exec ff fastfreeze checkpoint + +# We should see: +# [ff.checkpoint] (0.000s) Time is Sat, 15 Aug 2020 05:21:54 +0000 +# [ff.checkpoint] (0.000s) Host is 44f6ce3d5b4a +# [ff.checkpoint] (0.000s) Invocation ID is aaNN7y +# [ff.checkpoint] (0.000s) Checkpointing application to file:/tmp/ff-test (num_shards=4 compressor=Lz4 prefix=aaNN7y) +# tar: Removing leading `/' from member names +# [ff.checkpoint] (0.014s) Uncompressed image size is 1 MiB, rate: 132 MiB/s +# [ff.checkpoint] (0.017s) Checkpoint to file:/tmp/ff-test complete. Took 0.0s + +# The first terminal should show: +# [ff.run] (13.012s) Exiting with exit_code=137: Application caught fatal signal SIGKILL +# +# The application is now checkpointed. We can inspect the image in /tmp/ff-test +# We see that the image is split into 4 different pieces. This split is +# used to parallelize checkpointing, improving performance. 
+$ ls -lh /tmp/ff-test + +# total 116K +# -rw-r--r-- 1 nobody nogroup 22K Aug 15 05:21 aaNN7y-1.ffs +# -rw-r--r-- 1 nobody nogroup 19K Aug 15 05:21 aaNN7y-2.ffs +# -rw-r--r-- 1 nobody nogroup 42K Aug 15 05:21 aaNN7y-3.ffs +# -rw-r--r-- 1 nobody nogroup 23K Aug 15 05:21 aaNN7y-4.ffs +# -rw-r--r-- 1 nobody nogroup 82 Aug 15 05:21 manifest.json + +# 3) We restore the application by running the same command as in 1) +$ docker run \ + --rm -it \ + --user nobody \ + --cap-add=cap_sys_ptrace \ + --name ff \ + --mount type=bind,source=/tmp,target=/tmp \ + fastfreeze:latest \ + fastfreeze run --image-url file:/tmp/ff-test -- \ + bash -c 'for i in $(seq 100); do echo $i; sleep 1; done' + +# We see in the terminal; +# [ff.run] (0.000s) Time is Sat, 15 Aug 2020 05:29:53 +0000 +# [ff.run] (0.000s) Host is 4259e670e092 +# [ff.run] (0.000s) Invocation ID is V0qRYI +# [ff.run] (0.015s) Fetching image manifest for file:/tmp/ff-test +# [ff.run] (0.017s) Restoring application +# [ff.run] (0.126s) Uncompressed image size is 1 MiB, rate: 134 MiB/s +# [ff.run] (0.157s) Application is ready, restore took 0.2s +# 5 +# 6 +# 7 +# 8 +``` + +In this example, we used the local file system to store the checkpoint image, +but in practice one would use something like AWS S3, or GCS. + +## Detailed Usage + +Below is shown a synopsis of the FastFreeze available commands. + +``` +USAGE: + fastfreeze + +SUBCOMMANDS: + run Run application. If a checkpoint image exists, the application is + restored. Otherwise, the application is run from scratch + checkpoint Perform a checkpoint of the running application + extract Extract a FastFreeze image to local disk + wait Wait for checkpoint or restore to finish + install Install FastFreeze in the specified directory +``` + +### run + +Run application. If a checkpoint image exists, the application is restored. +Otherwise, the application is run from scratch + +``` +USAGE: + fastfreeze run [OPTIONS] --image-url [--] [app-args]... 
+ +OPTIONS: + --image-url Image URL. S3, GCS and local filesystem are supported: + * s3://bucket_name/image_path + * gs://bucket_name/image_path + * file:image_path + --on-app-ready Shell command to run once the application is running + --preserve-path ... Dir/file to include in the checkpoint image. + May be specified multiple times. + Multiple paths can also be specified colon separated + --no-restore Always run the app from scratch. Useful to ignore a faulty image + --allow-bad-image-version Allow restoring of images that don't match the version we expect + --leave-stopped Leave application stopped after restore, useful for debugging. + Has no effect when running the app from scratch + -v, --verbose Verbosity. Can be repeated + +ARGS: + ... Application arguments, used when running the app from scratch. Ignored during restore + +ENVS: + FF_APP_PATH The PATH to use for the application + FF_APP_LD_LIBRARY_PATH The LD_LIBRARY_PATH to use for the application + FF_APP_VIRT_CPUID_MASK The CPUID mask to use. See libvirtcpuid documentation for more details + FF_APP_INJECT_ Additional environment variables to inject to the application and its children. + For example, FF_APP_INJECT_LD_PRELOAD=/opt/lib/libx.so + FF_METRICS_RECORDER When specified, FastFreeze invokes the specified program to report metrics. + The metrics are formatted in JSON and passed as first argument + CRIU_OPTS Additional arguments to pass to CRIU, whitespace separated + S3_CMD Command to access AWS S3. Defaults to 'aws s3' + GS_CMD Command to access Google Storage3. Defaults to 'gcs_streamer' + +EXIT CODES: + 171 A failure happened during restore, or while fetching the image manifest. 
+ Retrying with --no-restore will avoid that failure + 170 A failure happened before the application was ready + 128+sig_nr The application caught a fatal signal corresponding to `sig_nr` + exit_code The application exited with `exit_code` +``` + + +### checkpoint + +Perform a checkpoint of the running application + +``` +USAGE: + fastfreeze checkpoint [OPTIONS] + +OPTIONS: + --leave-running Leave application running after checkpoint + --image-url Image URL, defaults to the value used during the run command + --preserve-path ... Dir/file to include in the image in addition to the ones specified during the + run command. May be specified multiple times. Multiple paths can also be specified + colon separated + --num-shards Level of parallelism. Split the image in multiple shards [default: 4] + --cpu-budget Amount of CPU at disposal. Possible values are [low, medium, high]. Currently, + `low` skips compression, `medium` uses lz4, and high uses zstd [default: medium] + -v, --verbose Verbosity. Can be repeated + +ENVS: + FF_METRICS_RECORDER When specified, FastFreeze invokes the specified program to report metrics. + The metrics are formatted in JSON and passed as first argument + CRIU_OPTS Additional arguments to pass to CRIU, whitespace separated + S3_CMD Command to access AWS S3. Defaults to 'aws s3' + GS_CMD Command to access Google Storage3. Defaults to 'gcs_streamer' +``` + +### extract + +Extract a FastFreeze image to local disk + +``` +USAGE: + fastfreeze extract [OPTIONS] --image-url + +OPTIONS: + -i, --image-url Image URL, which can also be a regular local path + -o, --output-dir Output directory where to extract the image. + Defaults to the last path component of image-url + --allow-bad-image-version Allow restoring of images that don't match the version we expect + -v, --verbose Verbosity. Can be repeated + +ENVS: + S3_CMD Command to access AWS S3. Defaults to 'aws s3' + GS_CMD Command to access Google Storage3. 
Defaults to 'gcs_streamer' +``` + +### wait + +Wait for checkpoint or restore to finish + +``` +USAGE: + fastfreeze wait [OPTIONS] + +OPTIONS: + -t, --timeout Fail after some specified number of seconds. Decimals are allowed + -v, --verbose Verbosity. Can be repeated +``` + + +### install + +Install FastFreeze, mostly to setup virtualization + +``` +USAGE: + fastfreeze install [OPTIONS] + +OPTIONS: + -v, --verbose Verbosity. Can be repeated +``` + + +## Acknowledgments +* Author: Nicolas Viennot [@nviennot](https://github.com/nviennot) +* Reviewer: Peter Burka [@pburka](https://github.com/pburka) +* Developed as a [Two Sigma Open Source](https://opensource.twosigma.com) initiative + +License +------- + +FastFreeze is licensed under the +[Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). diff --git a/deps/.gitignore b/deps/.gitignore new file mode 100644 index 0000000..05632f9 --- /dev/null +++ b/deps/.gitignore @@ -0,0 +1 @@ +.deps-* diff --git a/deps/Makefile b/deps/Makefile new file mode 100644 index 0000000..9fb6677 --- /dev/null +++ b/deps/Makefile @@ -0,0 +1,54 @@ +# The change of these two variables must be reflected in ../src/consts.rs +INTERPOSED_LD_PATH=/var/fastfreeze/run/ld-linux-x86-64.so.2 +LD_INJECT_ENV_PATH=/var/fastfreeze/ld-inject.env + +RUSTUP_DEP_FILE=.deps-rustup-$(shell hostname) +$(RUSTUP_DEP_FILE): + which rustc || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + touch $@ + +# debian/ubuntu dependencies +DEPS_FILE=.deps-debian-v1-$(shell hostname) +$(DEPS_FILE): + [ -e criu/Makefile ] || (git submodule sync && git submodule update --init) + sudo apt-get install -y --no-install-recommends \ + build-essential pkg-config \ + libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler \ + libbsd-dev iproute2 libcap-dev libnl-3-dev libnet-dev libaio-dev \ + patchelf pv liblz4-tool zstd jq wget + touch $@ + +BUILDS := \ + build_criu \ + build_libvirtcpuid \ + build_libvirttime \ + 
build_criu_image_streamer \ + build_set_ns_last_pid \ + +.PHONY: $(BUILDS) clean + +all: $(BUILDS) + +build_criu: | $(DEPS_FILE) + $(MAKE) -C criu criu + +build_libvirtcpuid: | $(DEPS_FILE) + $(MAKE) -C libvirtcpuid \ + INTERPOSED_LD_PATH=$(INTERPOSED_LD_PATH) \ + LD_INJECT_ENV_PATH=$(LD_INJECT_ENV_PATH) + +build_libvirttime: | $(DEPS_FILE) + $(MAKE) -C libvirttime + +build_criu_image_streamer: | $(DEPS_FILE) $(RUSTUP_DEP_FILE) + $(MAKE) -C criu-image-streamer + +build_set_ns_last_pid: | $(DEPS_FILE) + $(MAKE) -C set_ns_last_pid + +clean: + $(MAKE) -C criu mrproper + $(MAKE) -C libvirtcpuid clean + $(MAKE) -C libvirttime clean + $(MAKE) -C criu-image-streamer clean + $(MAKE) -C set_ns_last_pid clean diff --git a/deps/criu b/deps/criu new file mode 160000 index 0000000..8737c86 --- /dev/null +++ b/deps/criu @@ -0,0 +1 @@ +Subproject commit 8737c86f7299bb4619f1a09d75e80a0c8ebb6d2b diff --git a/deps/criu-image-streamer b/deps/criu-image-streamer new file mode 160000 index 0000000..a8fd886 --- /dev/null +++ b/deps/criu-image-streamer @@ -0,0 +1 @@ +Subproject commit a8fd886838d95756efa31161fa40248148da200c diff --git a/deps/libvirtcpuid b/deps/libvirtcpuid new file mode 160000 index 0000000..1f47db8 --- /dev/null +++ b/deps/libvirtcpuid @@ -0,0 +1 @@ +Subproject commit 1f47db86ea91903b6c018b3639233d68abdbb349 diff --git a/deps/libvirttime b/deps/libvirttime new file mode 160000 index 0000000..db8a06b --- /dev/null +++ b/deps/libvirttime @@ -0,0 +1 @@ +Subproject commit db8a06bf56cd5c821ed72a16e4ee2e57ec3cd2e8 diff --git a/deps/set_ns_last_pid b/deps/set_ns_last_pid new file mode 160000 index 0000000..80ab8d0 --- /dev/null +++ b/deps/set_ns_last_pid @@ -0,0 +1 @@ +Subproject commit 80ab8d0a5b125c246f4e6f7927adbe90d4ca8931 diff --git a/scripts/Dockerfile.build b/scripts/Dockerfile.build new file mode 100644 index 0000000..6adcd47 --- /dev/null +++ b/scripts/Dockerfile.build @@ -0,0 +1,34 @@ +FROM debian:9 + +WORKDIR /src/fastfreeze + +# Few essential things before we can 
get going +RUN apt-get update +RUN apt-get install -y build-essential pkg-config sudo curl git python3 + +# Build dependencies (CRIU, rust toolchain, libvirtcpuid, etc.) +COPY deps deps +RUN make -C deps clean && make -C deps -j4 +ENV CARGO=/root/.cargo/bin/cargo + +# Build FastFreeze Rust dependencies +# This enables fast image rebuild when making code modification +COPY Cargo.lock . +COPY Cargo.toml . +RUN set -ex; \ + mkdir src; \ + echo "" > src/lib.rs; \ + echo "fn main() {}" > src/main.rs; \ + $CARGO test; \ + $CARGO build --release; + +# Build FastFreeze +COPY src src +RUN touch src/lib.rs src/main.rs +RUN $CARGO test +RUN $CARGO build --release + +# Package FastFreeze +COPY Makefile . +COPY scripts/fastfreeze_wrapper.sh . +RUN make diff --git a/scripts/Dockerfile.test b/scripts/Dockerfile.test new file mode 100644 index 0000000..3efdce5 --- /dev/null +++ b/scripts/Dockerfile.test @@ -0,0 +1,16 @@ +FROM debian:9 + +RUN apt-get update +RUN apt-get install -y xz-utils libcap2-bin + +#RUN apt-get install -y strace procps elfutils gdb binutils vim patchelf + +COPY fastfreeze.tar.xz /tmp + +RUN set -ex; \ + tar xf /tmp/fastfreeze.tar.xz -C /opt; \ + ln -s /opt/fastfreeze/fastfreeze_wrapper.sh /usr/local/bin/fastfreeze; \ + fastfreeze install; \ + setcap cap_sys_ptrace+eip /opt/fastfreeze/criu + +WORKDIR /opt/fastfreeze diff --git a/scripts/build.sh b/scripts/build.sh new file mode 100755 index 0000000..aefa20e --- /dev/null +++ b/scripts/build.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex + +SCRIPT_DIR=$(dirname -- "$(readlink -f -- "$0")") +cd $SCRIPT_DIR/.. + +docker build -f scripts/Dockerfile.build . -t fastfreeze-build +docker run fastfreeze-build cat /src/fastfreeze/fastfreeze.tar.xz > ./fastfreeze.tar.xz +docker build -f scripts/Dockerfile.test . 
-t fastfreeze-test diff --git a/scripts/fastfreeze_wrapper.sh b/scripts/fastfreeze_wrapper.sh new file mode 100755 index 0000000..d9719ee --- /dev/null +++ b/scripts/fastfreeze_wrapper.sh @@ -0,0 +1,24 @@ +#!/bin/sh +set -e + +FF_DIR=$(dirname -- "$(readlink -f -- "$0")") + +# Pass the original PATH and LD_LIBRARY_PATH down to the application +export FF_APP_PATH=$PATH +export FF_APP_LD_LIBRARY_PATH=$LD_LIBRARY_PATH + +# Override the PATH and LD_LIBRARY_PATH that fastfreeze should use +export LD_LIBRARY_PATH=$FF_DIR/lib:$LD_LIBRARY_PATH +export PATH=$FF_DIR:$PATH + +# You may set the following environment variables +# FF_APP_VIRT_CPUID_MASK The CPUID mask to use. See libvirtcpuid documentation for more details +# FF_APP_INJECT_ Additional environment variables to inject to the application and its children. +# For example, FF_APP_INJECT_LD_PRELOAD=/opt/lib/libx.so +# FF_METRICS_RECORDER When specified, FastFreeze invokes the specified program to report metrics. +# The metrics are formatted in JSON and passed as first argument +# CRIU_OPTS Additional arguments to pass to CRIU, whitespace separated +# S3_CMD Command to access AWS S3. Defaults to 'aws s3' +# GS_CMD Command to access Google Storage. Defaults to 'gsutil' + +exec $FF_DIR/fastfreeze "$@" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh new file mode 100755 index 0000000..7efe4c2 --- /dev/null +++ b/scripts/run_tests.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -ex + +# Before running this, run ./build.sh +# TODO We need a bit more tests. Perhaps using something like Python would make +# sense for this. + +SCRIPT_DIR=$(dirname -- "$(readlink -f -- "$0")") +cd $SCRIPT_DIR/.. 
+ +IMAGE_DIR=/tmp/ff-test-images +sudo rm -rf $IMAGE_DIR +mkdir -p $IMAGE_DIR +chmod 1777 $IMAGE_DIR # /tmp like permissions + +docker stop ff || true + +docker run \ + --rm \ + --user nobody \ + --cap-add=cap_sys_ptrace \ + --name ff \ + --mount type=bind,source=$IMAGE_DIR,target=/images \ + fastfreeze-test:latest \ + fastfreeze run --image-url file:/images/test-1 sleep 30d & +sleep 2 # wait for app started + +# Forget to put cap-add, and get Permission Denied + +docker exec ff fastfreeze checkpoint + +wait + +docker run \ + --rm \ + --user nobody \ + --cap-add=cap_sys_ptrace \ + --name ff \ + --mount type=bind,source=$IMAGE_DIR,target=/images \ + fastfreeze-test:latest \ + fastfreeze run --image-url file:/images/test-1 sleep 30d & +sleep 2 # wait for app started + +docker exec ff fastfreeze checkpoint + +wait diff --git a/src/cli/checkpoint.rs b/src/cli/checkpoint.rs new file mode 100644 index 0000000..113408b --- /dev/null +++ b/src/cli/checkpoint.rs @@ -0,0 +1,256 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, Context}; +use std::{ + os::unix::io::AsRawFd, + collections::HashSet, + path::{Path, PathBuf}, + time::Duration, +}; +use nix::{ + poll::{PollFd, PollFlags}, + sys::signal::{self, killpg}, + unistd::Pid, +}; +use structopt::StructOpt; +use serde::Serialize; +use crate::{ + consts::*, + store, + image::{ImageManifest, Compressor, shard, CpuBudget}, + process::{Command, ProcessExt, ProcessGroup, Stdio}, + metrics::{with_metrics, emit_metrics}, + util::poll_nointr, + image_streamer::{Stats, ImageStreamer}, + lock::with_checkpoint_restore_lock, + criu, + filesystem, + virt, +}; +use super::run::AppConfig; + + +/// Perform a checkpoint of the running application +#[derive(StructOpt, PartialEq, Debug, Serialize)] +#[structopt(after_help("\ +ENVS: + FF_METRICS_RECORDER When specified, FastFreeze invokes the specified program to report metrics. + The metrics are formatted in JSON and passed as first argument + CRIU_OPTS Additional arguments to pass to CRIU, whitespace separated + S3_CMD Command to access AWS S3. Defaults to 'aws s3' + GS_CMD Command to access Google Storage. Defaults to 'gsutil'" +))] +pub struct Checkpoint { + /// Image URL, defaults to the value used during the run command + #[structopt(long)] + image_url: Option, + + /// Dir/file to include in the image in addition to the ones specified during the run command. + /// May be specified multiple times. Multiple paths can also be specified colon separated. + // require_delimiter is set to avoid clap's non-standard way of accepting lists. + #[structopt(long="preserve-path", name="path", require_delimiter=true, value_delimiter=":")] + preserved_paths: Vec, + + /// Leave application running after checkpoint + #[structopt(long)] + leave_running: bool, + + /// Level of parallelism. Split the image in multiple shards. + // We use a default of 4 shards to benefit from some parallelism. + // It should be set to something related to the number of CPUs available. 
+ #[structopt(long, default_value="4")] + num_shards: u32, + + /// Amount of CPU at disposal. Possible values are [low, medium, high]. + /// Currently, `low` skips compression, `medium` uses lz4, and + /// high uses zstd. + #[structopt(long, default_value="medium")] + cpu_budget: CpuBudget, + + /// Verbosity. Can be repeated + #[structopt(short, long, parse(from_occurrences))] + pub verbose: u8, +} + +fn is_app_running() -> bool { + Path::new("/proc").join(APP_ROOT_PID.to_string()).exists() +} + +pub fn do_checkpoint(opts: Checkpoint) -> Result { + let Checkpoint { + image_url, num_shards, cpu_budget, + preserved_paths, leave_running, verbose: _, + } = opts; + + // We override TMPDIR with a safe location. The uploader (or metrics CLI) + // may create a tmp file (e.g., bash script using here documents). This + // would cause tar to fail as it detects changes in /tmp. + // `NO_PRESERVE_FF_DIR` is excluded from the list of paths to preserve. + std::env::set_var("TMPDIR", &*NO_PRESERVE_FF_DIR); + + let mut preserved_paths: HashSet<_> = preserved_paths.into_iter().collect(); + + let config = AppConfig::restore()?; + + // If the image_url is not supplied, we use the one that we stashed during + // the run operation. + let image_url = image_url.unwrap_or(config.image_url); + + // We emit a "checkpoint_start" event to make it easier to track down + // containers that vanish during checkpoints. We don't wait for the metrics + // process to complete, it would delay checkpointing. + let _metrics_p_reaper = { + let event = json!({"action": "checkpoint_start", "image_url": image_url}); + emit_metrics(event)?.map(|p| p.reap_on_drop()) + }; + + // As for preserved_paths, we join all the paths we know of. + // There is the downside of not being able to forget a path that was once preserved. + // The upside is that is less prone to bugs for users. 
+ preserved_paths.extend(config.preserved_paths); + + ensure!(is_app_running(), "Application is not running"); + + // The manifest contains the name of the shards, which are generated at random. + // We combine it with the store to generate the shard upload commands. + // A shard upload command is of the form: + // "lz4 -1 - - | aws s3 cp - s3://bucket/img/XXXXXX.ffs" + let img_manifest = ImageManifest::new(num_shards, Compressor::from(cpu_budget)); + let store = store::from_url(&image_url)?; + let shard_upload_cmds = shard::upload_cmds(&img_manifest, &*store); + + info!("Checkpointing application to {} (num_shards={} compressor={:?} prefix={})", + image_url, num_shards, img_manifest.compressor, img_manifest.shard_prefix); + + // `pgrp` monitors all our child processes. If one fails, the whole group fails + let mut pgrp = ProcessGroup::new()?; + let mut img_streamer = ImageStreamer::spawn_capture(num_shards as usize)?; + img_streamer.process.join(&mut pgrp); + + // Spawn the upload processes connected to the image streamer's output + for (upload_cmd, shard_pipe) in shard_upload_cmds.into_iter().zip(img_streamer.shard_pipes) { + Command::new_shell(&upload_cmd) + .stdin(Stdio::from(shard_pipe)) + .spawn()? + .join(&mut pgrp); + } + + // Wait for the imager socket to be ready. + img_streamer.progress.wait_for_socket_init()?; + + // Spawn the CRIU dump process. CRIU sends the image to the image streamer. + // CRIU will leave the application in a stopped state when done, + // so that we can continue tarring the filesystem. + // Note: it would be tempting to SIGCONT the application upon failures, but + // we should not. It's CRIU's responsability to do so. If it didn't SIGCONT + // the app, then something bad has happened, and it would be unsafe to let + // the application run in a bad state. + criu::spawn_dump()? 
+ .join_as_non_killable(&mut pgrp); + + // We want to start dumping the file system ASAP, but we must wait for the + // application to be stopped by CRIU, otherwise the filesystem might still + // be changing under us. We wait for the "checkpoint-start" message from the + // streamer progress pipe. + // We must also check for the CRIU process, otherwise, we could hang forever + while pgrp.try_wait_for_success()? { + let mut poll_fds = [ + PollFd::new(img_streamer.progress.fd, PollFlags::POLLIN), + PollFd::new(pgrp.sigchld_pipe.as_raw_fd(), PollFlags::POLLIN) + ]; + let timeout = -1; + poll_nointr(&mut poll_fds, timeout)?; + + // Check if we have something to read on the progress pipe. + // unwrap() is safe: we assume the kernel returns valid bits in `revents`. + if !poll_fds[0].revents().unwrap().is_empty() { + img_streamer.progress.wait_for_checkpoint_start()?; + break; + } + } + debug!("Checkpoint started, application is frozen"); + + { + // We save the current time of the application so we can resume time + // where we left off. The time config file goes on the file system. + // We also save the image_url and preserved paths. + let app_clock = virt::time::ConfigPath::default().read_current_app_clock()?; + ensure!(app_clock >= 0, "Computed app clock is negative: {}ns", app_clock); + debug!("App clock: {:.1}s", Duration::from_nanos(app_clock as u64).as_secs_f64()); + + let image_url = image_url.to_string(); + let preserved_paths = preserved_paths.clone(); + + let config = AppConfig { image_url, preserved_paths, app_clock }; + config.save()?; + } + + // We dump the filesystem with tar. The stdout of tar connects to + // criu-image-streamer, which incorporates the tarball into the checkpoint + // image. + debug!("Dumping filesystem"); + filesystem::spawn_tar(preserved_paths, img_streamer.tar_fs_pipe.unwrap())? + .wait_for_success()?; + debug!("Filesystem dumped. 
Finishing dumping processes"); + + // Wait for checkpoint to complete + pgrp.wait_for_success()?; + + let stats = img_streamer.progress.wait_for_stats()?; + stats.show(); + + if leave_running { + trace!("Resuming application"); + killpg(Pid::from_raw(APP_ROOT_PID), signal::SIGCONT) + .context("Failed to resume application")?; + } else { + // We kill the app later, once metrics are emitted. + } + + // At this point, all the shards are written successfully. We can now write + // the manifest file to the store. The manifest file existence indicates + // whether the image exists, so it must be written at the very end. + debug!("Writing image manifest"); + img_manifest.persist_to_store(&*store) + .with_context(|| format!("Failed to upload image manifest at {}", image_url))?; + + info!("Checkpoint to {} complete. Took {:.1}s", + image_url, START_TIME.elapsed().as_secs_f64()); + + Ok(stats) +} + +impl super::CLI for Checkpoint { + fn run(self) -> Result<()> { + // Holding the lock while invoking the metrics CLI is preferable to avoid + // disturbing another instance trying to do PID control. + with_checkpoint_restore_lock(|| { + let leave_running = self.leave_running; + with_metrics("checkpoint", + || do_checkpoint(self), + |stats| json!({"stats": stats}))?; + + // We kill the app after the metrics are emitted. Killing the app + // risk terminating the container, preventing metrics from being emitted. + if !leave_running { + debug!("Killing application"); + killpg(Pid::from_raw(APP_ROOT_PID), signal::SIGKILL) + .context("Failed to kill application")?; + } + + Ok(()) + }) + } +} diff --git a/src/cli/extract.rs b/src/cli/extract.rs new file mode 100644 index 0000000..b297055 --- /dev/null +++ b/src/cli/extract.rs @@ -0,0 +1,120 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use std::path::PathBuf; +use url::Url; +use structopt::StructOpt; +use serde::Serialize; +use crate::{ + consts::*, + store, + image::{ManifestFetchResult, ImageManifest, shard}, + process::{Command, ProcessExt, ProcessGroup, Stdio}, + image_streamer::ImageStreamer, +}; + +/// Extract a FastFreeze image to local disk +#[derive(StructOpt, PartialEq, Debug, Serialize)] +#[structopt(after_help("\ +ENVS: + S3_CMD Command to access AWS S3. Defaults to 'aws s3' + GS_CMD Command to access Google Storage. Defaults to 'gsutil'" +))] +pub struct Extract { + /// Image URL, which can also be a regular local path + #[structopt(short, long)] + image_url: String, + + /// Output directory where to extract the image. + /// Defaults to the last path component of image-url. + #[structopt(short, long)] + output_dir: Option, + + /// Allow restoring of images that don't match the version we expect. + #[structopt(long)] + allow_bad_image_version: bool, + + /// Verbosity. 
Can be repeated + #[structopt(short, long, parse(from_occurrences))] + pub verbose: u8, +} + +pub fn extract_image( + shard_download_cmds: Vec, + output_dir: PathBuf, +) -> Result<()> { + let num_shards = shard_download_cmds.len(); + + info!("Extracting image from {} shards", num_shards); + + let mut pgrp = ProcessGroup::new()?; + let mut img_streamer = ImageStreamer::spawn_extract(num_shards, &output_dir)?; + img_streamer.process.join(&mut pgrp); + + for (download_cmd, shard_pipe) in shard_download_cmds.into_iter().zip(img_streamer.shard_pipes) { + Command::new_shell(&download_cmd) + .stdout(Stdio::from(shard_pipe)) + .spawn()? + .join(&mut pgrp); + } + + pgrp.wait_for_success()?; + + let stats = img_streamer.progress.wait_for_stats()?; + stats.show(); + + info!("Image extracted to {}. Took {:.1}s", + output_dir.display(), START_TIME.elapsed().as_secs_f64()); + + Ok(()) +} + +impl super::CLI for Extract { + fn run(self) -> Result<()> { + let Self { image_url, output_dir, allow_bad_image_version, verbose: _ } = self; + + let output_dir = match output_dir { + Some(output_dir) => output_dir, + None => { + Url::parse(&image_url)?.path_segments() + .and_then(|paths| paths.last()) + .map(PathBuf::from) + .ok_or_else(|| anyhow!("Supply an output_dir"))? + } + }; + + let store = store::from_url(&image_url)?; + store.prepare(false)?; + + info!("Fetching image manifest for {}", image_url); + + match ImageManifest::fetch_from_store(&*store, allow_bad_image_version)? { + ManifestFetchResult::Some(img_manifest) => { + debug!("Image manifest found: {:?}", img_manifest); + extract_image(shard::download_cmds(&img_manifest, &*store), output_dir)?; + } + ManifestFetchResult::VersionMismatch { fetched, desired } => { + bail!("Image manifest found, but has version {} while the expected version is {}. 
\ + You may try again with --allow-bad-image-version", + fetched, desired); + } + ManifestFetchResult::NotFound => { + bail!("Image manifest not found, running app normally"); + } + } + + Ok(()) + } +} diff --git a/src/cli/install.rs b/src/cli/install.rs new file mode 100644 index 0000000..35060d0 --- /dev/null +++ b/src/cli/install.rs @@ -0,0 +1,75 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use structopt::StructOpt; +use serde::Serialize; +use crate::{ + consts::*, + util::{create_dir_all, find_lib, atomic_symlink, copy_file}, +}; + +use std::{ + fs::{self, Permissions}, + os::unix::fs::PermissionsExt, +}; + +/// Install FastFreeze, mostly to setup virtualization +#[derive(StructOpt, PartialEq, Debug, Serialize)] +pub struct Install { + /// Verbosity. Can be repeated + #[structopt(short, long, parse(from_occurrences))] + pub verbose: u8, +} + +impl super::CLI for Install { + fn run(self) -> Result<()> { + if let Err(_) = create_dir_all(&*FF_DIR) { + bail!("{} should be volume mounted. See the kubernetes yaml example. 
\ + It is used for interposing the system ELF loader", FF_DIR.display()); + } + create_dir_all(&*NO_PRESERVE_FF_DIR)?; + + // We give /tmp-like permissions to allow other users to write to the directory + fs::set_permissions(&*FF_DIR, Permissions::from_mode(0o1777))?; + fs::set_permissions(&*NO_PRESERVE_FF_DIR, Permissions::from_mode(0o1777))?; + + let system_ld_real_path = LD_SYSTEM_PATH.read_link() + .with_context(|| format!("Failed to read link {}", LD_SYSTEM_PATH.display()))?; + + if system_ld_real_path.to_string_lossy().contains("virtcpuid") { + warn!("Installation is already done, skipping"); + return Ok(()); + } + + // copy /lib/ld-linux.so to /var/fastfreeze/run/ld-linux.so + copy_file(system_ld_real_path, &*LD_SYSTEM_ORIG_PATH)?; + + // copy our virtualization libraries to /var/fastfreeze/run/ + for path in &[&*LD_VIRTCPUID_PATH, &*LIBVIRTCPUID_PATH, &*LIBVIRTTIME_PATH] { + copy_file(find_lib(path.file_name().unwrap())?, path)?; + } + + // symlink /var/fastfreeze/run/ld-virtcpuid.so to /lib/ld-linux.so + if let Err(_) = atomic_symlink(&*LD_VIRTCPUID_PATH, &*LD_SYSTEM_PATH) { + warn!("Installation is complete, but a kubernetes volume mount is \ + needed to interpose the system ELF loader {}. \ + See the kubernetes yaml example for details on how to do so", LD_SYSTEM_PATH.display()); + } else { + info!("Installation is complete"); + } + + Ok(()) + } +} diff --git a/src/cli/main.rs b/src/cli/main.rs new file mode 100644 index 0000000..a23d83e --- /dev/null +++ b/src/cli/main.rs @@ -0,0 +1,110 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use structopt::{StructOpt, clap::AppSettings}; +use serde::Serialize; +use crate::logger; +use super::{ + CLI, + checkpoint::Checkpoint, + extract::Extract, + install::Install, + run::Run, + wait::Wait, +}; + +#[derive(StructOpt, PartialEq, Debug, Serialize)] +#[structopt( + // When showing --help, we want to keep the order of arguments as we defined, + // as opposed to the default alphabetical order. + global_setting(AppSettings::DeriveDisplayOrder), + // help subcommand is not useful, disable it. + global_setting(AppSettings::DisableHelpSubcommand), + // subcommand version is not useful, disable it. + global_setting(AppSettings::VersionlessSubcommands), +)] +pub struct Opts { + #[structopt(subcommand)] + command: Command, +} + +#[derive(StructOpt, PartialEq, Debug, Serialize)] +enum Command { + Run(Run), + Checkpoint(Checkpoint), + Extract(Extract), + Wait(Wait), + Install(Install), +} + +impl Opts { + // It looks a bit silly not to have a global verbose option flag, but if we + // use a global flag, then the user _must_ pass --verbose before the + // subcommand, which is even more silly. + // clap should be better + fn verbosity(&self) -> u8 { + match self.command { + Command::Install(Install { verbose, .. }) | + Command::Run(Run { verbose, .. }) | + Command::Checkpoint(Checkpoint { verbose, .. }) | + Command::Extract(Extract { verbose, .. }) | + Command::Wait(Wait { verbose, .. 
}) => verbose, + } + } + + fn log_level(&self) -> logger::LevelFilter { + match self.verbosity() { + 0 => logger::LevelFilter::Info, + 1 => logger::LevelFilter::Debug, + _ => logger::LevelFilter::Trace, + } + } + + fn log_prefix(&self) -> &'static str { + match self.command { + Command::Install(_) => "install", + Command::Run(_) => "run", + Command::Checkpoint(_) => "checkpoint", + Command::Extract(_) => "extract", + Command::Wait(_) => "wait", + } + } + + fn use_log_file(&self) -> bool { + // Persisting a log file is helpful to carry the history of the + // application in the checkpointed image. + match self.command { + Command::Run(_) | + Command::Checkpoint(_) => true, + _ => false, + } + } + + pub fn init_logger(&self) { + logger::init(self.log_level(), self.log_prefix(), self.use_log_file()); + } +} + +impl CLI for Opts { + fn run(self) -> Result<()> { + match self.command { + Command::Install(opts) => opts.run(), + Command::Run(opts) => opts.run(), + Command::Checkpoint(opts) => opts.run(), + Command::Extract(opts) => opts.run(), + Command::Wait(opts) => opts.run(), + } + } +} diff --git a/src/cli/mod.rs b/src/cli/mod.rs new file mode 100644 index 0000000..0c684bb --- /dev/null +++ b/src/cli/mod.rs @@ -0,0 +1,40 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod run; +mod checkpoint; +mod extract; +mod wait; +mod install; +mod main; + +pub trait CLI { + fn run(self) -> anyhow::Result<()>; +} + +#[derive(Debug)] +pub struct ExitCode(pub u8); +impl std::fmt::Display for ExitCode { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Exiting with exit_code={}", self.0) + } +} + +impl ExitCode { + pub fn from_error(e: &anyhow::Error) -> Option { + e.downcast_ref::().map(|exit_code| exit_code.0) + } +} + +pub use main::Opts; diff --git a/src/cli/run.rs b/src/cli/run.rs new file mode 100644 index 0000000..c806398 --- /dev/null +++ b/src/cli/run.rs @@ -0,0 +1,427 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, Context}; +use std::{ + time::Duration, + ffi::OsString, + path::PathBuf, + fs, collections::HashSet +}; +use nix::{ + sys::signal::{self, kill, killpg, SigmaskHow, SigSet}, + sys::wait::{wait, WaitStatus}, + unistd::Pid, +}; +use structopt::StructOpt; +use serde::{Serialize, Deserialize}; +use signal::{pthread_sigmask, Signal}; +use crate::{ + consts::*, + store, + virt, + cli::ExitCode, + image::{ManifestFetchResult, ImageManifest, shard}, + process::{Command, CommandPidExt, ProcessExt, ProcessGroup, Stdio, + spawn_set_ns_last_pid_server, set_ns_last_pid, MIN_PID}, + metrics::with_metrics, + filesystem::spawn_untar, + image_streamer::{Stats, ImageStreamer}, + lock::with_checkpoint_restore_lock, + criu, +}; +use libc::c_int; +use virt::time::Nanos; + + +/// Run application. If a checkpoint image exists, the application is restored. Otherwise, the +/// application is run from scratch. +#[derive(StructOpt, PartialEq, Debug, Serialize)] +#[structopt(after_help("\ +ENVS: + FF_APP_PATH The PATH to use for the application + FF_APP_LD_LIBRARY_PATH The LD_LIBRARY_PATH to use for the application + FF_APP_VIRT_CPUID_MASK The CPUID mask to use. See libvirtcpuid documentation for more details + FF_APP_INJECT_ Additional environment variables to inject to the application and its children. + For example, FF_APP_INJECT_LD_PRELOAD=/opt/lib/libx.so + FF_METRICS_RECORDER When specified, FastFreeze invokes the specified program to report metrics. + The metrics are formatted in JSON and passed as first argument + CRIU_OPTS Additional arguments to pass to CRIU, whitespace separated + S3_CMD Command to access AWS S3. Defaults to 'aws s3' + GS_CMD Command to access Google Storage. Defaults to 'gsutil' + +EXIT CODES: + 171 A failure happened during restore, or while fetching the image manifest. 
+ Retrying with --no-restore will avoid that failure + 170 A failure happened before the application was ready + 128+sig_nr The application caught a fatal signal corresponding to `sig_nr` + exit_code The application exited with `exit_code`" +))] +pub struct Run { + /// Image URL. S3, GCS and local filesystem are supported: {n} + /// * s3://bucket_name/image_path {n} + /// * gs://bucket_name/image_path {n} + /// * file:image_path + // {n} means new line in the CLI's --help command + #[structopt(long, name="url")] + image_url: String, + + /// Application arguments, used when running the app from scratch. + /// Ignored during restore. + // Note: Type should be OsString, but structopt doesn't like it + #[structopt()] + app_args: Vec, + + /// Shell command to run once the application is running. + // Note: Type should be OsString, but structopt doesn't like it + #[structopt(long="on-app-ready", name="cmd")] + on_app_ready_cmd: Option, + + /// Always run the app from scratch. Useful to ignore a faulty image. + #[structopt(long)] + no_restore: bool, + + /// Allow restoring of images that don't match the version we expect. + #[structopt(long)] + allow_bad_image_version: bool, + + /// Dir/file to include in the checkpoint image. + /// May be specified multiple times. Multiple paths can also be specified colon separated. + // require_delimiter is set to avoid clap's non-standard way of accepting lists. + #[structopt(long="preserve-path", name="path", require_delimiter=true, value_delimiter=":")] + preserved_paths: Vec, + + /// Leave application stopped after restore, useful for debugging. + /// Has no effect when running the app from scratch. + #[structopt(long)] + leave_stopped: bool, + + /// Verbosity. Can be repeated + #[structopt(short, long, parse(from_occurrences))] + pub verbose: u8, + + /// Used for testing, not for normal use. 
+ /// App monitoring is skipped: FastFreeze exits as soon as the app is running + // Maybe we could explore this feature at some point instead of having the + // start hook. It might be tricky to figure out who should be the parent of + // app during restore. We could explore CLONE_PARENT. But we would need to do similar + // tricks to what CRIU does to monitor the process, which is to use ptrace. + #[structopt(long, hidden=true)] + detach: bool, +} + + +/// `AppConfig` is created during the run command, and updated during checkpoint. +/// These settings are saved under `APP_CONFIG_PATH`. +/// It's useful for the checkpoint command to know the image_url and preserved_paths. +/// During restore, it is useful to read the app_clock. + +#[derive(Serialize, Deserialize)] +pub struct AppConfig { + pub image_url: String, + pub preserved_paths: HashSet, + pub app_clock: Nanos, +} + +impl AppConfig { + pub fn save(&self) -> Result<()> { + serde_json::to_writer_pretty(fs::File::create(&*APP_CONFIG_PATH)?, &self)?; + Ok(()) + } + + pub fn restore() -> Result { + let file = fs::File::open(&*APP_CONFIG_PATH) + .with_context(|| format!("Failed to open {}. \ + It is created during the run command", APP_CONFIG_PATH.display()))?; + Ok(serde_json::from_reader(file)?) + } +} + + +fn restore( + image_url: String, + preserved_paths: HashSet, + shard_download_cmds: Vec, + leave_stopped: bool, +) -> Result { + info!("Restoring application{}", if leave_stopped { " (leave stopped)" } else { "" }); + let mut pgrp = ProcessGroup::new()?; + + let mut img_streamer = ImageStreamer::spawn_serve(shard_download_cmds.len())?; + img_streamer.process.join(&mut pgrp); + + // Spawn the download processes connected to the image streamer's input + for (download_cmd, shard_pipe) in shard_download_cmds.into_iter().zip(img_streamer.shard_pipes) { + Command::new_shell(&download_cmd) + .stdout(Stdio::from(shard_pipe)) + .spawn()? 
+ .join(&mut pgrp); + } + + debug!("Restoring filesystem"); + spawn_untar(img_streamer.tar_fs_pipe.unwrap())? + .wait_for_success()?; + debug!("Filesystem restored"); + + // The file system is back, including the application configuration containing user-defined + // preserved-paths, and application time offset. + // We load the app config, add the new preserved_paths, and save it. It will be useful for the + // subsequent checkpoint. + let mut config = AppConfig::restore()?; + config.image_url = image_url; + config.preserved_paths.extend(preserved_paths); + config.save()?; + + // Adjust the libtimevirt offsets + debug!("Application clock: {:.1}s", + Duration::from_nanos(config.app_clock as u64).as_secs_f64()); + virt::time::ConfigPath::default().adjust_timespecs(config.app_clock)?; + + // We start the ns_last_pid daemon here. Note that we join_as_daemon() instead of join(), + // this is so we don't wait for it in wait_for_success(). + debug!("Starting set_ns_last_pid server"); + spawn_set_ns_last_pid_server()? + .join_as_daemon(&mut pgrp); + + debug!("Continuing reading image in memory..."); + + let stats = img_streamer.progress.wait_for_stats()?; + stats.show(); + + // Wait for the imager to be ready. + img_streamer.progress.wait_for_socket_init()?; + + // Restore processes. We become the parent of the application as CRIU + // is configured to use CLONE_PARENT. + // If we fail, we kill whatever is left of the application. + debug!("Restoring processes"); + criu::spawn_restore(leave_stopped)? + .join(&mut pgrp); + + // Wait for all our all our monitored processes to finish. + // If there's an issue, kill the app if it's still laying around. + // We might want to check that we are the parent of the process with pid APP_ROOT_PID, + // otherwise, we might be killing an innocent process. But that would be racy anyways. 
+ if let Err(e) = pgrp.wait_for_success() { + let _ = killpg(Pid::from_raw(APP_ROOT_PID), signal::SIGKILL); + return Err(e); + } + + info!("Application is ready, restore took {:.1}s", START_TIME.elapsed().as_secs_f64()); + + Ok(stats) +} + +/// `monitor_app()` assumes the init role. We do the following: +/// 1) We proxy signals we receive to our child pid=APP_ROOT_PID. +/// 2) We reap processes that get reparented to us. +/// 3) When APP_ROOT_PID dies, we return an error that contains the appropriate exit_code. +/// (even when the application exited with 0. It makes the code simpler). +fn monitor_app() -> Result<()> { + for sig in Signal::iterator() { + // We don't forward SIGCHLD, and neither `FORBIDDEN` signals (e.g., + // SIGSTOP, SIGFPE, SIGKILL, ...) + if sig == Signal::SIGCHLD || signal_hook::FORBIDDEN.contains(&(sig as c_int)) { + continue; + } + + // Forward signal to our child. + // The `register` function is unsafe because one could call malloc(), + // and deadlock the program. Here we call kill() which is safe. + unsafe { + signal_hook::register(sig as c_int, move || { + let _ = kill(Pid::from_raw(APP_ROOT_PID), sig); + })?; + } + } + pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&SigSet::all()), None)?; + + // Helper function used in the loop + fn child_exited anyhow::Error>(pid: Pid, app_exited_f: F) -> Result<()> { + if pid.as_raw() == APP_ROOT_PID { + // kill remaining orphans: They belong to the process group that we + // made with setsid() in run_from_scratch(). + // TODO Check if that's actually necessary. + let _ = killpg(pid, signal::SIGKILL); + Err(app_exited_f()) + } else { + Ok(()) + } + } + + loop { + match wait()? 
{ + WaitStatus::Exited(pid, exit_status) => + child_exited(pid, || { + anyhow!("Application exited with exit_code={}", exit_status) + .context(ExitCode(exit_status as u8)) + })?, + WaitStatus::Signaled(pid, signal, _core_dumped) => + child_exited(pid, || { + anyhow!("Application caught fatal signal {}", signal) + .context(ExitCode(128 + signal as u8)) + })?, + _ => {}, + }; + } +} + +fn run_from_scratch( + image_url: String, + preserved_paths: HashSet, + app_cmd: Vec, +) -> Result<()> +{ + let config = AppConfig { + image_url, + preserved_paths, + app_clock: 0, + }; + config.save()?; + + virt::time::ConfigPath::default().write_intial()?; + virt::enable_system_wide_virtualization()?; + + let mut cmd = Command::new(app_cmd); + if let Some(path) = std::env::var_os("FF_APP_PATH") { + cmd.env_remove("FF_APP_PATH") + .env("PATH", path); + } + if let Some(library_path) = std::env::var_os("FF_APP_LD_LIBRARY_PATH") { + cmd.env_remove("FF_APP_LD_LIBRARY_PATH") + .env("LD_LIBRARY_PATH", library_path); + } + cmd.setsid(); + cmd.spawn_with_pid(APP_ROOT_PID)?; + + info!("Application is ready, started from scratch"); + + Ok(()) +} + +pub enum RunMode { + Restore { shard_download_cmds: Vec }, + FromScratch, +} + +pub fn determine_run_mode(image_url: &str, allow_bad_image_version: bool) -> Result { + let store = store::from_url(&image_url)?; + + info!("Fetching image manifest for {}", image_url); + + let fetch_result = with_metrics("fetch_manifest", + || ImageManifest::fetch_from_store(&*store, allow_bad_image_version), + |fetch_result| match fetch_result { + ManifestFetchResult::Some(_) => json!({"manifest": "good", "run_mode": "restore"}), + ManifestFetchResult::VersionMismatch {..} => json!({"manifest": "version_mismatch", "run_mode": "run_from_scratch"}), + ManifestFetchResult::NotFound => json!({"manifest": "not_found", "run_mode": "run_from_scratch"}), + } + )?; + + Ok(match fetch_result { + ManifestFetchResult::Some(img_manifest) => { + debug!("Image manifest found: {:?}", 
img_manifest); + let shard_download_cmds = shard::download_cmds(&img_manifest, &*store); + RunMode::Restore { shard_download_cmds } + } + ManifestFetchResult::VersionMismatch { fetched, desired } => { + info!("Image manifest found, but has version {} while the expected version is {}. \ + You may try again with --allow-bad-image-version. \ + Running application from scratch", fetched, desired); + RunMode::FromScratch + } + ManifestFetchResult::NotFound => { + info!("Image manifest not found, running application from scratch"); + RunMode::FromScratch + } + }) +} + +fn ensure_non_conflicting_pid() -> Result<()> { + // We don't want to use a PID that could be potentially used by the + // application when being restored. + if std::process::id() > APP_ROOT_PID as u32 { + // We should be pid=1 in a container, so this code block only applies when running + // outside of a container. + set_ns_last_pid(MIN_PID)?; + bail!("Current pid is too high. Re-run the same command again."); + } + + Ok(()) +} + +impl super::CLI for Run { + fn run(self) -> Result<()> { + let Self { + image_url, app_args, on_app_ready_cmd, no_restore, + allow_bad_image_version, preserved_paths, leave_stopped, verbose: _, + detach } = self; + + let preserved_paths = preserved_paths.into_iter().collect(); + + // Holding the lock while invoking any process (e.g., `spawn_smoke_check`) is + // preferrable to avoid disturbing another instance of FastFreeze trying + // to do PID control. + with_checkpoint_restore_lock(|| { + criu::spawn_smoke_check()? + .wait_for_success()?; + + ensure_non_conflicting_pid()?; + + // We prepare the store for writes to speed up checkpointing. Notice that + // we also prepare the store during restore, because we want to make sure + // we can checkpoint after a restore. 
+ trace!("Preparing image store"); + store::from_url(&image_url)?.prepare(true)?; + + let run_mode = if no_restore { + info!("Running app from scratch as specified with --no-restore"); + RunMode::FromScratch + } else { + determine_run_mode(&image_url, allow_bad_image_version) + .context(ExitCode(EXIT_CODE_RESTORE_FAILURE))? + }; + + match run_mode { + RunMode::Restore { shard_download_cmds } => { + with_metrics("restore", || + restore(image_url, preserved_paths, shard_download_cmds, leave_stopped) + .context(ExitCode(EXIT_CODE_RESTORE_FAILURE)), + |stats| json!({"stats": stats}))?; + } + RunMode::FromScratch => { + let app_args = app_args.into_iter().map(|s| s.into()).collect(); + with_metrics("run_from_scratch", || + run_from_scratch(image_url, preserved_paths, app_args), + |_| json!({}))?; + } + } + + Ok(()) + })?; + + if let Some(on_app_ready_cmd) = on_app_ready_cmd { + // Fire and forget. + Command::new_shell(&on_app_ready_cmd) + .spawn()?; + } + + // detach is only used for integration tests + if !detach { + monitor_app()?; + } + + Ok(()) + } +} diff --git a/src/cli/wait.rs b/src/cli/wait.rs new file mode 100644 index 0000000..95921e4 --- /dev/null +++ b/src/cli/wait.rs @@ -0,0 +1,41 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::Result; +use std::time::{Instant, Duration}; +use structopt::StructOpt; +use serde::Serialize; +use crate::lock::checkpoint_restore_lock; + + +/// Wait for checkpoint or restore to finish +#[derive(StructOpt, PartialEq, Debug, Serialize)] +pub struct Wait { + /// Fail after some specified number of seconds. Decimals are allowed + #[structopt(short, long)] + timeout: Option, + + /// Verbosity. Can be repeated + #[structopt(short, long, parse(from_occurrences))] + pub verbose: u8, +} + +impl super::CLI for Wait { + fn run(self) -> Result<()> { + let Self { timeout, verbose: _ } = self; + let timeout = timeout.map(|t| Instant::now() + Duration::from_secs_f64(t)); + let _lock_guard = checkpoint_restore_lock(timeout, false)?; + Ok(()) + } +} diff --git a/src/consts.rs b/src/consts.rs new file mode 100644 index 0000000..53ce490 --- /dev/null +++ b/src/consts.rs @@ -0,0 +1,94 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + time::Instant, + path::PathBuf, +}; +use crate::util::gen_random_alphanum_string; + +// This file gathers all fastfreeze hard-coded settings + +/// The image version must be bumped when libvirttime or libvirtcpuid change, +/// or when the `ImageManifest` format changes. +pub const CURRENT_IMG_VERSION: &str = "2020-08-14"; + +// We compute the paths at runtime. It improves readability compared to using +// macros at compile time. +lazy_static! 
{ + // We pick /var/fastfreeze for our directory and not /tmp because we place the + // original elf loader there (see libvirtcpuid). So it has to be there after a reboot. + pub static ref FF_DIR: PathBuf = PathBuf::from("/var/fastfreeze"); + pub static ref NO_PRESERVE_FF_DIR: PathBuf = FF_DIR.join("run"); + + pub static ref APP_CONFIG_PATH: PathBuf = FF_DIR.join("app-config.json"); + pub static ref FF_LOG_DIR: PathBuf = FF_DIR.join("logs"); + + // XXX When changing any of the `LD` paths, libvirtcpuid must be recompiled + // See variables set in dist/Makefile + pub static ref LD_SYSTEM_PATH: PathBuf = PathBuf::from("/lib64/ld-linux-x86-64.so.2"); + pub static ref LD_SYSTEM_ORIG_PATH: PathBuf = NO_PRESERVE_FF_DIR.join( + LD_SYSTEM_PATH.file_name().unwrap()); + // This path is not necessarily ideal for root users as apparmor needs to be configured to + // whitelist this path. But for non-root users, it's best for doing kubernetes mounting. + pub static ref LD_VIRTCPUID_PATH: PathBuf = NO_PRESERVE_FF_DIR.join("ld-virtcpuid.so"); + pub static ref LIBVIRTCPUID_PATH: PathBuf = NO_PRESERVE_FF_DIR.join("libvirtcpuid.so"); + pub static ref LIBVIRTTIME_PATH: PathBuf = NO_PRESERVE_FF_DIR.join("libvirttime.so"); + + pub static ref LD_INJECT_ENV_PATH: PathBuf = FF_DIR.join("ld-inject.env"); + pub static ref VIRT_TIME_CONF_PATH: PathBuf = FF_DIR.join("virttime-conf.bin"); + + pub static ref CRIU_SOCKET_DIR: PathBuf = NO_PRESERVE_FF_DIR.clone(); + // XXX When changing this socket path, CRIU must be changed and recompiled. + pub static ref NS_LAST_PID_SOCK_PATH: PathBuf = NO_PRESERVE_FF_DIR.join("set_ns_last_pid.sock"); + pub static ref LOCK_FILE_PATH: PathBuf = NO_PRESERVE_FF_DIR.join("lock"); +} + +/// Arbitrary application PID. 
Has to be bigger than 300 due to the way we do PID control +pub const APP_ROOT_PID: i32 = 1000; + +/// When storing images, we use this filename to store our manifest +pub const MANIFEST_FILE_NAME: &str = "manifest.json"; + +/// Number of seconds to wait for processes to respond to a SIGTERM before sending a SIGKILL +pub const KILL_GRACE_PERIOD_SECS: u64 = 3; + +/// Exit code we return when encountering a fatal error. +/// We use 170 to distinguish from the application error codes. +pub const EXIT_CODE_FAILURE: u8 = 170; +/// Exit code to denote an error during restore. Meaning that passing --no-restore would help +/// running the application. +pub const EXIT_CODE_RESTORE_FAILURE: u8 = 171; + +lazy_static! { + /// The invocation ID is a random 6 digit alphanum string. It is is used in a few places: + /// 1) The shard prefix name + /// 2) The log file name + /// 3) Emitting metrics + pub static ref INVOCATION_ID: String = gen_random_alphanum_string(6); +} + +/// Where libraries like libvirttime.so and libvirtcpuid.so are searched +/// in addition to LD_LIBRARY_PATH. +pub const LIB_SEARCH_PATHS: &[&str] = &["/lib64", "/usr/lib", "/usr/local/lib"]; + +pub const KB: usize = 1024; +pub const MB: usize = 1024*1024; +pub const GB: usize = 1024*1024*1024; + +pub const PAGE_SIZE: usize = 4*KB; + +lazy_static! { + pub static ref START_TIME: Instant = Instant::now(); +} diff --git a/src/criu.rs b/src/criu.rs new file mode 100644 index 0000000..5a3e1ea --- /dev/null +++ b/src/criu.rs @@ -0,0 +1,86 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use crate::{ + consts::*, + process::{Command, Process}, +}; + +// CRIU is running under our CPUID virtualization. +// The CPUID that it detects is virtualized. + +pub fn spawn_dump() -> Result { + let mut cmd = Command::new(&[ + "criu", "dump", + "--tree", &APP_ROOT_PID.to_string(), + "--leave-stopped", // Leave app stopped: we resume app once the filesystem is tarred. + // The rest are some networking options. In a nutshell, we want all + // external connections to be closed on restore. + "--empty-ns", "net", "--tcp-established", "--skip-in-flight", "--tcp-close", "--ext-unix-sk" + ]); + + add_common_criu_opts(&mut cmd)?; + + cmd.spawn() +} + +pub fn spawn_restore(leave_stopped: bool) -> Result { + let mut cmd = Command::new(&[ + "criu", "restore", + "--restore-sibling", "--restore-detached", // Become parent of the app (CLONE_PARENT) + // The rest are some networking options. In a nutshell, we want all + // external connections to be closed on restore. + "--tcp-close", "--ext-unix-sk", + ]); + + if leave_stopped { + cmd.arg("--leave-stopped"); + } + + add_common_criu_opts(&mut cmd)?; + + cmd.spawn() +} + +fn add_common_criu_opts(cmd: &mut Command) -> Result<()> { + cmd.arg("--images-dir").arg(&*CRIU_SOCKET_DIR); + cmd.args(&[ + "--cpu-cap", // Save and check CPUID information in the image + "--shell-job", // Support attached TTYs + "--file-locks", // Support file locks + // CRIU has an experimental feature for checking file integrity. + // It can read the build-id in ELF headers during dump, and compare it during restore. 
+ // Currently, it emits warnings during dump. So we'll skip it for now. + "--file-validation", "filesize", + "--stream", // Use criu-image-streamer + ]); + + if log_enabled!(log::Level::Trace) { + cmd.arg("-v"); // verbose + cmd.arg("--display-stats"); + } + + let extra_opts = std::env::var_os("CRIU_OPTS").unwrap_or_default(); + cmd.args(extra_opts.to_str() + .ok_or_else(|| anyhow!("CRIU_OPTS is UTF8 malformed"))? + .split_whitespace()); + + Ok(()) +} + +pub fn spawn_smoke_check() -> Result { + Command::new(&["criu", "check"]) + .spawn() +} diff --git a/src/filesystem.rs b/src/filesystem.rs new file mode 100644 index 0000000..d10f7e4 --- /dev/null +++ b/src/filesystem.rs @@ -0,0 +1,61 @@ + +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::Result; +use std::{ + path::PathBuf, + collections::HashSet, + fs, +}; +use crate::{ + consts::*, + process::{Command, Process, Stdio}, +}; + +pub fn spawn_tar(preserved_paths: HashSet, stdout: fs::File) -> Result { + let mut cmd = Command::new(&["tar"]); + if log_enabled!(log::Level::Trace) { + cmd.arg("--verbose"); + } + cmd.args(&[ + "--directory", "/", + "--create", + "--preserve-permissions", + "--ignore-failed-read", // Allows us to discard EPERM errors of files in /tmp + "--sparse", // Support sparse files efficiently, libvirttime uses one + "--file", "-", + ]) + .arg("--exclude").arg(&*NO_PRESERVE_FF_DIR) + .args(&preserved_paths) + .arg(&*FF_DIR) + .stdout(Stdio::from(stdout)) + .spawn() +} + +pub fn spawn_untar(stdin: fs::File) -> Result { + let mut cmd = Command::new(&["tar"]); + if log_enabled!(log::Level::Trace) { + cmd.arg("--verbose"); + } + cmd.args(&[ + "--directory", "/", + "--extract", + "--preserve-permissions", + "--no-overwrite-dir", + "--file", "-", + ]) + .stdin(Stdio::from(stdin)) + .spawn() +} diff --git a/src/image/compressor.rs b/src/image/compressor.rs new file mode 100644 index 0000000..359239a --- /dev/null +++ b/src/image/compressor.rs @@ -0,0 +1,72 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Serialize, Deserialize}; +use std::str::FromStr; + +#[derive(Debug, Serialize, Deserialize)] +pub enum Compressor { + None, + Lz4, + Zstd, +} + +impl Compressor { + pub fn compress_cmd(&self) -> Option<&str> { + match self { + Compressor::None => None, + Compressor::Lz4 => Some("lz4 -1 - -"), + Compressor::Zstd => Some("zstd -1 - -"), + } + } + + pub fn decompress_cmd(&self) -> Option<&str> { + match self { + Compressor::None => None, + Compressor::Lz4 => Some("lz4 -d - -"), + Compressor::Zstd => Some("zstd -d - -"), + } + } +} + +impl From for Compressor { + fn from(cpu_budget: CpuBudget) -> Self { + match cpu_budget { + CpuBudget::Low => Compressor::None, + CpuBudget::Medium => Compressor::Lz4, + CpuBudget::High => Compressor::Zstd, + } + } +} + + +#[derive(Debug, PartialEq, Copy, Clone, Serialize)] +pub enum CpuBudget { + Low, + Medium, + High, +} + +impl FromStr for CpuBudget { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::result::Result { + Ok(match s { + "low" => CpuBudget::Low, + "medium" => CpuBudget::Medium, + "high" => CpuBudget::High, + _ => bail!("Possible values are [low, medium, high], not `{}`", s) + }) + } +} diff --git a/src/image/manifest.rs b/src/image/manifest.rs new file mode 100644 index 0000000..99fa849 --- /dev/null +++ b/src/image/manifest.rs @@ -0,0 +1,87 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, Context}; +use serde::{Serialize, Deserialize}; +use crate::{ + consts::*, + store::{Store, FileExt}, +}; +use super::Compressor; + +// The image manifest is what describes how to consume an image. +// It holds version, shard location, and compression used. + +pub enum ManifestFetchResult { + Some(ImageManifest), + VersionMismatch { fetched: String, desired: String }, + NotFound, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ImageManifest { + pub version: String, + pub num_shards: u32, + pub compressor: Compressor, + pub shard_prefix: String, +} + +impl ImageManifest { + /// Make a new image manifest. The shard_prefix is INVOCATION_ID which is picked at random. + /// This can make it easier to tie metrics and log files to a specific checkpoint command. + pub fn new(num_shards: u32, compressor: Compressor) -> Self { + Self { + version: String::from(CURRENT_IMG_VERSION), + shard_prefix: INVOCATION_ID.clone(), + compressor, + num_shards, + } + } + + pub fn to_json(&self) -> String { + // unwrap() is safe. The JSON serialization can't fail. + serde_json::to_string(self).unwrap() + } + + pub fn from_json(manifest_json: &str, allow_bad_image_version: bool) -> Result { + use ManifestFetchResult::*; + + // We first parse the JSON uninterpreted to check for the version. + // If we have a match, we proceed to destructuring the JSON into our ImageDescriptor. 
+ let manifest: serde_json::Value = serde_json::from_str(manifest_json) + .with_context(|| format!("Malformed json: {}", manifest_json))?; + + Ok(if manifest["version"] == CURRENT_IMG_VERSION || allow_bad_image_version { + let manifest = serde_json::from_value(manifest) + .with_context(|| format!("Failed to parse image descriptor: {}", manifest_json))?; + Some(manifest) + } else { + VersionMismatch { + fetched: manifest["version"].to_string(), + desired: CURRENT_IMG_VERSION.to_string(), + } + }) + } + + pub fn persist_to_store(&self, store: &dyn Store) -> Result<()> { + store.file(MANIFEST_FILE_NAME).write(&self.to_json().as_bytes()) + } + + pub fn fetch_from_store(store: &dyn Store, allow_bad_image_version: bool) -> Result { + Ok(match store.file(MANIFEST_FILE_NAME).try_read()? { + Some(manifest_json) => Self::from_json(&String::from_utf8_lossy(&manifest_json), allow_bad_image_version)?, + None => ManifestFetchResult::NotFound, + }) + } +} diff --git a/src/image/mod.rs b/src/image/mod.rs new file mode 100644 index 0000000..b39a0b4 --- /dev/null +++ b/src/image/mod.rs @@ -0,0 +1,20 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod compressor; +mod manifest; +pub mod shard; + +pub use manifest::{ManifestFetchResult, ImageManifest}; +pub use compressor::{Compressor, CpuBudget}; diff --git a/src/image/shard.rs b/src/image/shard.rs new file mode 100644 index 0000000..fdf2276 --- /dev/null +++ b/src/image/shard.rs @@ -0,0 +1,43 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::ImageManifest; +use crate::store::Store; + +fn shard_filename(shard_prefix: &str, shard_index: u32) -> String { + // .ffs stands for fastfreeze shard + format!("{}-{}.ffs", shard_prefix, shard_index+1) +} + +pub fn upload_cmds(img_desc: &ImageManifest, store: &dyn Store) -> Vec { + (0..img_desc.num_shards).map(|shard_index| { + let file = store.file(&shard_filename(&img_desc.shard_prefix, shard_index)); + + match img_desc.compressor.compress_cmd() { + Some(comp_cmd) => format!("{} | {}", comp_cmd, file.upload_shell_cmd()), + None => file.upload_shell_cmd(), + } + }).collect() +} + +pub fn download_cmds(img_desc: &ImageManifest, store: &dyn Store) -> Vec { + (0..img_desc.num_shards).map(|shard_index| { + let file = store.file(&shard_filename(&img_desc.shard_prefix, shard_index)); + + match img_desc.compressor.decompress_cmd() { + Some(decomp_cmd) => format!("{} | {}", file.download_shell_cmd(), decomp_cmd), + None => file.download_shell_cmd(), + } + }).collect() +} diff --git a/src/image_streamer.rs b/src/image_streamer.rs new file mode 100644 index 
0000000..d3f5e44 --- /dev/null +++ b/src/image_streamer.rs @@ -0,0 +1,221 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use std::{ + os::unix::io::{RawFd, AsRawFd}, + fs, io::BufReader, + io::BufRead, + io::Lines, path::Path, +}; +use serde::{Serialize, Deserialize}; +use crate::{ + consts::*, + util::Pipe, + process::{Command, Process, PipeCommandExt}, +}; + + +pub struct Progress { + pub fd: RawFd, + pub lines: Lines>, +} + +impl Progress { + fn get_next_progress_line(&mut self) -> Result { + Ok(self.lines.next() + .ok_or_else(|| anyhow!("EOF unexpectedly reached")) + .context("Failed to read progress from the streamer")??) + } + + pub fn wait_for_socket_init(&mut self) -> Result<()> { + ensure!(self.get_next_progress_line()? == "socket-init", + "criu-image-streamer failed to initialize"); + Ok(()) + } + + pub fn wait_for_checkpoint_start(&mut self) -> Result<()> { + ensure!(self.get_next_progress_line()? 
== "checkpoint-start", + "criu-image-streamer failed to send start message"); + Ok(()) + } + + pub fn wait_for_stats(&mut self) -> Result { + let stats_json = self.get_next_progress_line()?; + Ok(serde_json::from_str::(&stats_json)?.into()) + } +} + +pub struct ImageStreamer { + pub process: Process, + pub progress: Progress, + pub tar_fs_pipe: Option, + pub shard_pipes: Vec, +} + +impl ImageStreamer { + pub fn spawn_capture(num_shards: usize) -> Result { + let progress = Pipe::new_output()?; + let fs_tar = Pipe::new_input()?; + + let shards = (0..num_shards) + .map(|_| Pipe::new_output()) + .collect::>>()?; + + let mut cmd = Command::new(&[ + "criu-image-streamer", + "--progress-fd", &progress.write.as_raw_fd().to_string(), + "--ext-file-fds", &format!("fs.tar:{}", fs_tar.read.as_raw_fd()), + "--shard-fds", &shards.iter() + .map(|o| o.write.as_raw_fd().to_string()) + .collect::>().join(","), + ]); + cmd + .arg("--images-dir").arg(&*CRIU_SOCKET_DIR) + .arg("capture"); + + Ok(Self { + process: cmd.spawn()?, + progress: Progress { + fd: progress.read.as_raw_fd(), + lines: BufReader::new(progress.read).lines(), + }, + tar_fs_pipe: Some(fs_tar.write), + shard_pipes: shards.into_iter().map(|o| o.read).collect(), + }) + } + + pub fn spawn_serve(num_shards: usize) -> Result { + let progress = Pipe::new_output()?; + let fs_tar = Pipe::new_output()?; + + let shards = (0..num_shards) + .map(|_| Pipe::new_input()) + .collect::>>()?; + + let mut cmd = Command::new(&[ + "criu-image-streamer", + "--progress-fd", &progress.write.as_raw_fd().to_string(), + "--ext-file-fds", &format!("fs.tar:{}", fs_tar.write.as_raw_fd()), + "--shard-fds", &shards.iter() + .map(|o| o.read.as_raw_fd().to_string()) + .collect::>().join(","), + ]); + cmd + .arg("--images-dir").arg(&*CRIU_SOCKET_DIR) + .arg("serve"); + + Ok(Self { + process: cmd.spawn()?, + progress: Progress { + fd: progress.read.as_raw_fd(), + lines: BufReader::new(progress.read).lines(), + }, + tar_fs_pipe: Some(fs_tar.read), + 
shard_pipes: shards.into_iter().map(|o| o.write).collect(), + }) + } + + pub fn spawn_extract(num_shards: usize, output_dir: &Path) -> Result { + let progress = Pipe::new_output()?; + + let shards = (0..num_shards) + .map(|_| Pipe::new_input()) + .collect::>>()?; + + let mut cmd = Command::new(&[ + "criu-image-streamer", + "--progress-fd", &progress.write.as_raw_fd().to_string(), + "--shard-fds", &shards.iter() + .map(|o| o.read.as_raw_fd().to_string()) + .collect::>().join(","), + "--images-dir" + ]); + cmd.arg(output_dir) + .arg("extract"); + + Ok(Self { + process: cmd.spawn()?, + progress: Progress { + fd: progress.read.as_raw_fd(), + lines: BufReader::new(progress.read).lines(), + }, + tar_fs_pipe: None, + shard_pipes: shards.into_iter().map(|o| o.write).collect(), + }) + } +} + +#[derive(Serialize, Deserialize)] +pub struct ImageStreamerStats { + pub shards: Vec, +} +#[derive(Serialize, Deserialize)] +pub struct ImageStreamerShardStat { + pub size: u64, + pub transfer_duration_millis: u128, +} + +// These are emitted for metrics +#[derive(Serialize, Deserialize)] +pub struct Stats { + pub total_size_mb: f64, + pub total_duration_sec: f64, + pub rate_mb_per_sec: f64, + pub shards: Vec, +} +#[derive(Serialize, Deserialize)] +pub struct ShardStat { + pub size_mb: f64, + pub duration_sec: f64, + pub rate_mb_per_sec: f64, +} + +impl Stats { + pub fn show(&self) { + info!("Uncompressed image size is {:.0} MiB, rate: {:.0} MiB/s", + self.total_size_mb, self.rate_mb_per_sec); + + if log_enabled!(log::Level::Debug) && self.shards.len() > 1 { + for (i, shard) in self.shards.iter().enumerate() { + debug!(" Shard {}: {:.0} MiB, rate: {:.0} MiB/s", + i+1, shard.size_mb, shard.rate_mb_per_sec); + } + } + + // To show the compressed rates, we need to examine the output pipes. + // But that will cost us some CPU overhead as there's no way to get + // stats on a kernel pipe, to my knowledge. 
+ } +} + +impl From for Stats { + fn from(stats: ImageStreamerStats) -> Self { + let total_size: u64 = stats.shards.iter().map(|s| s.size).sum(); + let total_duration_millis = stats.shards.iter().map(|s| s.transfer_duration_millis).max().unwrap_or(0); + + let total_size_mb = total_size as f64 / MB as f64; + let total_duration_sec = total_duration_millis as f64 / 1000.0; + let rate_mb_per_sec = if total_duration_sec == 0.0 { 0.0 } else { total_size_mb / total_duration_sec }; + + let shards = stats.shards.into_iter().map(|s| { + let size_mb = s.size as f64 / MB as f64; + let duration_sec = s.transfer_duration_millis as f64 / 1000.0; + let rate_mb_per_sec = if duration_sec == 0.0 { 0.0 } else { size_mb / duration_sec }; + ShardStat { size_mb, duration_sec, rate_mb_per_sec } + }).collect::>(); + + Self { total_size_mb, total_duration_sec, rate_mb_per_sec, shards } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..18b97c7 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,40 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// We have both a lib.rs and main.rs to make writing integration tests possible. +// The integration tests compile to a separate program using this fastfreeze library. 
+ +pub mod logger; +pub mod util; +pub mod process; +pub mod cli; +pub mod store; +pub mod image; +pub mod virt; +pub mod metrics; +pub mod consts; +pub mod criu; +pub mod filesystem; +pub mod image_streamer; +pub mod lock; +pub mod signal; + +#[macro_use] +extern crate anyhow; +#[macro_use] +extern crate log; +#[macro_use] +extern crate lazy_static; +#[macro_use] +extern crate serde_json; diff --git a/src/lock.rs b/src/lock.rs new file mode 100644 index 0000000..38189d7 --- /dev/null +++ b/src/lock.rs @@ -0,0 +1,103 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use std::{ + os::unix::io::AsRawFd, + time::{Instant, Duration}, + path::Path, + fs, +}; +use nix::{ + Error, + errno::Errno, + fcntl::{flock, FlockArg} +}; +use crate::{ + consts::*, + signal::check_for_pending_sigterm, +}; + +#[must_use = "if unused, the lock will immediately unlock"] +/// When `FileLockGuard` is dropped, the corresponding `fs::File` is closed, unlocking the file. 
+pub struct FileLockGuard(fs::File); + +#[derive(Debug)] +struct LockTimeoutError; +impl std::error::Error for LockTimeoutError {} +impl std::fmt::Display for LockTimeoutError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Lock timeout exeeded") + } +} + +pub fn file_lock(path: &Path, timeout: Option, exclusive: bool) + -> Result +{ + // Instead of using alarm() to support timeout, we use the non-blocking + // version of flock to avoid races. + let flag = match (timeout.is_some(), exclusive) { + (true, true) => FlockArg::LockExclusiveNonblock, + (true, false) => FlockArg::LockSharedNonblock, + (false, true) => FlockArg::LockExclusive, + (false, false) => FlockArg::LockShared, + }; + + let file = fs::File::create(path) + .with_context(|| format!("Failed to create lock file {}. \ + Run `fastfreeze install` first", path.display()))?; + + trace!("Waiting to acquire file lock at {}", path.display()); + + loop { + check_for_pending_sigterm()?; + + match (flock(file.as_raw_fd(), flag), timeout.as_ref()) { + (Err(Error::Sys(Errno::EAGAIN)), Some(timeout)) => { + ensure!(Instant::now() < *timeout, LockTimeoutError); + std::thread::sleep(Duration::from_millis(100)); + }, + (Err(Error::Sys(Errno::EINTR)), _) => {}, + (Err(e), _) => bail!(e), + (Ok(_), _) => break, + } + } + + Ok(FileLockGuard(file)) +} + +pub fn checkpoint_restore_lock(timeout: Option, exclusive: bool) + -> Result +{ + file_lock(&*LOCK_FILE_PATH, timeout, exclusive).map_err(|e| + match e.downcast::() { + Ok(_) => anyhow!("Previous checkpoint/restore operation still in progress"), + Err(e) => e, + } + ) +} + +pub fn with_checkpoint_restore_lock(f: F) -> Result + where F: FnOnce() -> Result, +{ + let _lock_guard = { + // We use a 1 second timeout because we could be racing with a "fastfreeze + // wait" command, which holds the lock for a tiny amount of time. Otherwise, + // we would use 0 timeout. 
+ let timeout = Some(Instant::now() + Duration::from_secs(1)); + checkpoint_restore_lock(timeout, true)? + }; + + f() +} diff --git a/src/logger.rs b/src/logger.rs new file mode 100644 index 0000000..76390e8 --- /dev/null +++ b/src/logger.rs @@ -0,0 +1,112 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use std::{ + io::prelude::*, + io::stderr, + sync::Mutex, + fs, +}; +use log::{Record, Metadata}; +pub use log::LevelFilter; +use chrono::prelude::*; +use crate::{ + consts::*, + util::create_dir_all, +}; + + +pub struct Logger { + cmd_name: &'static str, + log_file: Option>, +} + +impl log::Log for Logger { + fn enabled(&self, _metadata: &Metadata) -> bool { + true + } + + fn log(&self, record: &Record) { + let msg = format!("[ff.{}] ({:.3}s) {}\n", + self.cmd_name, START_TIME.elapsed().as_secs_f64(), record.args()); + + // When writing our log outputs fail, we dismiss the errors. + // Maybe there's something better to do. + let _ = stderr().write_all(msg.as_bytes()); + let _ = self.log_file.as_ref().map(|f| + f.lock().unwrap().write_all(msg.as_bytes())); + } + + fn flush(&self) { + let _ = stderr().flush(); + let _ = self.log_file.as_ref().map(|f| + f.lock().unwrap().flush()); + } +} + +fn open_log_file(cmd_name: &str) -> Result { + create_dir_all(&*FF_LOG_DIR)?; + + // We pick a random log filename. This is because the log file is saved in the checkpoint + // image. 
When we restore, we need to preserve the previous log. Having different log files + // makes it easier to do so. + let log_file = FF_LOG_DIR.join( + format!("ff-{}-{}-{}.log", + Utc::now().format("%Y%m%d-%H%M%S"), + cmd_name, + &*INVOCATION_ID)); + + Ok(fs::OpenOptions::new() + .create(true) + .append(true) + .open(log_file)?) +} + +pub fn init(level: LevelFilter, cmd_name: &'static str, use_log_file: bool) { + let (log_file, log_file_error) = if use_log_file { + // If we can't open the log file, we can't report the failure yet + // as the logger is not yet initialized. So we stash the error, + // and log it later. + match open_log_file(cmd_name) { + Ok(f) => (Some(f), None), + Err(e) => (None, Some(e)), + } + } else { + (None, None) + }; + + let log_file = log_file.map(Mutex::new); + let logger = Logger { cmd_name, log_file }; + + // An error is returned when the logger has already been initialized. + // Initializing the logger twice would be a logic error, so it's safe to unwrap(). + log::set_boxed_logger(Box::new(logger)).unwrap(); + log::set_max_level(level); + + if let Some(err) = log_file_error { + warn!("WARN: Failed to open the log file at {}: {}", + FF_LOG_DIR.display(), err); + } + + if use_log_file { + let host = hostname::get().map_or_else( + |err| format!("<{}>", err), + |h| h.to_string_lossy().to_string()); + + warn!("Time is {}", Utc::now().to_rfc2822()); + warn!("Host is {}", host); + warn!("Invocation ID is {}", &*INVOCATION_ID); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..d328298 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,72 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod logger; +pub mod util; +pub mod process; +pub mod cli; +pub mod store; +pub mod image; +pub mod virt; +pub mod metrics; +pub mod consts; +pub mod criu; +pub mod filesystem; +pub mod image_streamer; +pub mod lock; +pub mod signal; + +#[macro_use] +extern crate anyhow; +#[macro_use] +extern crate log; +#[macro_use] +extern crate lazy_static; +#[macro_use] +extern crate serde_json; + +use anyhow::Result; +use structopt::StructOpt; + +use crate::{ + consts::*, + cli::{ExitCode, CLI}, + virt::disable_local_time_virtualization, + signal::trap_sigterm_and_friends, +}; + +fn main() { + fn do_main() -> Result<()> { + // We have to be exempt from time virtualization because we use + // `Instant::now()`, which uses CLOCK_MONOTONIC. + // disable_local_time_virtualization() does an execve() if needed. + disable_local_time_virtualization()?; + + // START_TIME is used for logging purposes + lazy_static::initialize(&START_TIME); + + // Trapping signals is important for cleanups (e.g., kill children) before we exit + trap_sigterm_and_friends()?; + + let opts = cli::Opts::from_args(); + opts.init_logger(); + opts.run() + } + + if let Err(e) = do_main() { + log::error!("{:#}", e); + let exit_code = ExitCode::from_error(&e).unwrap_or(EXIT_CODE_FAILURE); + std::process::exit(exit_code as i32); + } +} diff --git a/src/metrics.rs b/src/metrics.rs new file mode 100644 index 0000000..d4a7a54 --- /dev/null +++ b/src/metrics.rs @@ -0,0 +1,86 @@ +// Copyright 2020 Two Sigma Investments, LP. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use std::{ + ffi::OsString, + time::Instant, +}; +use crate::{ + consts::*, + process::{Process, Command}, + util::JsonMerge, +}; +use serde_json::Value; + +lazy_static! { + static ref METRICS_RECORDER_PATH: Option = + std::env::var_os("FF_METRICS_RECORDER"); + + static ref ARGS_JSON: Value = + serde_json::to_value(std::env::args().collect::>()) + .expect("Failed to serialize CLI arguments into json"); +} + +pub fn emit_metrics(event: Value) -> Result> { + let metrics_recorder_path = match METRICS_RECORDER_PATH.as_ref() { + Some(path) => path, + None => return Ok(None), + }; + + let payload = json!({ + "invocation_id": *INVOCATION_ID, + "elapsed_time": START_TIME.elapsed().as_secs_f64(), + "cli_args": *ARGS_JSON, + "event": event, + }); + + let p = Command::new(&[metrics_recorder_path]) + .arg(&serde_json::to_string(&payload)?) 
+ .show_cmd_on_spawn(log_enabled!(log::Level::Trace)) + .spawn() + .context("Failed to spawn the metrics program")?; + + Ok(Some(p)) +} + +pub fn with_metrics(action: &str, f: F, metrics_f: M) -> Result + where F: FnOnce() -> Result, + M: Fn(&R) -> Value +{ + if METRICS_RECORDER_PATH.is_none() { + return f(); + } + + let start_time = Instant::now(); + let result = f(); + let event = json!({ + "action": action, + "duration": start_time.elapsed().as_secs_f64(), + }).merge(match &result { + Ok(result) => json!({ + "outcome": "success", + }).merge(metrics_f(result)), + Err(e) => json!({ + "outcome": "error", + "msg": e.to_string(), + }), + }); + + // If the metrics CLI fails, we don't return the error to the caller. + // Instead, we log the error and move on. + emit_metrics(event)?.map(|p| p.reap_on_drop()); + + result +} diff --git a/src/process/command.rs b/src/process/command.rs new file mode 100644 index 0000000..8158b7b --- /dev/null +++ b/src/process/command.rs @@ -0,0 +1,166 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, Context}; +use std::{ + io::Result as IoResult, + io::Error as IoError, + os::unix::io::AsRawFd, + ffi::{OsString, OsStr}, + collections::HashMap, + process::Command as StdCommand, + os::unix::process::CommandExt, +}; +use nix::{ + fcntl::{fcntl, FcntlArg, FdFlag, OFlag}, + unistd::setsid, +}; +use crate::util::Pipe; +use super::Process; + +// We re-export these, as they are part of our API +pub use std::process::{ + ExitStatus, Stdio, ChildStdin, ChildStdout, ChildStderr, Output +}; + +pub type EnvVars = HashMap; + +// We wrap the standard library `Command` to provide additional features: +// * Logging of the command executed, and failures +// * set_pgrp() +// We have to delegate a few methods to the inner `StdCommand`, which makes it a bit verbose. +// We considered the subprocess crate, but it wasn't very useful, and it lacked +// the crucial feature of pre_exec() that the standard library has for doing setsid(). + +pub struct Command { + inner: StdCommand, + display_args: Vec, + show_cmd_on_spawn: bool, +} + +impl Command { + pub fn new, S: AsRef>(args: I) -> Self { + let mut args = args.into_iter(); + let program = args.next().unwrap(); // unwrap() is fine as we never pass empty args + let mut cmd = Self { + inner: StdCommand::new(&program), + display_args: vec![Self::arg_for_display(&program)], + show_cmd_on_spawn: true, + }; + cmd.args(args); + cmd + } + + pub fn new_shell>(script: S) -> Self { + // We use bash for pipefail support + let mut inner = StdCommand::new("/bin/bash"); + inner.arg("-o").arg("pipefail") + .arg("-c").arg(&script); + Self { + inner, + display_args: vec![Self::arg_for_display(&script)], + show_cmd_on_spawn: true, + } + } + + pub fn arg>(&mut self, arg: S) -> &mut Self { + self.display_args.push(Self::arg_for_display(&arg)); + self.inner.arg(&arg); + self + } + + pub fn arg_for_display>(arg: S) -> String { + arg.as_ref().to_string_lossy().into_owned() + } + + pub fn args, S: AsRef>(&mut self, args: I) -> &mut 
Self { + for arg in args { self.arg(arg); } + self + } + + pub fn setsid(&mut self) -> &mut Self { + unsafe { + self.pre_exec(|| match setsid() { + Err(e) => { + error!("Failed to setuid(): {}", e); + // Only errno is propagated back to the parent + Err(IoError::last_os_error()) + }, + Ok(_) => Ok(()), + }) + } + } + + pub fn show_cmd_on_spawn(&mut self, value: bool) -> &mut Self { + self.show_cmd_on_spawn = value; + self + } + + pub fn spawn(&mut self) -> Result { + let display_cmd = self.display_args.join(" "); + let inner = self.inner.spawn() + .with_context(|| format!("Failed to spawn `{}`", display_cmd))?; + if self.show_cmd_on_spawn { + debug!("+ {}", display_cmd); + } + Ok(Process::new(inner, display_cmd)) + } + + pub fn exec(&mut self) -> Result<()> { + bail!(self.inner.exec()) + } +} + +// These are delegates to the inner `StdCommand`. +impl Command { + pub fn env, V: AsRef>(&mut self, key: K, val: V) -> &mut Command + { self.inner.env(key, val); self } + pub fn envs, K: AsRef, V: AsRef>(&mut self, vars: I) -> &mut Command + { self.inner.envs(vars); self } + pub fn env_remove>(&mut self, key: K) -> &mut Command + { self.inner.env_remove(key); self } + pub fn env_clear(&mut self) -> &mut Command + { self.inner.env_clear(); self } + pub fn stdin>(&mut self, cfg: T) -> &mut Command + { self.inner.stdin(cfg); self } + pub fn stdout>(&mut self, cfg: T) -> &mut Command + { self.inner.stdout(cfg); self } + pub fn stderr>(&mut self, cfg: T) -> &mut Command + { self.inner.stderr(cfg); self } + pub unsafe fn pre_exec(&mut self, f: F) -> &mut Command + where + F: FnMut() -> IoResult<()> + Send + Sync + 'static + { self.inner.pre_exec(f); self } +} + +pub trait PipeCommandExt: Sized { + /// Create a new pipe input (e.g., stdin). 
+ fn new_input() -> Result; + /// Create a new pipe output (e.g., stdout, stderr) + fn new_output() -> Result; +} + +impl PipeCommandExt for Pipe { + fn new_input() -> Result { + let pipe = Self::new(OFlag::empty())?; + fcntl(pipe.write.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::FD_CLOEXEC))?; + Ok(pipe) + } + + fn new_output() -> Result { + let pipe = Self::new(OFlag::empty())?; + fcntl(pipe.read.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::FD_CLOEXEC))?; + Ok(pipe) + } +} diff --git a/src/process/mod.rs b/src/process/mod.rs new file mode 100644 index 0000000..b1b215d --- /dev/null +++ b/src/process/mod.rs @@ -0,0 +1,23 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod command; +mod process_group; +mod process; +mod spawn_with_pid; + +pub use command::{Command, PipeCommandExt, Stdio, EnvVars}; +pub use process::{Process, Output}; +pub use process_group::{ProcessExt, ProcessGroup}; +pub use spawn_with_pid::{CommandPidExt, set_ns_last_pid, spawn_set_ns_last_pid_server, MIN_PID}; diff --git a/src/process/process.rs b/src/process/process.rs new file mode 100644 index 0000000..b6d2158 --- /dev/null +++ b/src/process/process.rs @@ -0,0 +1,187 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use std::{ + time::{Duration, Instant}, + os::unix::process::ExitStatusExt, + convert::TryFrom, +}; +use nix::{ + sys::signal::{self, Signal}, unistd::Pid, +}; + +pub use std::process::{ + ExitStatus, + Stdio, + ChildStdin, + ChildStdout, + ChildStderr, + Output as StdOutput, + Child +}; +use crate::signal::{check_for_pending_sigterm, retry_on_interrupt}; + +// We create our own `Child` wrapper to provide better error context. +// We further expose a slightly different API than what is offered from the stdlib. +// to incorporate SIGTERM monitoring, and helpful error messages + +pub struct Process { + inner: Child, + display_cmd: String, +} + +impl Process { + pub fn new(inner: Child, display_cmd: String) -> Self { + Self { inner, display_cmd } + } + + pub fn pid(&self) -> i32 { self.inner.id() as i32 } + + pub fn kill(&self, signal: Signal) -> Result<()> { + signal::kill(Pid::from_raw(self.pid()), signal) + .with_context(|| format!("Failed to signal pid={}", self.pid())) + } + + pub fn try_wait(&mut self) -> Result> { + check_for_pending_sigterm()?; + self.inner.try_wait() + .with_context(|| format!("wait(pid={}) failed", self.pid())) + } + + pub fn wait(&mut self) -> Result { + retry_on_interrupt(|| { + check_for_pending_sigterm()?; + self.inner.wait() + .with_context(|| format!("wait(pid={}) failed", self.pid())) + }) + } + + pub fn wait_timeout(&mut self, until: Instant) -> Result> { + loop { + if let Some(exit_status) = self.try_wait()? 
{ + return Ok(Some(exit_status)); + } + + if Instant::now() > until { + return Ok(None); + } + + std::thread::sleep(Duration::from_millis(100)); + } + } + + pub fn wait_for_success(&mut self) -> Result<()> { + let exit_status = self.wait()?; + ensure_successful_exit_status(exit_status, &self.display_cmd) + } + + pub fn wait_with_output(self) -> Result { + let Process { display_cmd, inner } = self; + + // FIXME `wait_with_output()` can read from stderr, and stdout and + // ignore if we received a SIGTERM. That's because `read_to_end()` is + // used internally, and ignores EINTR. + // That means that we won't act on SIGTERM. + check_for_pending_sigterm()?; + let result = inner.wait_with_output()?; + + Ok(Output { + status: result.status, + stdout: result.stdout, + stderr: result.stderr, + display_cmd, + }) + } + + pub fn reap_on_drop(self) -> ProcessDropReaper { + ProcessDropReaper { inner: self } + } + + // In the following, unwrap() is okay. It would be a logic error to access + // these without having setup the corresponding pipe. + pub fn stdin(&mut self) -> &mut ChildStdin { self.inner.stdin.as_mut().unwrap() } + pub fn stdout(&mut self) -> &mut ChildStdout { self.inner.stdout.as_mut().unwrap() } + pub fn stderr(&mut self) -> &mut ChildStderr { self.inner.stderr.as_mut().unwrap() } +} + +pub struct ProcessDropReaper { + inner: Process, +} + +impl Drop for ProcessDropReaper { + fn drop(&mut self) { + // If the process fails, we log the error and move on. 
+ let _ = self.inner.wait_for_success() + .map_err(|e| error!("{}", e)); + } +} + +pub struct Output { + pub status: ExitStatus, + pub stdout: Vec, + pub stderr: Vec, + pub display_cmd: String, +} + +impl Output { + pub fn ensure_success(&self) -> Result<()> { + ensure_successful_exit_status(self.status, &self.display_cmd) + } +} + +fn ensure_successful_exit_status(exit_status: ExitStatus, display_cmd: &str) -> Result<()> { + if exit_status.success() { + Ok(()) + } else if let Some(exit_code) = exit_status.code() { + bail!("`{}` failed with exit_code={}", display_cmd, exit_code); + } else if let Some(signal) = exit_status.signal() { + let signal = Signal::try_from(signal as i32) + .map_or_else(|_| format!("signal {}", signal), |s| s.to_string()); + bail!("`{}` caught fatal {}", display_cmd, signal) + } else { + bail!("Unexpected child exit status {:?}", exit_status); + } +} + +#[cfg(test)] +mod test { + use super::*; + use super::super::*; + + #[test] + fn test_shell() -> Result<()> { + let mut cmd = Command::new_shell("exit `echo 33`").spawn()?; + let err_msg = cmd.wait_for_success().unwrap_err().to_string(); + + dbg!(&err_msg); + assert!(err_msg.contains("exit `echo 33`")); + assert!(err_msg.contains("exit_code=33")); + + Ok(()) + } + + #[test] + fn test_args() -> Result<()> { + let out = Command::new(&["echo", "-n", "hello"]) + .stdout(Stdio::piped()) + .spawn()? + .wait_with_output()? + .stdout; + + assert_eq!(String::from_utf8_lossy(&out), "hello"); + + Ok(()) + } +} diff --git a/src/process/process_group.rs b/src/process/process_group.rs new file mode 100644 index 0000000..50dcb73 --- /dev/null +++ b/src/process/process_group.rs @@ -0,0 +1,298 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use std::{ + os::unix::io::AsRawFd, + io::{ErrorKind, Read}, + time::{Duration, Instant}, + fs, +}; +use nix::{ + poll::{PollFd, PollFlags}, + fcntl::OFlag, + sys::signal, +}; +use crate::{ + consts::*, + util::{poll_nointr, Pipe}, +}; +use super::Process; + +/// `ProcessGroup` is used for monitoring a group of processes. +/// When dropped, the whole group is killed, except non-killable children. +pub struct ProcessGroup { + /// We use a pipe to process SIGCHLD, because at some point we need to select() + /// on a pipe and watch for children to fail simultaneously. + pub sigchld_pipe: fs::File, + /// The list of children. When a child terminates, it is taken out from the list. + children: Vec, + /// When `ProcessGroup` is dropped, it sends a SIGTERM to the remaining + /// killable children. After kill_grace_period has elapsed, it sends a SIGKILL. + kill_grace_period: Duration, + /// Something to remember for unregistering the sigchld_pipe SIGCHLD. + sig_hook_id: signal_hook::SigId, +} + +pub struct ProcessMembership { + inner: Process, + /// When the process is marked as killable, it means that the process monitor + /// can kill it on drop(). This is useful to make CRIU immune to kills as it + /// could leave the application in a bad state. + killable: bool, + /// When the process is marked as daemon, it means that the process monitor + /// won't wait for this process to exit in wait_for_success(). 
+ daemon: bool, +} + +impl From for ProcessMembership { + fn from(inner: Process) -> Self { + Self { inner, killable: true, daemon: false } + } +} + +impl ProcessMembership { + pub fn non_killable(self) -> Self { + Self { killable: false, ..self } + } + pub fn daemon(self) -> Self { + Self { daemon: true, ..self } + } +} + +impl ProcessGroup { + pub fn new() -> Result { + Self::with_kill_grace_period(Duration::from_secs(KILL_GRACE_PERIOD_SECS)) + } + + pub fn with_kill_grace_period(kill_grace_period: Duration) -> Result { + let pipe = Pipe::new(OFlag::O_CLOEXEC | OFlag::O_NONBLOCK)?; + let sig_hook_id = signal_hook::pipe::register(signal_hook::SIGCHLD, pipe.write) + .context("Failed to register signal")?; + + Ok(Self { + sigchld_pipe: pipe.read, + children: Vec::new(), + kill_grace_period, + sig_hook_id, + }) + } + + pub fn add(&mut self, proc: impl Into) -> &mut Self { + self.children.push(proc.into()); + self + } + + fn drain_sigchld_pipe(&mut self) { + // Discard the content of the pipe + let mut vec = Vec::new(); + match self.sigchld_pipe.read_to_end(&mut vec) { + Err(e) if e.kind() == ErrorKind::WouldBlock => {} + result => { result.expect("SIGCHLD pipe has draining issues"); } + } + } + + /// Returns an error if a process has exited with a failure. + /// Return Ok(true) if some children are remaining, Ok(false) otherwise. + pub fn try_wait_for_success(&mut self) -> Result { + self.drain_sigchld_pipe(); + + // partition() doesn't work well due to try_wait() mutablity and result. + let mut completed = Vec::new(); + let mut running = Vec::new(); + for mut child in self.children.drain(..) { + if child.inner.try_wait()?.is_some() { + completed.push(child); + } else { + running.push(child); + } + } + self.children = running; + + for mut child in completed { + child.inner.wait_for_success()?; + } + + Ok(self.children.iter().any(|c| !c.daemon)) + } + + pub fn wait_for_success(&mut self) -> Result<()> { + while self.try_wait_for_success()? 
{ + let fd = self.sigchld_pipe.as_raw_fd(); + let timeout = -1; + poll_nointr(&mut [PollFd::new(fd, PollFlags::POLLIN)], timeout) + .context("Failed to poll()")?; + } + Ok(()) + } + + fn terminate_killable_gracefully(&mut self) -> Result<()> { + let (mut killables, non_killables) = self.children.drain(..) + .partition(|c| c.killable); + + self.children = non_killables; + + for child in &mut killables { + if child.inner.try_wait()?.is_none() { + // Sending the signal should not fail as our child is not reaped + // as try_wait() returned is none. + child.inner.kill(signal::SIGTERM)?; + } + } + + let deadline = Instant::now() + self.kill_grace_period; + for child in &mut killables { + if child.inner.wait_timeout(deadline)?.is_none() { + // Child didn't exit in time, it is getting a SIGKILL. + // kill() should not failed as our child is not reaped. + child.inner.kill(signal::SIGKILL)?; + child.inner.wait()?; + } + } + + Ok(()) + } + + pub fn terminate(&mut self) -> Result<()> { + self.terminate_killable_gracefully()?; + for child in &mut self.children { + child.inner.wait()?; + } + ensure!(signal_hook::unregister(self.sig_hook_id), + "signal_hook failed to unregister"); + Ok(()) + } +} + +impl Drop for ProcessGroup { + fn drop(&mut self) { + let _ = self.terminate() + .map_err(|e| error!("Skipping children termination: {}", e)); + } +} + +pub trait ProcessExt { + fn join(self, pgrp: &mut ProcessGroup); + fn join_as_non_killable(self, pgrp: &mut ProcessGroup); + fn join_as_daemon(self, pgrp: &mut ProcessGroup); +} + +impl ProcessExt for Process { + fn join(self, pgrp: &mut ProcessGroup) { + pgrp.add(self); + } + fn join_as_non_killable(self, pgrp: &mut ProcessGroup) { + pgrp.add(ProcessMembership::from(self).non_killable()); + } + fn join_as_daemon(self, pgrp: &mut ProcessGroup) { + pgrp.add(ProcessMembership::from(self).daemon()); + } +} + +#[cfg(test)] +mod test { + use super::*; + use super::super::*; + + use nix::sys::signal::Signal; + + fn new_process_group() -> 
Result { + let kill_grace_period = Duration::from_secs_f32(0.3); + ProcessGroup::with_kill_grace_period(kill_grace_period) + } + + #[test] + fn test_basic_kill() -> Result<()> { + let mut pgrp = new_process_group()?; + Command::new(&["sleep", "1000"]) + .spawn()? + .join(&mut pgrp); + // drops and kills sleep + Ok(()) + } + + #[test] + fn test_wait_success() -> Result<()> { + new_process_group()? + .add(Command::new(&["true"]).spawn()?) + .add(Command::new(&["sleep"]).arg("0.2").spawn()?) + .wait_for_success() + } + + #[test] + fn test_exit_fail() -> Result<()> { + let err_msg = new_process_group()? + .add(Command::new(&["true"]).spawn()?) + .add(Command::new(&["sleep"]).arg("1000").spawn()?) + .add(Command::new(&["false"]).spawn()?) + .wait_for_success() + .unwrap_err() + .to_string(); + + dbg!(&err_msg); + assert!(err_msg.contains("false")); + assert!(err_msg.contains("exit_code=1")); + + Ok(()) + } + + #[test] + fn test_signaled() -> Result<()> { + let cmd = Command::new(&["sleep", "1000"]).spawn()?; + cmd.kill(Signal::SIGTERM)?; + + let err_msg = new_process_group()? + .add(cmd) + .wait_for_success() + .unwrap_err() + .to_string(); + + dbg!(&err_msg); + assert!(err_msg.contains("sleep")); + assert!(err_msg.contains("caught fatal SIGTERM")); + + Ok(()) + } + + #[test] + fn test_unkillable() -> Result<()> { + let start_time = Instant::now(); + + let mut pgrp = new_process_group()?; + + Command::new(&["sleep", "1"]).spawn()? + .join_as_non_killable(&mut pgrp); + + drop(pgrp); + + assert!(start_time.elapsed().as_millis() > 1000); + + Ok(()) + } + + #[test] + fn test_daemon() -> Result<()> { + let start_time = Instant::now(); + let mut pgrp = new_process_group()?; + + Command::new(&["sleep", "1000"]).spawn()? 
+ .join_as_daemon(&mut pgrp); + + pgrp.wait_for_success()?; + + assert!(start_time.elapsed().as_secs() < 1000); + + Ok(()) + } +} diff --git a/src/process/spawn_with_pid.rs b/src/process/spawn_with_pid.rs new file mode 100644 index 0000000..1cf1733 --- /dev/null +++ b/src/process/spawn_with_pid.rs @@ -0,0 +1,97 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use std::{ + time::Duration, + io::Error as IoError, + fs, +}; +use crate::consts::*; +use super::{Command, Process}; + +// At times, we wish to spawn a process with a desired PID. +// We do so when running the application from scratch. + +/// When the child fails, it can only provide an i32 errno to the parent as +/// information with the current pre_exec() from the Rust stdlib. +const BAD_PID_ERRNO: i32 = 0x0BAD_71D0; + +/// `MIN_PID` is the pid Linux gives to a process when it wraps around PID_MAX +pub const MIN_PID: i32 = 300; + +pub trait CommandPidExt { + fn spawn_with_pid(self, pid: i32) -> Result; +} + +impl CommandPidExt for Command { + /// Spawns the command with the desired PID. + /// Note: we consume self because we mutate it, and it would be unsound to + /// call `spawn()` again on it. 
+ fn spawn_with_pid(mut self, pid: i32) -> Result { + debug_assert!(pid >= MIN_PID); + + unsafe { + self.pre_exec(move || + if std::process::id() as i32 != pid { + Err(IoError::from_raw_os_error(BAD_PID_ERRNO)) + } else { + Ok(()) + } + ); + } + + set_ns_last_pid(pid-1)?; + + self.spawn().map_err(|e| { + if let Some(e) = e.downcast_ref::() { + if e.raw_os_error() == Some(BAD_PID_ERRNO) { + return anyhow!( + "Failed to spawn process with pid={}. \ + This happens when other processes are being spawn simultaneously. \ + The `--on-app-ready` hook can be useful to run programs once safe to do.", pid); + } + } + e + }) + } +} + +pub fn set_ns_last_pid(pid: i32) -> Result<()> { + Command::new(&["set_ns_last_pid", &pid.to_string()]) + .spawn()? + .wait_for_success() +} + +pub fn spawn_set_ns_last_pid_server() -> Result { + match fs::remove_file(&*NS_LAST_PID_SOCK_PATH) { + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}, + Err(e) => bail!(e), + Ok(_) => {}, + } + + let mut process = Command::new(&["set_ns_last_pid"]) + .arg(&*NS_LAST_PID_SOCK_PATH) + .spawn()?; + + while !NS_LAST_PID_SOCK_PATH.exists() { + if process.try_wait()?.is_some() { + process.wait_for_success()?; + bail!("set_ns_last_pid exited"); + } + std::thread::sleep(Duration::from_millis(100)); + } + + Ok(process) +} diff --git a/src/signal.rs b/src/signal.rs new file mode 100644 index 0000000..df8caad --- /dev/null +++ b/src/signal.rs @@ -0,0 +1,106 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use std::{ + sync::atomic::{AtomicBool, Ordering}, + error::Error, + fmt::Display, + result::Result as StdResult, + io::ErrorKind, +}; +use nix::errno::Errno; +use nix::sys::signal; + +lazy_static! { + static ref SIGTERM_RECEIVED: AtomicBool = AtomicBool::new(false); +} + +pub fn trap_sigterm_and_friends() -> Result<()> { + for signal in &[signal::SIGTERM, signal::SIGHUP, signal::SIGINT] { + unsafe { + // We cannot emit a log message in the signal handler as it + // would be unsafe to allocate memory. + signal_hook::register(*signal as i32, || + SIGTERM_RECEIVED.store(true, Ordering::SeqCst))?; + } + } + Ok(()) +} + +#[derive(Debug)] +pub struct TerminationRequestedError; +impl Error for TerminationRequestedError {} +impl Display for TerminationRequestedError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Termination requested") + } +} + +/// Returns an error when a SIGTERM has been received. The signal is +/// consumed, meaning that a subsequent call to `check_for_pending_sigterm()` +/// will succeed unless another SIGTERM is received. 
+pub fn check_for_pending_sigterm() -> Result<()> { + if SIGTERM_RECEIVED.fetch_and(false, Ordering::SeqCst) { + info!("Termination requested"); + bail!(TerminationRequestedError); + } + Ok(()) +} + +pub trait IsErrorInterrupt { + fn is_interrupt(&self) -> bool; +} + +impl IsErrorInterrupt for nix::Error { + fn is_interrupt(&self) -> bool { + match &self { + Self::Sys(errno) if *errno == Errno::EINTR => true, + _ => false + } + } +} + +impl IsErrorInterrupt for std::io::Error { + fn is_interrupt(&self) -> bool { + self.kind() == ErrorKind::Interrupted + } +} + +impl IsErrorInterrupt for anyhow::Error { + fn is_interrupt(&self) -> bool { + match self.downcast_ref::() { + Some(e) if e.is_interrupt() => return true, + _ => {} + } + + match self.downcast_ref::() { + Some(e) if e.is_interrupt() => return true, + _ => {} + } + + false + } +} + +pub fn retry_on_interrupt(mut f: impl FnMut() -> StdResult) -> StdResult + where E: IsErrorInterrupt +{ + loop { + match f() { + Err(e) if e.is_interrupt() => {} + other => return other, + } + } +} diff --git a/src/store/gs.rs b/src/store/gs.rs new file mode 100644 index 0000000..2f1e7bf --- /dev/null +++ b/src/store/gs.rs @@ -0,0 +1,67 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use url::Url; +use crate::util::UrlExt; + +// Google Cloud Storage adapter + +lazy_static! 
{ + static ref GS_CMD: String = std::env::var("GS_CMD") + .unwrap_or_else(|_| "gsutil".to_string()); +} + +pub struct Store { + url: Url, +} + +impl Store { + pub fn new(url: Url) -> Self { + Self { url } + } +} + +impl super::Store for Store { + fn prepare(&self, _write: bool) -> Result<()> { + Ok(()) + } + + fn file(&self, filename: &str) -> Box { + Box::new(File { url: self.url.raw_join(filename) }) + } +} + +pub struct File { + url: Url, +} + +impl super::File for File { + fn upload_shell_cmd(&self) -> String { + // TODO Allow lifecycle management options to be configured + // https://cloud.google.com/storage/docs/managing-lifecycles + // XXX gsutil eats lots of memory. We should publish our GCS upload tool. + format!("{} cp - \"{}\"", *GS_CMD, self.url) + } + + + fn download_shell_cmd(&self) -> String { + format!("{} cp \"{}\" -", *GS_CMD, self.url) + } + + fn has_not_found_error(&self, stderr: &str) -> bool { + stderr.contains("Not Found") || + stderr.contains("No such object") + } +} diff --git a/src/store/local.rs b/src/store/local.rs new file mode 100644 index 0000000..ef2fa9d --- /dev/null +++ b/src/store/local.rs @@ -0,0 +1,67 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::Result; +use std::path::PathBuf; +use url::Url; +use crate::util::create_dir_all; + +pub struct Store { + path: PathBuf, +} + +impl Store { + pub fn new(url: Url) -> Self { + Self { path: PathBuf::from(url.path()) } + } +} + +impl super::Store for Store { + fn prepare(&self, write: bool) -> Result<()> { + if write { + create_dir_all(&self.path)?; + } + + Ok(()) + } + + fn file(&self, filename: &str) -> Box { + let file_path = if filename == "/dev/null" { + PathBuf::from("/dev/null") + } else { + self.path.join(filename) + }; + + Box::new(File { path: file_path }) + } +} + +pub struct File { + path: PathBuf, +} + +impl super::File for File { + fn upload_shell_cmd(&self) -> String { + // We can unwrap() because the path is valid UTF8, as path comes from a String + format!("pv -q > \"{}\"", self.path.to_str().unwrap()) + } + + fn download_shell_cmd(&self) -> String { + format!("pv -q \"{}\"", self.path.to_str().unwrap()) + } + + fn has_not_found_error(&self, stderr: &str) -> bool { + stderr.contains("No such file or directory") + } +} diff --git a/src/store/mod.rs b/src/store/mod.rs new file mode 100644 index 0000000..af5d4c7 --- /dev/null +++ b/src/store/mod.rs @@ -0,0 +1,123 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod local; +mod s3; +mod gs; + +use anyhow::{Result, Context}; +use std::io::Write; +use url::Url; +use crate::process::{Stdio, Command}; + +// `Store` and `File` describe the API needed to store and retrieve images + +pub trait Store { + /// prepare() is called before accessing the storage. It is called: + /// * with write=true, during the FastFreeze run command + /// * with write=false, during the FastFreeze extract command + /// It is not called during the checkpoint command to speed things up. + fn prepare(&self, write: bool) -> Result<()>; + + /// Returns a File object that represents a file of name `filename`. + /// Example of file name are "manifest.json" and "XXXX-4.ffs". + fn file(&self, filename: &str) -> Box; +} + +pub trait File { + /// Returns a shell command to upload file + fn upload_shell_cmd(&self) -> String; + + /// Returns a shell command to download file + fn download_shell_cmd(&self) -> String; + + // Returns whether stderr contains a "not found error" when the download + // shell command failed. + fn has_not_found_error(&self, stderr: &str) -> bool; +} + +// write()/try_read() are helpers that use the `File` download/upload shell +// commands to download and upload content. +pub trait FileExt: File { + /// Write content to the file, truncating it if necessary. + fn write(&self, data: &[u8]) -> Result<()> { + let mut p = Command::new_shell(self.upload_shell_cmd()) + .stdin(Stdio::piped()) + .spawn()?; + + p.stdin().write_all(data) + .context("Failed to write() into the upload process")?; + p.wait_for_success() + } + + /// Reads a file. Returns None if it doesn't exist. 
+ fn try_read(&self) -> Result>> { + let p = Command::new_shell(self.download_shell_cmd()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + + let output = p.wait_with_output()?; + if output.status.success() { + Ok(Some(output.stdout)) + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + if self.has_not_found_error(&stderr) { + Ok(None) + } else { + eprint!("{}", stderr); + Err(output.ensure_success().unwrap_err()) + } + } + } +} +impl FileExt for dyn File {} + +/// Returns a store corresponding to the provided `url`. +pub fn from_url(url: &str) -> Result> { + let url = Url::parse(url)?; + + Ok(match url.scheme() { + "file" => Box::new(local::Store::new(url)), + "s3" => Box::new(s3::Store::new(url)), + "gs" => Box::new(gs::Store::new(url)), + _ => bail!("Unknown image scheme {}", url), + }) +} + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_from_url() { + assert!(from_url("file:/tmp/img").is_ok()); + assert!(from_url("file:tmp/img").is_ok()); + } + + fn test_store_read_write(store: &Box) -> Result<()> { + store.prepare(true)?; + store.file("f1.txt").write("hello".as_bytes())?; + assert_eq!(store.file("f1.txt").try_read()?, Some("hello".as_bytes().to_vec())); + assert_eq!(store.file("none.txt").try_read()?, None); + Ok(()) + } + + #[test] + fn test_read_write() -> Result<()> { + test_store_read_write(&from_url("file:/tmp/ff-test-files")?)?; + Ok(()) + } +} diff --git a/src/store/s3.rs b/src/store/s3.rs new file mode 100644 index 0000000..a47b837 --- /dev/null +++ b/src/store/s3.rs @@ -0,0 +1,71 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use url::Url; +use crate::{ + consts::*, + util::UrlExt, +}; + +// AWS S3 adapter + +lazy_static! { + static ref S3_CMD: String = std::env::var("S3_CMD") + .unwrap_or_else(|_| "aws s3".to_string()); +} + +pub struct Store { + url: Url, +} + +impl Store { + pub fn new(url: Url) -> Self { + Self { url } + } +} + +impl super::Store for Store { + fn prepare(&self, _write: bool) -> Result<()> { + Ok(()) + } + + fn file(&self, filename: &str) -> Box { + Box::new(File { url: self.url.raw_join(filename) }) + } +} + +pub struct File { + url: Url, +} + +impl super::File for File { + fn upload_shell_cmd(&self) -> String { + // TODO allow users to add an expiration date on images via an env var + // XXX aws s3 cp eats 500Mb+ of memory. That's terrible when using multiple shards. + // We'll most likely need to make our own upload tool. + + // This large expected size ensures that there are not too many multiparts pieces + let expected_size = 10*GB; + format!("{} cp --expected-size {} - \"{}\"", *S3_CMD, expected_size, self.url) + } + + fn download_shell_cmd(&self) -> String { + format!("{} cp \"{}\" -", *S3_CMD, self.url) + } + + fn has_not_found_error(&self, stderr: &str) -> bool { + stderr.contains("Not Found") + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..ae37d0c --- /dev/null +++ b/src/util.rs @@ -0,0 +1,187 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use std::{ + os::unix::io::{AsRawFd, FromRawFd}, + path::{PathBuf, Path}, + env, + ffi::OsString, + fs, +}; +use nix::{ + unistd::pipe2, + fcntl::OFlag, + poll::{poll, PollFd}, + sys::uio::pwrite, +}; +use crate::{ + consts::*, + signal::{IsErrorInterrupt, retry_on_interrupt}, +}; +use serde_json::Value; +use rand::{thread_rng, Rng, distributions::Alphanumeric}; +use url::Url; + + +pub fn gen_random_alphanum_string(len: usize) -> String { + thread_rng() + .sample_iter(&Alphanumeric) + .take(len) + .collect() +} + +pub fn pwrite_all(file: &fs::File, buf: &[u8], offset: i64) -> Result<()> { + let mut buf_off = 0; + + while buf_off < buf.len() { + let file_offset = offset.checked_add(buf_off as i64).expect("File offset overflown"); + let written = retry_on_interrupt(|| + pwrite(file.as_raw_fd(), &buf[buf_off..], file_offset) + )?; + buf_off += written; + } + + Ok(()) +} + +pub fn poll_nointr(fds: &mut [PollFd], timeout: libc::c_int) -> nix::Result +{ + match poll(fds, timeout) { + Err(e) if e.is_interrupt() => Ok(0), + result => result, + } +} + +pub struct Pipe { + pub read: fs::File, + pub write: fs::File, +} + +impl Pipe { + pub fn new(flags: OFlag) -> Result { + let (fd_r, fd_w) = pipe2(flags).context("Failed to create a pipe")?; + let read = unsafe { fs::File::from_raw_fd(fd_r) }; + let write = unsafe { fs::File::from_raw_fd(fd_w) }; + Ok(Self { read, write }) + } +} + +// `strip_prefix()` is a nighly-only feature. +// We use this polyfill, until it goes into stable. 
+pub fn strip_prefix<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
+    if s.starts_with(prefix) {
+        Some(&s[prefix.len()..])
+    } else {
+        None
+    }
+}
+
+pub fn create_dir_all(path: impl AsRef<Path>) -> Result<()> {
+    fs::create_dir_all(path.as_ref())
+        .with_context(|| format!("Failed to create directory {}", path.as_ref().display()))
+}
+
+pub fn copy_file(from: impl AsRef<Path>, to: impl AsRef<Path>) -> Result<u64> {
+    fs::copy(from.as_ref(), to.as_ref())
+        .with_context(|| format!("Failed to copy file {} to {}",
+            from.as_ref().display(), to.as_ref().display()))
+}
+
+pub fn find_lib(lib_name: impl AsRef<Path>) -> Result<PathBuf> {
+    // A more efficient implementation is possible, but it would hurt
+    // readability, so we keep the straightforward linear search.
+    let lib_name = lib_name.as_ref();
+    let mut search_paths = vec![];
+    if let Some(ld_library_paths) = env::var_os("LD_LIBRARY_PATH") {
+        search_paths.extend(env::split_paths(&ld_library_paths));
+    }
+    search_paths.extend(LIB_SEARCH_PATHS.iter().map(PathBuf::from));
+
+    for base_path in search_paths {
+        let path = base_path.join(lib_name);
+        if path.exists() {
+            return Ok(path.canonicalize()?);
+        }
+    }
+
+    bail!("Failed to find {}. 
Try adding its directory to LD_LIBRARY_PATH", + lib_name.display()); +} + + +pub fn atomic_symlink(from: impl AsRef, to: impl AsRef) -> Result<()> { + use std::os::unix::fs::symlink; + + // An awkward way to do `format!("{}.tmp", to)` but with OsString + let mut to_tmp = OsString::from(to.as_ref()); + to_tmp.push(".tmp"); + + symlink(from, &to_tmp)?; + fs::rename(&to_tmp, to) + .map_err(|e| { + let _ = fs::remove_file(&to_tmp); + e + })?; + + Ok(()) +} + +pub trait JsonMerge { + fn merge(self, b: Value) -> Self; +} + +impl JsonMerge for Value { + fn merge(self, b: Value) -> Self { + match (self, b) { + (Value::Object(mut a), Value::Object(b)) => { + a.extend(b); + Value::Object(a) + } + _ => panic!() + } + } +} + +pub trait UrlExt { + fn raw_join(&self, file: &str) -> Url; +} + +impl UrlExt for Url { + fn raw_join(&self, file: &str) -> Url { + // `Url` provides a join() method, but tries to be too smart + let mut url = self.clone(); + url.path_segments_mut() + .expect("URL base error") + .push(file); + url + } +} + +#[test] +fn url_join_test() -> Result<()> { + let url = Url::parse("s3://bucket_name/dir/image_name")?; + assert_eq!(url.raw_join("file").as_str(), "s3://bucket_name/dir/image_name/file"); + + let url = Url::parse("s3://bucket_name/image_name")?; + assert_eq!(url.raw_join("file").as_str(), "s3://bucket_name/image_name/file"); + + let url = Url::parse("s3://bucket_name/")?; + assert_eq!(url.raw_join("file").as_str(), "s3://bucket_name/file"); + + let url = Url::parse("s3://bucket_name")?; + assert_eq!(url.raw_join("file").as_str(), "s3://bucket_name/file"); + + Ok(()) +} diff --git a/src/virt/mod.rs b/src/virt/mod.rs new file mode 100644 index 0000000..e02c93b --- /dev/null +++ b/src/virt/mod.rs @@ -0,0 +1,161 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod time;
+
+use anyhow::{Result, Context};
+use std::{
+    io::prelude::*,
+    io::{BufRead, BufReader, BufWriter, Cursor},
+    os::unix::ffi::OsStrExt,
+    env,
+    fs,
+};
+use crate::{
+    consts::*,
+    util::strip_prefix,
+    process::{Command, Stdio, EnvVars},
+};
+
+// The application needs to be virtualized in two aspects: CPUID and time.
+// For this, three libraries are in play:
+// 1) /lib64/ld-linux-x86-64.so.2: The ELF system loader. We hijack it during the
+//    install command. We replace it with the libvirtcpuid loader. The loader
+//    provides two things:
+//    a) It sets up CPUID virtualization before libc's loader runs. Using
+//       LD_PRELOAD would be too late.
+//       More details can be found at https://github.com/twosigma/libvirtcpuid
+//    b) It provides a way to inject environment variables in any process that uses
+//       the ELF loader (essentially all dynamically loaded binaries). This is
+//       especially useful to force the LD_PRELOAD env variable to applications, even
+//       the ones that try hard to clean up their environment.
+//       Note: that's why we need libvirtcpuid, even if we don't need CPUID
+//       virtualization.
+// 2) libvirtcpuid.so: This library's role is to harden the virtualization put in
+//    place by the hijacked ELF loader. It protects the SIGSEGV handler and is
+//    loaded in the application with an LD_PRELOAD directive.
+// 3) libvirttime.so: This virtualizes CLOCK_MONOTONIC for the application.
+//    It is loaded via LD_PRELOAD.
+//    More details can be found at https://github.com/twosigma/libvirttime.
+ +fn env_for_virtualization() -> EnvVars { + let mut env: EnvVars = EnvVars::new(); + let mut ld_preloads = vec![]; + + // We always need time virtualization + ld_preloads.push(LIBVIRTTIME_PATH.clone()); + env.insert("VIRT_TIME_CONF".into(), (&*VIRT_TIME_CONF_PATH).into()); + + // But not always need CPUID virtualization + if let Some(cpuid_mask) = env::var_os("FF_APP_VIRT_CPUID_MASK") { + if !cpuid_mask.is_empty() { + ld_preloads.push(LIBVIRTCPUID_PATH.clone()); + env.insert("VIRT_CPUID_MASK".into(), cpuid_mask); + } + } + + // Users can force env variables via FF_APP_INJECT_* + for (key, value) in env::vars_os() { + // The env var key is all ASCII, it's okay to use to_string_lossy() + let key = key.to_string_lossy(); + if let Some(key) = strip_prefix(&key, "FF_APP_INJECT_") { + if key == "LD_PRELOAD" { + for path in env::split_paths(&value) { + ld_preloads.push(path); + } + } else { + env.insert(key.into(), value); + } + } + } + + // unwrap is okay here as we cannot possibly have a ":" in one of the ld_preload paths. + env.insert("LD_PRELOAD".into(), env::join_paths(ld_preloads).unwrap()); + + env +} + +/// The system ELF loader interposition loads the LD_INJECT_ENV_PATH as +/// environment variable for all application on the system. +fn inject_env_system_wide(env: &EnvVars) -> Result<()> { + || -> Result<_> { + // These env variables are forced into any program + // that do not have LD_ENV_DISABLE enabled. + let mut ld_inject_file = BufWriter::new( + fs::File::create(&*LD_INJECT_ENV_PATH)?); + + for (key, value) in env { + // format!() would be nicer, but we need to work with OsString, not String. 
+ ld_inject_file.write_all(key.as_bytes())?; + ld_inject_file.write_all(b"=")?; + ld_inject_file.write_all(value.as_bytes())?; + ld_inject_file.write_all(b"\n")?; + } + + ld_inject_file.flush()?; + + Ok(()) + }().with_context(|| format!("Failed to create {}", LD_INJECT_ENV_PATH.display())) +} + +fn ensure_system_wide_virtualization_is_enabled() -> Result<()> { + // Check if applications are getting virtualization env injection via libvirtcpuid. + let output = || -> Result<_> { + Command::new(&["env"]) + .stdout(Stdio::piped()) + .spawn()? + .wait_with_output() + .and_then(|o| o.ensure_success().map(|_| o)) + }().context("Failed to run the `env` command")?; + + for line in BufReader::new(Cursor::new(output.stdout)).lines() { + if line.unwrap_or_default().starts_with("VIRT_TIME_CONF=") { + return Ok(()); + } + } + + bail!("Applications can escape virtualization, creating hard to diagnose problems. \ + Run `fastfreeze install` to setup virtualization. \ + A kuberbetes volume may be needed to interpose the system ELF loader"); +} + +pub fn enable_system_wide_virtualization() -> Result<()> { + let env = env_for_virtualization(); + inject_env_system_wide(&env)?; + ensure_system_wide_virtualization_is_enabled()?; + Ok(()) +} + +/// This function is called early on to disable the system wide time +/// virtualization on our process. (we need the real time) +/// It can call execve(). Note that logging is not setup yet. +pub fn disable_local_time_virtualization() -> Result<()> { + if env::var_os("VIRT_TIME_CONF").is_some() { + // We are currently executing with time virtualization enabled. This is + // a problem when we try to get the real machine clock. To avoid this, + // we re-exec ourselves with LD_ENV_DISABLE set, which prevents the + // libvirtcpuid's loader from injecting env variables into our process. + env::set_var("LD_ENV_DISABLE", "1"); + env::remove_var("VIRT_TIME_CONF"); + env::remove_var("LD_PRELOAD"); // libvirttime.so is in there, and needs to go. 
+ + Command::new(env::args_os()) + .exec() + .context("Failed to execve() ourselves to disable time virtualization") + } else { + // We are not virtualized, but our children should be. + env::remove_var("LD_ENV_DISABLE"); + Ok(()) + } +} diff --git a/src/virt/time.rs b/src/virt/time.rs new file mode 100644 index 0000000..7b5261e --- /dev/null +++ b/src/virt/time.rs @@ -0,0 +1,407 @@ +// Copyright 2020 Two Sigma Investments, LP. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, Context}; +use std::{ + mem::{size_of, MaybeUninit}, + os::unix::io::AsRawFd, + os::unix::fs::PermissionsExt, + path::Path, + io::prelude::*, + io::SeekFrom, + slice, + fs, +}; +use nix::unistd::{lseek, Whence}; +#[cfg(not(test))] +use nix::{Error, errno::Errno}; +use libc::timespec; +use crate::{ + consts::*, + util::pwrite_all, +}; + +// This file contains logic to configure libvirttime. In a nutshell, libvirttime +// is used to virtualize the CLOCK_MONOTONIC values for the application. The +// library is configured via an external file that contains all the clock time +// offsets to be applied. +// +// The config file has the following format: +// static struct virt_time_config { +// struct timespec ts_offset; +// struct timespec per_thread_ts[PID_MAX]; +// }; +// +// There is a global time offset, and a per thread time offset. All must be +// adjusted when migrating an app from a machine to another. 
+//
+// More details can be found at https://github.com/twosigma/libvirttime
+
+
+// `PID_MAX` is defined in the kernel in include/linux/threads.h
+// We don't read /proc/sys/kernel/pid_max because it can vary
+// from machine to machine.
+const PID_MAX: u32 = 4_194_304;
+const NSEC_IN_SEC: Nanos = 1_000_000_000;
+
+/// File position of virt_time_config.thread_confs[0]
+const PID_0_FPOS: i64 = size_of::<timespec>() as i64;
+/// sizeof(struct per_thread_conf)
+const PROCESS_AREA_SIZE: usize = size_of::<timespec>();
+
+/// We represent a `timespec` with the nanosecs as an i128. It's easier to do
+/// computation with. `Duration` is not suitable for us as it lacks support
+/// for underflowing subtractions.
+pub type Nanos = i128;
+
+#[cfg(not(test))]
+fn clock_gettime_monotonic() -> Nanos {
+    let result = unsafe {
+        let mut ts = MaybeUninit::<timespec>::uninit();
+        if libc::clock_gettime(libc::CLOCK_MONOTONIC, ts.as_mut_ptr()) == 0 {
+            Ok(Nanos::from_timespec(ts.assume_init()))
+        } else {
+            Err(Error::Sys(Errno::last()))
+        }
+    };
+
+    result.expect("clock_gettime() failed")
+}
+
+#[cfg(test)]
+fn clock_gettime_monotonic() -> Nanos {
+    test::clock_gettime_mock()
+}
+
+trait NanosExt {
+    fn to_timespec(self) -> timespec;
+    fn from_timespec(ts: timespec) -> Self;
+}
+
+impl NanosExt for Nanos {
+    fn to_timespec(self) -> timespec {
+        let mut ts = timespec {
+            tv_sec: (self / NSEC_IN_SEC) as i64,
+            tv_nsec: (self % NSEC_IN_SEC) as i64,
+        };
+
+        // nsec should always be positive as the libvirttime code assumes nsec is between 0 and
+        // NSEC_IN_SEC-1. 
See https://github.com/twosigma/libvirttime/blob/master/src/util.h#L48 + if ts.tv_nsec < 0 { + ts.tv_sec -= 1; + ts.tv_nsec += NSEC_IN_SEC as i64; + } + + ts + } + + fn from_timespec(ts: timespec) -> Self { + ts.tv_sec as i128 * NSEC_IN_SEC + ts.tv_nsec as i128 + } +} + +fn read_timespec(reader: &mut R) -> Result { + unsafe { + let mut ts = MaybeUninit::::uninit(); + let mut buf = slice::from_raw_parts_mut( + ts.as_mut_ptr() as *mut u8, + size_of::() + ); + reader.read_exact(&mut buf) + .context("Failed to read from the time config file")?; + + Ok(Nanos::from_timespec(ts.assume_init())) + } +} + +fn write_timespec_at(file: &fs::File, nanos: Nanos, fpos: i64) -> Result<()> { + unsafe { + let ts = nanos.to_timespec(); + let buf = slice::from_raw_parts( + &ts as *const timespec as *const u8, + size_of::() + ); + pwrite_all(file, &buf, fpos) + .context("Failed to write to the time config file")?; + Ok(()) + } +} + +pub struct ConfigPath<'a> { + path: &'a Path, +} + +impl<'a> ConfigPath<'a> { + pub fn new>(path: &'a S) -> Self { + // We don't open the config file at this point. Depending on the + // operation, we might create, open_read, or open_write the file. + Self { path: path.as_ref() } + } + + /// Returns the current configured time offset + fn read_configured_offset(&self) -> Result { + let mut config_file = fs::File::open(&self.path) + .with_context(|| format!("Failed to open {}. \ + It is normally created when running the application for the \ + first time via the 'run' command", self.path.display()))?; + let ts_offset = read_timespec(&mut config_file)?; + Ok(ts_offset) + } + + /// Returns the offset to write in the time config file so that if the + /// application were to call `clock_gettime(CLOCK_MONOTONIC)` immediately, it + /// would get `app_clock`. 
+ fn config_time_offset(app_clock: Nanos) -> Nanos { + let machine_clock = clock_gettime_monotonic(); + machine_clock - app_clock + } + + /// `read_current_app_clock()` returns the same result as what the application, + /// virtualized with libvirttime, would get if it were to call + /// `clock_gettime(CLOCK_MONOTONIC)`. + pub fn read_current_app_clock(&self) -> Result { + let config_offset = self.read_configured_offset()?; + let machine_clock = clock_gettime_monotonic(); + let app_clock = machine_clock - config_offset; + Ok(app_clock) + } + + pub fn write_intial(&self) -> Result<()> { + || -> Result<_> { + // We arbitrarily start the app clock at 0. + let app_clock = 0; + + // The time config file must be writable by all users as we are + // applying a system-wide virtualization configuration. + let mut config_file = fs::File::create(&self.path) + .with_context(|| format!("Failed to create {}", self.path.display()))?; + + // We `set_permissions()` after `create()` because our umask may get in the way of + // the flags we specify in create(). We don't want to change our umask as it is a + // process-wide setting, and not thread local. So it would be unsafe to restore the + // previous umask. + fs::set_permissions(self.path, fs::Permissions::from_mode(0o777)) + .with_context(|| format!("Failed to chmod {}", self.path.display()))?; + + // The config_file has the layout of the `struct virt_time_config` + write_timespec_at(&config_file, Self::config_time_offset(app_clock), 0)?; + + // Write a 0 at the end of the file to make it the right size + // without using much space. We add a page to avoid making the hole + // ends too early. 
+ config_file.seek(SeekFrom::Current( + Self::pid_to_fpos(PID_MAX+1) + PAGE_SIZE as i64))?; + config_file.write_all(&[0])?; + + Ok(()) + }().with_context(|| format!("Failed to write to {}", self.path.display())) + } + + /// PID to file position in the config file + fn pid_to_fpos(pid: u32) -> i64 { + PID_0_FPOS + (pid as i64)*(PROCESS_AREA_SIZE as i64) + } + + /// file position to PID (rounded down) + fn fpos_to_pid(fpos: i64) -> u32 { + ((fpos - PID_0_FPOS)/(PROCESS_AREA_SIZE as i64)) as u32 + } + + /// Rewrite time offsets with the desired `app_clock` + pub fn adjust_timespecs(&self, app_clock: Nanos) -> Result<()> { + || -> Result<_> { + let mut config_file = fs::OpenOptions::new() + .read(true) + .write(true) + .open(&self.path)?; + + let new_time_offset = Self::config_time_offset(app_clock); + let old_time_offset = read_timespec(&mut config_file)?; + let old_to_new_time_offset = new_time_offset - old_time_offset; + + // Adjust the global timespec offset + write_timespec_at(&config_file, new_time_offset, 0)?; + + let mut pid: u32 = 1; // pid=0 does not exist + + // Adjust the threads timespec offsets + loop { + // With SEEK_DATA, we'll be skipping pages that have no pids. + // It seeks to the earlist file position that has data. Typically, + // we'll be hitting a page boundary. + let fpos = lseek(config_file.as_raw_fd(), Self::pid_to_fpos(pid), + Whence::SeekData)?; + + // Note: performance could be better as we are doing two + // syscalls (read+write) per pid. We could improve this to only + // do two syscalls per page. But that's for another time. + + // Compute the pid corresponding to the file position + pid = Self::fpos_to_pid(fpos); + if pid > PID_MAX { + break; + } + + // `fpos_to_pid()` rounds down. If the returned `fpos` does not + // correspond to the file position of the `pid`, the file + // position is at a data page boundary. We can skip that pid as + // we are sure that pid is unused. + // + // |pid ......|pid+1 ......| + // ... hole >|< data ... 
+ // ^ + // \ file_offset + // + if fpos == Self::pid_to_fpos(pid) { + // Read the current timespec, adjust it, and write it back + let mut offset = read_timespec(&mut config_file)?; + offset += old_to_new_time_offset; + write_timespec_at(&mut config_file, offset, fpos)?; + } + + pid += 1; + } + + Ok(()) + }().with_context(|| format!( + "Failed to adjust timespecs in {}", self.path.display())) + } +} + +impl<'a> Default for ConfigPath<'a> { + fn default() -> Self { + Self::new(&*VIRT_TIME_CONF_PATH) + } +} + +#[cfg(test)] +mod test { + use super::*; + use std::sync::Mutex; + + lazy_static! { + static ref MACHINE_CLOCK: Mutex = Mutex::new(-1); + } + + pub fn clock_gettime_mock() -> Nanos { + *MACHINE_CLOCK.lock().unwrap() + } + + #[test] + fn test() -> Result<()> { + let config_path = Path::new("/tmp/ff-test-time-conf"); + let _ = std::fs::remove_file(&config_path); + let config = ConfigPath::new(&config_path); + + fn read_pid_ts(config_file: &mut fs::File, pid: u32) -> Result { + config_file.seek(SeekFrom::Start(ConfigPath::pid_to_fpos(pid) as u64))?; + read_timespec(config_file) + } + + assert!(config.read_configured_offset().is_err()); + + // Clock offset is set to 100, app clock is 0. 
+ let mut machine_clock = NSEC_IN_SEC + 100; + let mut app_clock = 0; + *MACHINE_CLOCK.lock().unwrap() = machine_clock; + + config.write_intial()?; + let mut config_file = fs::OpenOptions::new() + .read(true) + .write(true) + .open(&config.path)?; + + assert_eq!(config.read_configured_offset()?, machine_clock); + assert_eq!(config.read_current_app_clock()?, 0); + + // Clock advances by 1000, so app_clock should be 1000 + machine_clock += 1000; + app_clock += 1000; + *MACHINE_CLOCK.lock().unwrap() = machine_clock; + assert_eq!(config.read_current_app_clock()?, app_clock); + + write_timespec_at(&config_file, machine_clock + 100, ConfigPath::pid_to_fpos(1))?; + write_timespec_at(&config_file, machine_clock + 101, ConfigPath::pid_to_fpos(10000))?; + write_timespec_at(&config_file, machine_clock + 102, ConfigPath::pid_to_fpos(20000))?; + write_timespec_at(&config_file, machine_clock + 103, ConfigPath::pid_to_fpos(20001))?; + write_timespec_at(&config_file, machine_clock + 104, ConfigPath::pid_to_fpos(PID_MAX))?; + + assert_eq!(machine_clock + 100, read_pid_ts(&mut config_file, 1)?); + assert_eq!(machine_clock + 101, read_pid_ts(&mut config_file, 10000)?); + assert_eq!(machine_clock + 102, read_pid_ts(&mut config_file, 20000)?); + assert_eq!(machine_clock + 103, read_pid_ts(&mut config_file, 20001)?); + assert_eq!(machine_clock + 104, read_pid_ts(&mut config_file, PID_MAX)?); + + // Now let's pretend we checkpoint and move to another machine. 
+ // app clock is still 1000, but we land on a machine whose clock with a clock in the future + machine_clock = 10*NSEC_IN_SEC + 100; + *MACHINE_CLOCK.lock().unwrap() = machine_clock; + + config.adjust_timespecs(app_clock)?; // the app clock we want + assert_eq!(config.read_current_app_clock()?, app_clock); + + assert_eq!(machine_clock + 100, read_pid_ts(&mut config_file, 1)?); + assert_eq!(machine_clock + 101, read_pid_ts(&mut config_file, 10000)?); + assert_eq!(machine_clock + 102, read_pid_ts(&mut config_file, 20000)?); + assert_eq!(machine_clock + 103, read_pid_ts(&mut config_file, 20001)?); + assert_eq!(machine_clock + 104, read_pid_ts(&mut config_file, PID_MAX)?); + assert_eq!(0, read_pid_ts(&mut config_file, 100000)?); // should be not touched + + // What if we go on a machine which time is earlier than ours. This + // will test overflowing substractions. + machine_clock = 100; + *MACHINE_CLOCK.lock().unwrap() = machine_clock; + + config.adjust_timespecs(app_clock)?; // the app clock we want + assert_eq!(config.read_current_app_clock()?, app_clock); + + assert_eq!(machine_clock + 100, read_pid_ts(&mut config_file, 1)?); + assert_eq!(machine_clock + 101, read_pid_ts(&mut config_file, 10000)?); + assert_eq!(machine_clock + 102, read_pid_ts(&mut config_file, 20000)?); + assert_eq!(machine_clock + 103, read_pid_ts(&mut config_file, 20001)?); + assert_eq!(machine_clock + 104, read_pid_ts(&mut config_file, PID_MAX)?); + assert_eq!(0, read_pid_ts(&mut config_file, 100000)?); + + // Time passes + machine_clock += 500; + app_clock += 500; + *MACHINE_CLOCK.lock().unwrap() = machine_clock; + + // App do some calls that use the clock + write_timespec_at(&config_file, machine_clock + 100, ConfigPath::pid_to_fpos(1))?; + write_timespec_at(&config_file, machine_clock + 101, ConfigPath::pid_to_fpos(10000))?; + write_timespec_at(&config_file, machine_clock + 102, ConfigPath::pid_to_fpos(20000))?; + write_timespec_at(&config_file, machine_clock + 103, 
ConfigPath::pid_to_fpos(20001))?; + write_timespec_at(&config_file, machine_clock + 104, ConfigPath::pid_to_fpos(PID_MAX))?; + + // We checkpoint + assert_eq!(config.read_current_app_clock()?, app_clock); + + // And restore an another machine + machine_clock = 77; + *MACHINE_CLOCK.lock().unwrap() = machine_clock; + + config.adjust_timespecs(app_clock)?; // the app clock we want + assert_eq!(config.read_current_app_clock()?, app_clock); + + assert_eq!(machine_clock + 100, read_pid_ts(&mut config_file, 1)?); + assert_eq!(machine_clock + 101, read_pid_ts(&mut config_file, 10000)?); + assert_eq!(machine_clock + 102, read_pid_ts(&mut config_file, 20000)?); + assert_eq!(machine_clock + 103, read_pid_ts(&mut config_file, 20001)?); + assert_eq!(machine_clock + 104, read_pid_ts(&mut config_file, PID_MAX)?); + assert_eq!(0, read_pid_ts(&mut config_file, 100000)?); + + Ok(()) + } +}