Add performance measurement scripts.

ynikitenko · Nov 14, 2021 · bb1770a · bb1770a
1 parent f0b9c5b
commit bb1770a
Show file tree

Hide file tree

Showing 8 changed files with 266 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -25,6 +25,8 @@
 
 # output and executables
 docs/examples/tutorial/*/output
+# add new files from there manually
+docs/examples/performance/*
 
 # Sphinx
 docs/build

diff --git a/docs/examples/performance/compute_perf.sh b/docs/examples/performance/compute_perf.sh
@@ -0,0 +1,16 @@
+{
+    # Header line: short hash of the current commit and the interpreter version.
+    # "2>&1" is needed inside the substitution because Python 2 (and Python 3
+    # before 3.4) prints the version to stderr, which would otherwise
+    # bypass the command substitution and never reach performance.txt.
+    echo "# Lena commit: $(git log --pretty=format:'%h' -n 1), $(python -V 2>&1)"
+    # It's meaningful to not produce plots (because we don't measure pdflatex performance).
+    # But it's safe to "produce" them in code if they exist: they won't be reproduced!
+    # Disk cache seems not relevant, because the times don't change much between runs
+    # (and time of pure read is very very small)
+    #
+    # Redirection order matters below: "2>&1" first points stderr (where
+    # time writes its report) to the group's stdout, i.e. performance.txt;
+    # ">/dev/null" then discards only the output printed by the script itself.
+    echo "# one histogram:"
+    /bin/time python lena_xs.py 2>&1 >/dev/null
+    echo "# Split, two histograms:"
+    /bin/time python lena_xy.py 2>&1 >/dev/null
+} >> performance.txt
+# feel free to add meaningful comments to performance.txt!
+#
+# anonymous function trick taken from https://stackoverflow.com/a/315113/952234
+# help with redirection from https://stackoverflow.com/a/549776/952234
diff --git a/docs/examples/performance/gen_data.sh b/docs/examples/performance/gen_data.sh
@@ -0,0 +1,2 @@
+# Generate the large data sample for the performance measurements.
+# Must be run from this directory (the path below is relative).
+# NOTE(review): presumably this creates ../tutorial/data/normal_3d_large.csv,
+# the file read by lena_xs.py and numpy_xs.py -- confirm in generate_normal.py.
+cd ../tutorial/generate_data && python generate_normal.py --large
+# about 3 seconds
diff --git a/docs/examples/performance/histogram_1d.tex b/docs/examples/performance/histogram_1d.tex
@@ -0,0 +1,19 @@
+% Template of a standalone pgfplots picture with a 1D histogram.
+% It is passed to RenderLaTeX in lena_xy.py, so the template tags below
+% are substituted before LaTeX sees this file.
+\documentclass{standalone}
+\usepackage{tikz}
+\usepackage{pgfplots}
+\pgfplotsset{compat=1.18}
+
+\begin{document}
+% var is the template's "variable" if that is set (truthy), otherwise an empty string
+\BLOCK{ set var = variable if variable else '' }
+\begin{tikzpicture}
+\begin{axis}[
+    % the x label is emitted only when var has a latex_name;
+    % the unit in square brackets is added only when var has a unit
+    \BLOCK{ if var.latex_name }
+    xlabel = {$\VAR{ var.latex_name }$ \BLOCK{ if var.unit }[$\mathrm{\VAR{ var.unit }}$]\BLOCK{ endif }},
+    \BLOCK{ endif }
+]
+% piecewise-constant (staircase) line through the table points
+\addplot [
+    const plot,
+% the data file is read as comma-separated values without a header row
+] table [col sep=comma, header=false] {\VAR{ output.filepath }};
+\end{axis}
+\end{tikzpicture}
+\end{document}
diff --git a/docs/examples/performance/histogram_1d_simple.tex b/docs/examples/performance/histogram_1d_simple.tex
@@ -0,0 +1,14 @@
+% Minimal template of a standalone pgfplots picture with a 1D histogram
+% (no axis labels). It is passed to RenderLaTeX in lena_xs.py, so the
+% template tag below is substituted before LaTeX sees this file.
+\documentclass{standalone}
+\usepackage{tikz}
+\usepackage{pgfplots}
+\pgfplotsset{compat=1.18}
+
+\begin{document}
+\begin{tikzpicture}
+\begin{axis}[]
+% piecewise-constant (staircase) line through the table points
+\addplot [
+    const plot,
+% the data file is read as comma-separated values without a header row
+] table [col sep=comma, header=false] {\VAR{ output.filepath }};
+\end{axis}
+\end{tikzpicture}
+\end{document}
diff --git a/docs/examples/performance/lena_xs.py b/docs/examples/performance/lena_xs.py
@@ -0,0 +1,85 @@
+import os
+import sys
+
+import lena.math
+from lena.core import Sequence, Source
+from lena.flow import Print, Cache, Slice
+from lena.context import Context, UpdateContext
+from lena.output import Write, ToCSV, RenderLaTeX, LaTeXToPDF, PDFToPNG
+from lena.structures import Histogram
+
+
+# Location of the input sample. The path is relative to the current working
+# directory, so the script is expected to be started from its own directory.
+# data_file is also imported by lena_xy.py.
+data_path = os.path.join("..", "tutorial", "data")
+data_file = os.path.join(data_path, "normal_3d_large.csv")
+
+
+def get_filenames():
+    """Yield the paths of the input files (currently only *data_file*)."""
+    for path in (data_file,):
+        yield path
+
+
+class GetCoordinates():
+    """Turn a flow of CSV file names into a flow of coordinates."""
+
+    def __init__(self):
+        # no state is needed
+        pass
+
+    def run(self, flow):
+        """Yield a list of floats for every line of every file in *flow*.
+
+        Each line is split on commas and all its fields
+        are converted with float().
+        """
+        for path in flow:
+            with open(path) as csv_file:
+                for row in csv_file:
+                    yield list(map(float, row.split(',')))
+
+## note that there is no coupling between these functions
+
+
+def main():
+    """Run the one-histogram analysis and return its results.
+
+    The first coordinate of every data line fills a histogram
+    on the mesh lena.math.mesh((-10, 10), 100). The histogram is written
+    to CSV, rendered with "histogram_1d_simple.tex" and passed to
+    LaTeXToPDF and PDFToPNG. The returned value is iterated
+    by the caller.
+    """
+    source_files = get_filenames()
+    # one Write instance serves both the CSV data and the rendered LaTeX
+    write = Write("output")
+
+    elements = (
+        GetCoordinates(),
+        lambda coord: coord[0],  # x
+        Histogram(lena.math.mesh((-10, 10), 100)),
+        UpdateContext("output.filename", "x"),
+        # output
+        ToCSV(),
+        write,
+        RenderLaTeX("histogram_1d_simple.tex"),
+        write,
+        LaTeXToPDF(),
+        PDFToPNG(),
+    )
+    sequence = Sequence(*elements)
+
+    return sequence.run(source_files)
+    # /bin/time with already produced plot:
+    # 3.26user 0.00system 0:03.27elapsed 99%CPU (0avgtext+0avgdata 18000maxresident)k
+    # 0inputs+0outputs (0major+2735minor)pagefaults 0swaps
+    # PyPy:
+    # 1.11user 0.03system 0:01.21elapsed 93%CPU (0avgtext+0avgdata 92996maxresident)k
+    # 12168inputs+736outputs (59major+12457minor)pagefaults 0swaps
+
+
+def read_data(file_):
+    """Parse every line of *file_* into floats and throw the result away.
+
+    Used only to measure the cost of reading and converting the data.
+    """
+    with open(file_) as source:
+        for row in source:
+            # the values are discarded on purpose: only the timing matters
+            _ = [float(field) for field in row.split(',')]
+
+
+if __name__ == "__main__":
+    if "read_data" not in sys.argv:
+        # default mode: run the whole analysis and print its results
+        for item in main():
+            print(item)
+    else:
+        # "read_data" mode: measure only reading and parsing of the file
+        read_data(data_file)
+        sys.exit(0)
+        # without split and float():
+        # Python results (PyPy slower):
+        # 0.13user 0.01system 0:00.14elapsed 99%CPU (0avgtext+0avgdata 17840maxresident)k
+        # 0inputs+0outputs (0major+2709minor)pagefaults 0swaps
+        # with split and float():
+        # Python results (PyPy similar):
+        # 0.81user 0.02system 0:00.84elapsed 99%CPU (0avgtext+0avgdata 18084maxresident)k
+        # 0inputs+0outputs (0major+2713minor)pagefaults 0swaps
diff --git a/docs/examples/performance/lena_xy.py b/docs/examples/performance/lena_xy.py
@@ -0,0 +1,56 @@
+import os
+import sys
+
+import lena.math
+from lena.core import Sequence, Source, Split
+from lena.variables import Variable, Combine
+from lena.flow import Print, Cache, Slice
+from lena.context import Context, UpdateContext
+from lena.output import (
+    Write, ToCSV, RenderLaTeX, LaTeXToPDF, PDFToPNG, MakeFilename
+)
+from lena.structures import Histogram
+
+from lena_xs import get_filenames, GetCoordinates, data_file
+
+
+def main():
+    """Run the two-histogram analysis and return its results.
+
+    The flow of coordinates is split into two branches: variable "x"
+    (first coordinate) and variable "y" (second coordinate, with
+    a LaTeX name and a unit). Each branch fills its own histogram
+    on the mesh lena.math.mesh((-10, 10), 100). The results are named
+    after the variable, written to CSV, rendered with "histogram_1d.tex"
+    and passed to LaTeXToPDF and PDFToPNG.
+    """
+    # one Write instance serves both the CSV data and the rendered LaTeX
+    write = Write("output")
+    reader = GetCoordinates()
+
+    x_branch = (
+        Variable("x", lambda coord: coord[0]),
+        Histogram(lena.math.mesh((-10, 10), 100)),
+    )
+    y_branch = (
+        Variable("y", lambda coord: coord[1],
+                 latex_name="y", unit="mm"),
+        Histogram(lena.math.mesh((-10, 10), 100)),
+    )
+
+    elements = [
+        get_filenames,
+        reader,
+        Split([x_branch, y_branch]),
+        MakeFilename("{{variable.name}}"),
+        ToCSV(),
+        write,
+        RenderLaTeX("histogram_1d.tex"),
+        write,
+        LaTeXToPDF(),
+        PDFToPNG(),
+    ]
+    source = Source(*elements)
+
+    return source()
+
+
+if __name__ == "__main__":
+    # print every result of the analysis
+    results = main()
+    for item in results:
+        print(item)
+
+    # Python:
+    # 17.09user 0.00system 0:17.10elapsed 99%CPU (0avgtext+0avgdata 18540maxresident)k
+    # 0inputs+0outputs (0major+2851minor)pagefaults 0swaps
+    # PyPy:
+    # 6.34user 0.06system 0:06.45elapsed 99%CPU (0avgtext+0avgdata 93756maxresident)k
+    # 0inputs+0outputs (0major+12357minor)pagefaults 0swaps
diff --git a/docs/examples/performance/numpy_xs.py b/docs/examples/performance/numpy_xs.py
@@ -0,0 +1,72 @@
+import os
+import numpy
+from matplotlib import pyplot as plt
+
+
+def plot_data(use_np_load=False):
+    """Read the first column of the large sample and histogram it with pyplot.
+
+    The histogram has 100 bins on the range [-10, 10].
+    The figure is only built, not saved (savefig is commented out).
+
+    *use_np_load* selects the reader to be measured:
+    if true, the column is read with numpy.loadtxt;
+    otherwise (the default) the file is parsed line by line
+    in pure Python. The input path is relative
+    to the current working directory.
+    """
+    data_path = os.path.join("..", "tutorial", "data")
+    data_file = os.path.join(data_path, "normal_3d_large.csv")
+
+    if use_np_load:
+        # The recommended way of plotting data from a file is ...
+        # numpy.loadtxt or pandas.read_csv to read the data.
+        # These are more powerful and faster.
+        # https://matplotlib.org/3.2.2/gallery/misc/plotfile_demo_sgskip.html
+        data = numpy.loadtxt(data_file, delimiter=',', usecols=0)
+        # /bin/time, without savefig
+        # 2.94user 0.30system 0:03.00elapsed 107%CPU (0avgtext+0avgdata 117196maxresident)k
+        # 0inputs+0outputs (0major+19994minor)pagefaults 0swaps
+    else:
+        filenames = [data_file]
+        data = []
+        # the explicit loop is kept as is: it is the code being timed
+        for filename in filenames:
+            with open(filename) as fil:
+                for line in fil:
+                    data.append(float(line.split(',')[0]))
+        # /bin/time, without savefig
+        # 3.11user 0.33system 0:03.17elapsed 108%CPU (0avgtext+0avgdata 437396maxresident)k
+        # 0inputs+0outputs (0major+97214minor)pagefaults 0swaps
+
+    plt.hist(data, bins=100, range=[-10, 10])
+    # plt.savefig(os.path.join("output", "pyplot_xs.png"))
+
+
+if __name__ == "__main__":
+    # run the measurement with the pure Python reader (the default)
+    plot_data()
+
+
+## Educational notes. Examples were not tested and are not used.
+
+# "Top-down"
+
+def td_read_files(filenames, data):
+    """Append to *data* the value read from each line of each file.
+
+    Lines are converted with td_read_line. *data* is changed in place.
+    """
+    for path in filenames:
+        with open(path) as source:
+            for value in (td_read_line(row) for row in source):
+                data.append(value)
+
+
+def td_read_line(line):
+    """Return the x column (the text before the first comma) as a float."""
+    x_field = line.partition(',')[0]
+    return float(x_field)
+
+
+# "Bottom-up"
+
+def get_filenames():
+    """Yield file names one by one (the list here is an empty placeholder)."""
+    names = []
+    for name in names:
+        yield name
+
+def bu_read_lines():
+    """Return a list with the x column of all files from get_filenames()."""
+    # or get data as an argument
+    values = []
+    for path in get_filenames():
+        with open(path) as source:
+            values.extend(float(row.split(',')[0]) for row in source)
+    return values
+
+
+## note the coupling between these functions