-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsearchindex.js
1 lines (1 loc) · 54.4 KB
/
searchindex.js
1
Search.setIndex({"docnames": ["chapter_auto_program_optimization/index", "chapter_end_to_end/e2e-assignment", "chapter_end_to_end/index", "chapter_gpu_acceleration/index", "chapter_gpu_acceleration/part1", "chapter_gpu_acceleration/part2", "chapter_graph_optimization/index", "chapter_integration/index", "chapter_introduction/index", "chapter_tensor_program/case_study", "chapter_tensor_program/index", "chapter_tensor_program/tensor_program", "chapter_tensor_program/tensorir_exercises", "index"], "filenames": ["chapter_auto_program_optimization/index.rst", "chapter_end_to_end/e2e-assignment.rst", "chapter_end_to_end/index.rst", "chapter_gpu_acceleration/index.rst", "chapter_gpu_acceleration/part1.rst", "chapter_gpu_acceleration/part2.rst", "chapter_graph_optimization/index.rst", "chapter_integration/index.rst", "chapter_introduction/index.rst", "chapter_tensor_program/case_study.rst", "chapter_tensor_program/index.rst", "chapter_tensor_program/tensor_program.rst", "chapter_tensor_program/tensorir_exercises.rst", "index.rst"], "titles": ["<span class=\"section-number\">4. </span>Automatic Program Optimization", "MLC Assignment 1: End-to-End Model Execution", "<span class=\"section-number\">3. </span>End to End Model Execution", "<span class=\"section-number\">6. </span>GPU and Hardware Acceleration", "<span class=\"section-number\">6.1. </span>Part 1", "<span class=\"section-number\">6.2. </span>Part 2", "<span class=\"section-number\">7. </span>Computational Graph Optimization", "<span class=\"section-number\">5. </span>Integration with Machine Learning Frameworks", "<span class=\"section-number\">1. </span>Introduction", "<span class=\"section-number\">2.4. </span>TensorIR: Tensor Program Abstraction Case Study", "<span class=\"section-number\">2. </span>Tensor Program Abstraction", "<span class=\"section-number\">2.1. </span>Primitive Tensor Function", "<span class=\"section-number\">2.5. </span>Exercises for TensorIR", "Machine Learning Compiler"], "terms": {"In": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "past": [0, 4, 5, 6, 7, 9], "chapter": [0, 2, 4, 5, 6, 7, 9, 10], "we": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "learn": [0, 1, 2, 4, 5, 8, 10, 11, 12], "about": [0, 1, 2, 5, 6, 7, 8, 9, 11], "how": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10], "build": [0, 1, 5, 8, 10, 13], "connect": [0, 1, 2, 7, 8], "them": [0, 1, 2, 4, 6, 7, 8, 9], "form": [0, 2, 5, 7, 8, 9, 11, 13], "There": [0, 2, 7, 8, 9], "ar": [0, 2, 4, 5, 6, 7, 8, 9, 11, 12], "three": [0, 4, 9, 12], "primari": [0, 9], "type": [0, 2, 6, 7, 9, 11], "abstract": [0, 1, 2, 6, 7, 12, 13], "have": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12], "us": [0, 2, 4, 5, 6, 8, 11, 12], "so": [0, 2, 4, 5, 7, 9, 11], "far": [0, 2, 4, 5, 7, 9], "A": [0, 1, 2, 4, 6, 7, 8, 9, 11, 12], "comput": [0, 1, 4, 7, 8, 10, 11, 12, 13], "graph": [0, 1, 7, 8, 13], "view": [0, 4, 6, 7, 8, 9, 11], "drive": [0, 8, 11], "high": [0, 2, 6, 8, 9, 12, 13], "level": [0, 1, 2, 4, 5, 6, 8, 9, 11, 12, 13], "librari": [0, 6, 7, 8, 9, 11, 13], "call": [0, 1, 2, 4, 5, 7, 8, 9, 11, 13], "via": [0, 1, 2, 7], "environ": [0, 4, 5, 6, 8, 9, 11, 13], "registr": [0, 2], "all": [0, 1, 2, 4, 5, 7, 8, 9, 11], "element": [0, 2, 4, 7, 9, 11, 13], "encapsul": [0, 9], "an": [0, 1, 4, 5, 6, 8, 9, 11, 12, 13], "irmodul": [0, 1, 4, 5, 6, 9, 12, 13], "most": [0, 2, 5, 6, 7, 8, 9, 11], "mlc": [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13], "process": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12], "can": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12], "among": [0, 2, 6, 7, 9, 11], "mani": [0, 2, 4, 5, 7, 8, 9, 11], "differ": [0, 2, 4, 5, 7, 8, 9, 11, 12, 13], "wai": [0, 1, 2, 4, 5, 7, 8, 10], "same": [0, 2, 4, 6, 8, 9, 11, 12], "thi": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "autom": [0, 8, 9], "some": [0, 1, 2, 4, 5, 7, 8, 9, 11, 12], "To": [0, 1, 2, 4, 5, 6, 7, 9], "begin": [0, 1, 2, 4, 5, 6, 7, 8, 9], "import": [0, 1, 2, 4, 5, 6, 8, 9, 11, 12, 13], "necessari": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11], "depend": [0, 2, 4, 5, 6, 7, 8, 11, 13], "creat": [0, 1, 2, 4, 10], "helper": [0, 2, 7, 9], "numpi": [0, 1, 2, 4, 5, 6, 7, 9, 11, 12], "np": [0, 1, 2, 4, 5, 6, 7, 9, 12], "tvm": [0, 1, 2, 4, 5, 6, 7, 9, 12], "from": [0, 2, 4, 5, 6, 8, 9, 11, 12, 13], "relax": [0, 1, 2, 4, 5, 6, 7], "ir": [0, 2, 4, 5, 6, 7, 9, 12], "modul": [0, 2, 4, 5, 6, 7, 9, 12], "script": [0, 1, 2, 4, 5, 6, 7, 9, 12], "r": [0, 1, 2, 4, 5, 6, 7], "tir": [0, 1, 2, 4, 5, 6, 7, 9, 12], "t": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12], "ipython": [0, 1, 2, 9, 12], "def": [0, 1, 2, 4, 5, 6, 7, 9, 12], "code2html": 0, "code": [0, 1, 6, 7, 8, 11, 12, 13], "pygment": 0, "turn": [0, 1, 2, 6, 9], "string": [0, 2], "highlight": [0, 9], "html": 0, "formatt": 0, "htmlformatt": 0, "lexer": 0, "python3lex": 0, "return": [0, 1, 2, 4, 5, 6, 7, 9], "style": [0, 2], "": [0, 1, 2, 4, 6, 7, 8, 9, 12], "n": [0, 1, 2, 7, 12], "get_style_def": 0, "let": [0, 1, 2, 4, 5, 6, 7, 8, 9, 12], "u": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11], "review": [0, 2, 4, 7, 8, 9], "what": [0, 1, 2, 4, 9, 12, 13], "did": [0, 4, 7], "our": [0, 1, 2, 4, 5, 6, 7, 8, 9, 12, 13], "previou": [0, 2, 8], "singl": [0, 1, 5, 6, 7, 8, 9, 10, 11], "ir_modul": [0, 1, 2, 4, 5, 6, 7, 9, 12], "class": [0, 1, 2, 4, 5, 6, 7, 8, 9, 12], "mymodul": [0, 2, 6, 7, 9], "prim_func": [0, 1, 2, 4, 5, 6, 7, 9, 12], "main": [0, 1, 2, 4, 5, 6, 7, 9], "buffer": [0, 1, 4, 5, 6, 7, 11, 12], "128": [0, 1, 2, 4, 6, 7, 9, 11, 12], "float32": [0, 1, 2, 4, 5, 6, 7, 9, 12], "b": [0, 1, 2, 4, 5, 6, 7, 9, 12], "c": [0, 1, 2, 4, 5, 7, 9, 11, 12], "func_attr": [0, 1, 4, 5, 6, 7, 9, 12], "global_symbol": [0, 1, 4, 5, 9, 12], "noalia": [0, 1, 4, 5, 6, 7, 9, 12], "true": [0, 1, 2, 4, 5, 6, 7, 9, 12], "i": [0, 1, 2, 4, 5, 6, 7, 9, 11, 12, 13], "j": [0, 1, 2, 4, 5, 7, 9, 12], "k": [0, 2, 4, 5, 6, 7, 9, 12], "grid": [0, 1, 2, 4, 5, 6, 7, 9, 12], "block": [0, 1, 3, 6, 12], "vi": [0, 1, 2, 4, 5, 9, 11, 12], "vj": [0, 1, 2, 4, 5, 9, 12], "vk": [0, 2, 4, 5, 9], "axi": [0, 1, 2, 4, 5, 6, 7, 11, 12], "remap": [0, 1, 2, 4, 5, 6, 7, 9, 12], "ssr": [0, 2, 4, 5, 6, 7, 9], "init": [0, 2, 4, 5, 6, 7, 9, 12], "0": [0, 1, 2, 4, 5, 6, 7, 9, 11, 12], "first": [0, 1, 4, 5, 6, 7, 8, 9, 12, 13], "defin": [0, 1, 2, 5, 6, 7, 8, 9], "set": [0, 1, 2, 4, 5, 6, 8, 9, 11], "input": [0, 1, 2, 4, 5, 7, 8, 9, 11, 12], "output": [0, 1, 2, 5, 6, 7, 8, 9, 12], "evalu": [0, 4, 5], "dtype": [0, 1, 2, 4, 5, 6, 7, 9, 12], "a_np": [0, 4, 5, 9], "random": [0, 4, 5, 9, 12], "rand": [0, 5, 9, 12], "astyp": [0, 4, 5, 9, 12], "b_np": [0, 4, 5, 9], "c_mm": 0, "run": [0, 5, 7, 8, 10, 12, 13], "follow": [0, 1, 2, 4, 5, 6, 7, 9, 11, 12], "a_nd": [0, 4, 5, 9], "nd": [0, 1, 2, 4, 5, 6, 7, 9, 12], "arrai": [0, 1, 2, 4, 5, 6, 7, 8, 9, 12], "b_nd": [0, 4, 5, 9], "c_nd": [0, 4, 5, 9], "empti": [0, 1, 2, 5, 9, 12], "lib": [0, 5], "target": [0, 1, 2, 4, 5, 6, 7, 9, 12], "llvm": [0, 1, 2, 5, 6, 7, 9, 12], "f_timer_befor": [0, 9], "time_evalu": [0, 4, 9, 12], "cpu": [0, 1, 2, 4, 5, 6, 7, 9, 12], "print": [0, 1, 2, 4, 6, 7, 9, 12], "time": [0, 1, 2, 4, 5, 9, 12], "cost": [0, 8, 9], "3f": 0, "m": [0, 1, 2, 4, 7, 9], "mean": [0, 2, 4, 9], "1000": 0, "3": [0, 2, 4, 6, 7], "150": 0, "next": [0, 2, 6, 7, 9], "bit": [0, 2], "reorgan": [0, 5], "loop": [0, 1, 2, 4, 6, 8, 11, 12], "access": [0, 1, 4, 5, 9, 12], "pattern": [0, 13], "schedule_mm": 0, "sch": [0, 1, 4, 5, 9, 12], "jfactor": [0, 9], "4": [0, 4, 5, 6, 9, 11, 12], "block_c": [0, 4, 9], "get_block": [0, 1, 4, 5, 9, 12], "get_loop": [0, 1, 4, 5, 9, 12], "j_0": [0, 4, 5, 9], "j_1": [0, 4, 5, 9], "split": [0, 1, 4, 5, 7, 9, 11, 12], "factor": [0, 4, 5, 9, 12], "none": [0, 1, 4, 5, 6, 7, 9, 12], "reorder": [0, 2, 4, 5, 9, 11, 12], "decompose_reduct": [0, 4, 5, 9, 12], "displai": [0, 1, 2, 9, 12], "mod": [0, 1, 4, 5, 6, 9, 12], "quot": [0, 2, 4, 5, 6, 7, 9, 12], "bool": [0, 4, 5, 6, 7, 9], "root": [0, 1, 2, 4, 5, 6, 7, 9, 12], "32": [0, 1, 9, 12], "j_1_init": 0, "rang": [0, 2, 4, 5, 7, 9, 12], "c_init": [0, 4], "spatial": [0, 1, 4, 5, 7, 9, 11, 12], "read": [0, 1, 2, 4, 5, 6, 7, 9, 12], "write": [0, 1, 2, 4, 5, 6, 7, 9, 10], "c_updat": [0, 4], "reduc": [0, 1, 4, 5, 8, 9, 12, 13], "Then": [0, 1, 2, 4, 5, 7, 9, 12], "re": [0, 2, 4, 9], "organ": [0, 1, 12], "f_timer_aft": [0, 9], "699": 0, "besid": 0, "field": [0, 6, 7, 8, 13], "anoth": [0, 1, 2, 4, 6, 7, 8], "offer": [0, 8, 11, 13], "show": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12], "step": [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12], "involv": [0, 2, 6, 8, 9, 11, 13], "get": [0, 1, 2, 4, 6, 7, 8, 11, 12], "out": [0, 1, 2, 4, 6, 7, 8, 9, 11, 12], "apply_trac": 0, "b0": [0, 1, 2, 6, 7], "name": [0, 1, 2, 6, 7, 9], "func_nam": [0, 1, 9, 12], "l1": 0, "l2": 0, "l3": 0, "l4": 0, "l5": 0, "preserve_unit_it": 0, "disable_pred": 0, "fals": [0, 1, 2, 6, 7], "b6": 0, "The": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11], "abov": [0, 1, 2, 5, 6, 7, 8, 9, 11], "align": 0, "specifi": [0, 7, 8, 9], "One": [0, 2, 5, 6, 7, 8, 9, 11], "note": [0, 1, 2, 4, 5, 6, 7, 9, 12, 13], "plu": 0, "origin": [0, 1, 9, 12], "give": [0, 2, 7, 8, 12], "complet": [0, 12], "deriv": 0, "final": [0, 2, 4, 6, 7, 8, 9, 11, 12], "keep": [0, 2, 4, 7, 8, 9], "mind": [0, 2, 9], "throughout": 0, "inspect": [0, 4, 9], "up": [0, 1, 2, 5, 6, 7, 8, 9], "until": [0, 9], "now": [0, 1, 2, 4, 5, 6, 7, 8, 9, 12], "everi": [0, 1, 2, 7], "detail": [0, 2, 5, 8, 9], "want": [0, 2, 6, 7, 8, 9, 12], "make": [0, 1, 6, 8, 9, 12], "tensorir": [0, 1, 10, 13], "those": [0, 6, 7, 8, 9], "choic": [0, 8, 9, 13], "base": [0, 9, 11], "understand": [0, 5, 8, 9, 12], "underli": [0, 2], "cach": [0, 4, 6, 9], "hardwar": [0, 1, 4, 8, 9, 12, 13], "unit": [0, 5, 9, 10, 11], "howev": [0, 1, 4, 6, 7, 8], "practic": [0, 2, 4, 8, 9, 11], "mai": [0, 1, 2, 8, 12], "abl": [0, 2, 7, 9, 11], "decid": 0, "accur": 0, "instead": [0, 1, 2, 6, 7, 9], "do": [0, 1, 2, 5, 6, 7, 8, 9, 11, 12], "would": [0, 2, 6, 8, 9], "like": [0, 1, 2, 4, 6, 8, 9], "possibl": [0, 2, 5, 6, 8, 9, 10, 11], "while": [0, 1, 2, 5, 8, 9], "leav": [0, 6, 8], "natur": [0, 2, 8, 9], "achiev": [0, 6], "goal": [0, 2, 6, 7, 8, 9, 13], "add": [0, 1, 2, 4, 5, 6, 7, 8, 11], "doe": [0, 1, 2, 4, 7, 8], "stochastic_schedule_mm": 0, "j_factor": [0, 9], "sample_perfect_til": 0, "2": [0, 2, 3, 4, 6, 7, 10, 13], "compar": [0, 1, 9, 12], "side": [0, 2, 6, 7, 8, 9, 11], "find": [0, 1, 2, 5, 6, 7, 8, 9], "onli": [0, 1, 2, 4, 6, 7, 8, 9, 12], "case": [0, 2, 4, 5, 6, 7, 8, 10, 11, 12, 13], "pass": [0, 1, 2, 6, 7, 9, 12], "paramet": [0, 4, 6, 7, 13], "come": [0, 2, 5, 6, 8, 9, 11, 12, 13], "As": [0, 1, 2, 5, 7, 8, 9], "suggest": 0, "tri": 0, "draw": 0, "number": [0, 4, 8], "fill": 0, "It": [0, 1, 2, 7, 8, 9, 12], "sampl": 0, "thei": [0, 4, 5, 7, 8, 9], "perfectli": 0, "For": [0, 2, 4, 5, 6, 7, 8, 11], "exampl": [0, 1, 2, 3, 6, 8, 9, 11, 13], "when": [0, 1, 2, 7, 8, 9, 11], "size": [0, 4, 8, 9], "includ": [0, 1, 2, 8, 9, 11], "8": [0, 1, 4, 9, 12], "16": [0, 2, 4, 5, 9, 12], "64": [0, 4, 5], "try": [0, 2, 4, 6, 7, 9, 12], "see": [0, 1, 2, 8, 9, 12], "effect": [0, 2, 5, 8, 9, 11], "multipl": [0, 1, 2, 3, 5, 7, 8, 9], "observ": [0, 5, 8, 9], "outcom": 0, "you": [0, 1, 2, 9, 12], "might": [0, 2, 8, 9], "bound": [0, 9], "chang": [0, 2, 4, 9, 11], "each": [0, 1, 2, 4, 6, 7, 8, 9, 11], "happen": [0, 5, 8, 9], "here": [0, 1, 2, 4, 5, 6, 7, 8, 9, 12, 13], "randomli": [0, 9], "latest": [0, 5, 8], "one": [0, 1, 2, 4, 5, 6, 7, 8, 10, 12], "decis": [0, 2, 6], "made": 0, "v4": 0, "v5": 0, "max_innermost_factor": 0, "l6": 0, "l7": 0, "b8": 0, "look": [0, 4, 5, 6, 9], "pai": 0, "close": [0, 8, 9], "attent": 0, "part": [0, 2, 3, 7, 8, 9, 11, 13], "correspond": [0, 1, 2, 4, 5, 6, 7, 8, 9, 11], "valu": [0, 2, 4, 6, 7, 8, 9, 12], "sampling_perfect_til": 0, "pick": [0, 8, 12], "last": [0, 2, 9, 11], "altern": [0, 1, 5, 6, 7], "take": [0, 2, 4, 5, 7, 8, 9, 11, 12], "deeper": [0, 8], "simpl": [0, 1, 2, 4, 7, 12], "gener": [0, 1, 2, 4, 5, 6, 7, 8], "determinist": 0, "two": [0, 2, 4, 5, 6, 7, 8, 9, 12], "addit": [0, 5, 6, 8, 9, 11], "variabl": [0, 1, 7, 9, 11], "other": [0, 2, 6, 7, 8, 9], "oper": [0, 1, 2, 5, 6, 8, 9, 11, 12, 13], "cover": [0, 2, 5, 6, 8, 9], "action": [0, 2], "expr": [0, 2, 6, 7], "var": [0, 1, 6, 7], "real": [0, 2, 5, 6, 8, 9, 13], "integ": 0, "symbol": 0, "refer": [0, 2, 7, 8, 9, 11, 13], "being": [0, 1, 2, 6, 7, 8, 9], "api": [0, 1, 2, 6, 8, 9], "1": [0, 2, 3, 5, 6, 7, 9, 10, 13], "track": [0, 7], "If": [0, 2, 5, 8, 9, 12], "current": [0, 9, 12], "point": [0, 5, 9], "remain": [0, 5, 8, 9], "sinc": [0, 2, 8], "yet": [0, 4], "ani": [0, 7, 9, 12], "These": [0, 2, 5, 6, 7, 8, 9, 12, 13], "record": 0, "retak": 0, "updat": [0, 5, 8, 9], "version": [0, 4, 6, 9, 12], "after": [0, 1, 2, 4, 7, 8, 9, 12], "taken": 0, "further": [0, 1, 5, 6, 7, 9, 12], "state": 0, "blockrv": 0, "0x4116ed0": 0, "realiz": 0, "space": [0, 5, 7], "specif": [0, 1, 2, 5, 7, 8, 9, 12], "initi": [0, 2, 5, 9, 11, 12], "intuit": 0, "exactli": [0, 6, 12], "Of": [0, 2, 8, 9], "cours": [0, 2, 4, 8, 9, 13], "question": [0, 2, 8, 9], "ask": [0, 2, 9], "best": [0, 1, 7, 8, 12], "need": [0, 1, 2, 4, 6, 7, 8, 9, 11, 12], "algorithm": 0, "done": [0, 1, 8, 9], "straightforward": 0, "repetit": 0, "benchmark": [0, 9], "book": 0, "histori": 0, "random_search": 0, "num_trial": 0, "5": [0, 1, 2, 5, 9, 12], "best_result": 0, "best_sch": 0, "result": [0, 2, 4, 5, 6, 7, 8, 10, 12], "attempt": [0, 2, 4, 9], "d": [0, 6, 7], "165": 0, "176": 0, "395": 0, "713": 0, "515": 0, "goe": 0, "few": [0, 1, 2, 4, 9], "five": 0, "trial": 0, "smarter": 0, "also": [0, 1, 2, 5, 7, 8, 9, 12], "provid": [0, 1, 4, 5, 6, 8, 9, 11, 12], "util": [0, 1, 2, 5, 6, 7, 9], "remot": 0, "devic": [0, 2, 4, 8], "interest": [0, 2, 7, 8, 9, 11], "meta": [0, 2], "capabl": 0, "meta_schedul": [0, 4], "namespac": 0, "support": [0, 2, 4, 5, 6, 7, 8, 13], "behind": [0, 2, 9], "scene": 0, "parallel": [0, 1, 4, 9, 11], "across": [0, 4], "avoid": [0, 6, 9], "evolutionari": 0, "despit": [0, 6], "magic": 0, "kei": [0, 2, 6, 9, 11, 13], "idea": [0, 2, 9], "good": [0, 2, 7, 8, 11, 12], "tune_tir": [0, 4], "help": [0, 1, 2, 4, 5, 7, 8, 9, 11], "solut": [0, 5, 8, 9], "within": [0, 1, 2, 4], "databas": [0, 4], "num": 0, "core": [0, 4], "max_trials_glob": [0, 4], "num_trials_per_it": [0, 4], "space_gener": 0, "schedulefn": 0, "work_dir": [0, 4], "tune_tmp": [0, 4], "tir_integr": [0, 4], "compile_tir": [0, 4], "2024": 0, "10": [0, 1, 2, 4, 6, 7, 8], "14": 0, "31": 0, "55": 0, "info": 0, "log": [0, 1], "directori": 0, "04": 0, "localbuild": 0, "max_work": 0, "05": 0, "localrunn": 0, "06": 0, "task_schedul": 0, "cc": [0, 5], "159": 0, "task": [0, 4, 12], "flop": 0, "weight": [0, 1, 2, 4, 7, 8, 12], "speed": [0, 5], "gflop": [0, 4], "latenc": 0, "4194304": 0, "total": 0, "debug": [0, 9], "318": 0, "id": 0, "180": 0, "taskschedul": 0, "07": 0, "193": 0, "send": 0, "builder": [0, 6, 13], "08": 0, "195": 0, "runner": 0, "09": 0, "xgb": 0, "iter": [0, 2, 4, 5, 6, 7, 11], "tr": 0, "p": 0, "rmse": 0, "327088": 0, "peak": 0, "931043": 0, "340102": 0, "25": 0, "151834": 0, "000000": 0, "076069": 0, "50": 0, "146575": 0, "074000": 0, "75": 0, "146401": 0, "100": [0, 1, 2, 7], "146396": 0, "125": 0, "stop": 0, "98": 0, "14640": 0, "00000": 0, "07400": 0, "237": 0, "8929": 0, "248": 0, "2878": 0, "288": 0, "11": [0, 4], "12": [0, 12], "13": [0, 1], "260": 0, "ha": [0, 1, 2, 4, 7], "finish": [0, 4, 12], "y": [0, 2, 4, 6, 7, 9, 12], "found": 0, "dure": [0, 2, 4, 5, 9, 12], "tune": [0, 4], "gt": [0, 2, 6, 7], "enter_postproc": 0, "177": 0, "workload": [0, 11, 13], "craft": 0, "metaschedul": [0, 4], "its": [0, 1, 2, 6, 8, 9, 13], "own": [0, 2, 7, 8, 9], "built": [0, 7, 8, 9, 11], "work": [0, 4, 7, 8], "broad": 0, "approach": [0, 6, 8, 9, 13], "auto": 0, "system": [0, 4, 5, 8], "remov": [0, 2], "line": [0, 1, 6, 7, 9], "under": [0, 2, 7, 8, 9], "hood": [0, 2, 7, 8, 9], "analyz": [0, 6, 9], "data": [0, 1, 2, 4, 5, 6, 7, 9, 12], "propos": 0, "won": [0, 9], "go": [0, 2, 4, 5, 6, 9, 12], "just": [0, 1, 2, 9], "coupl": 0, "analysi": [0, 6, 9], "mechan": [0, 2], "enhanc": 0, "touch": [0, 2, 7, 9], "topic": [0, 4, 8, 13], "futur": [0, 2, 9], "15": 0, "19": [0, 2], "38": 0, "52": 0, "393679": 0, "279481": 0, "038281": 0, "300408": 0, "038269": 0, "300430": 0, "34": 0, "03827": 0, "30043": 0, "53": 0, "3255": 0, "129": 0, "7520": 0, "752": 0, "158": 0, "much": [0, 1, 9], "faster": [0, 9], "than": [0, 7, 9], "glimps": 0, "purpos": [0, 2, 9], "At": 0, "more": [0, 1, 2, 4, 6, 7, 8, 9, 11, 13], "tile": [0, 4], "vector": [0, 1, 4, 5, 8, 11], "intermedi": [0, 2, 4, 5, 7, 8, 9], "unrol": [0, 1, 4], "b1": [0, 2, 6, 7], "annot": [0, 2, 5, 6, 9, 11], "block_or_loop": 0, "ann_kei": 0, "tiling_structur": 0, "ann_val": 0, "ssrsr": 0, "v6": 0, "v7": 0, "v8": 0, "l9": 0, "l10": 0, "l11": 0, "l12": 0, "v13": 0, "v14": 0, "v15": 0, "v16": 0, "l17": 0, "l18": 0, "l19": 0, "l20": 0, "v21": 0, "v22": 0, "l23": 0, "l24": 0, "b25": 0, "cache_writ": [0, 4, 5], "write_buffer_index": 0, "storage_scop": [0, 4, 5], "global": [0, 2, 5], "reverse_compute_at": [0, 4, 5, 9, 12], "preserve_unit_loop": 0, "index": [0, 1, 4, 7, 9, 11, 12], "v26": 0, "sample_categor": 0, "candid": 0, "512": 0, "prob": 0, "unroll_explicit": 0, "b27": 0, "unannot": 0, "b28": 0, "b29": 0, "get_child_block": 0, "l30": 0, "l31": 0, "l32": 0, "l33": 0, "l34": 0, "l35": 0, "l36": 0, "l37": 0, "l38": 0, "l39": 0, "l40": 0, "fuse": [0, 1, 4, 7, 9, 13], "l41": 0, "l42": 0, "l43": 0, "l44": 0, "l45": 0, "l46": 0, "b47": 0, "l48": 0, "l49": 0, "l50": 0, "l51": 0, "l52": 0, "l53": 0, "l54": 0, "l55": 0, "l56": 0, "b57": 0, "c_global": [0, 5], "alloc_buff": [0, 1, 2, 4, 5, 6, 7, 9, 12], "i_0_j_0_fused_fus": 0, "i_1": [0, 4, 5, 12], "i_2_init": [0, 4], "j_2_init": [0, 4], "i_3_init": 0, "j_3_fused_init": 0, "block_attr": 0, "k_0": [0, 4, 5], "i_2": [0, 4], "j_2": [0, 4], "k_1": [0, 4, 5], "i_3": 0, "j_3_fuse": 0, "ax0": [0, 4, 5, 6, 7, 9, 12], "ax1_fus": 0, "v0": [0, 4, 5], "v1": [0, 4, 5], "allow": [0, 2, 4, 5, 6, 7, 9], "express": [0, 1], "improv": [0, 8, 12], "perspect": [0, 9], "modular": 0, "replac": [0, 2, 6, 9], "implement": [0, 1, 2, 5, 9, 11, 12], "new": [0, 1, 2, 6, 8, 9, 12], "reus": [0, 4, 7], "layer": [0, 1, 2, 8, 9], "mlp": [0, 2, 6, 7], "torch": [0, 1, 2, 6, 7, 11, 12], "torchvis": [0, 1, 2, 6, 7], "test_data": [0, 1, 2, 6, 7], "dataset": [0, 1, 6, 7], "fashionmnist": [0, 1, 2, 6, 13], "train": [0, 1, 2, 6, 7, 8, 13], "download": [0, 1, 6, 7], "totensor": [0, 1, 2, 6, 7], "test_load": [0, 1, 2, 6, 7], "dataload": [0, 1, 2, 6, 7], "batch_siz": [0, 1, 2, 6, 7], "shuffl": [0, 1, 2, 6, 7], "class_nam": [0, 1, 2, 6, 7], "shirt": [0, 1, 2, 6, 7], "top": [0, 1, 2, 5, 6, 7, 9], "trouser": [0, 1, 2, 6, 7], "pullov": [0, 1, 2, 6, 7], "dress": [0, 1, 2, 6, 7], "coat": [0, 1, 2, 6, 7], "sandal": [0, 1, 2, 6, 7], "sneaker": [0, 1, 2, 6, 7], "bag": [0, 1, 2, 6, 7], "ankl": [0, 1, 2, 6, 7], "boot": [0, 1, 2, 6, 7], "img": [0, 1, 2, 6, 7], "label": [0, 1, 2, 6, 7], "reshap": [0, 1, 2, 6, 7, 12], "28": [0, 1, 2, 6, 7], "http": [0, 1, 2, 4, 6, 7, 9], "fashion": [0, 1, 2, 7, 8], "mnist": [0, 1, 2, 7], "s3": [0, 2, 7], "websit": [0, 2, 7], "eu": [0, 2, 7], "central": [0, 2, 7], "amazonaw": [0, 2, 7], "com": [0, 1, 2, 6, 7], "imag": [0, 1, 2, 7, 8, 12], "idx3": [0, 2, 7], "ubyt": [0, 2, 7], "gz": [0, 2, 7], "raw": [0, 1, 2, 6, 7], "extract": [0, 2, 7], "idx1": [0, 2, 7], "t10k": [0, 2, 7], "matplotlib": [0, 1, 2, 6, 7], "pyplot": [0, 1, 2, 6, 7], "plt": [0, 1, 2, 6, 7], "figur": [0, 2, 6, 7, 9, 11], "imshow": [0, 1, 2, 6, 7], "colorbar": [0, 2, 6, 7], "pre": [0, 1, 6, 7, 8, 11], "pack": 0, "hide": [0, 1, 2, 6, 7], "wget": [0, 1, 2, 6, 7], "nc": [0, 1, 7], "github": [0, 1, 2, 6, 7], "ai": [0, 1, 2, 4, 6, 7, 8, 9, 13], "web": [0, 1, 2, 6, 7], "fasionmnist_mlp_param": [0, 2, 6, 7], "pkl": [0, 1, 2, 6, 7], "remind": [0, 9], "pickl": [0, 1, 2, 6, 7], "mlp_param": [0, 2, 6, 7], "load": [0, 1, 4, 6, 7], "open": [0, 1, 2, 4, 6, 7, 8, 9], "rb": [0, 1, 2, 6, 7], "data_nd": [0, 2, 6, 7], "784": [0, 1, 2, 6, 7], "nd_param": [0, 2], "v": [0, 2, 4, 6], "item": [0, 1, 2, 6], "mixtur": [0, 2, 11], "where": [0, 1, 2, 4, 5, 7, 8, 9, 11, 12], "compon": [0, 8], "linear0": [0, 2, 7], "mymodulemixtur": [0, 2], "x": [0, 1, 2, 4, 6, 7], "w": [0, 1, 2, 5, 6, 7, 12], "z": [0, 2], "ss": [0, 1, 2, 5, 6, 7, 9, 12], "w0": [0, 1, 2, 6, 7], "w1": [0, 2, 6, 7], "dataflow": [0, 1, 6, 7], "lv0": [0, 1, 2, 6], "call_dps_pack": [0, 7], "lv1": [0, 2, 6, 7], "env": [0, 1, 2], "relu": [0, 1, 2, 7, 8, 9, 11, 12, 13], "linear": [0, 1, 2, 7, 8, 9, 11, 13], "register_func": [0, 1, 2], "overrid": [0, 1, 2], "torch_linear": [0, 1, 2], "ndarrai": [0, 1, 2, 5, 9, 12], "x_torch": [0, 1, 2], "from_dlpack": [0, 1, 2], "w_torch": [0, 1, 2], "b_torch": [0, 1, 2], "out_torch": [0, 1, 2], "mm": [0, 1, 2, 9], "lnumpy_relu": [0, 2], "maximum": [0, 2, 9], "bind": [0, 4, 6, 13], "correct": [0, 1, 5, 9], "predict": [0, 1, 2, 6, 7, 8, 11], "mymodulewithparam": [0, 2], "bindparam": [0, 2], "ex": [0, 2, 6, 7], "vm": [0, 1, 2, 6, 7], "virtualmachin": [0, 1, 2, 6, 7], "nd_re": [0, 2, 6, 7], "pred_kind": [0, 2, 6, 7], "argmax": [0, 1, 2, 6, 7], "befor": [0, 2, 6, 9, 12], "becaus": [0, 2, 4, 9], "small": [0, 4, 12], "fluctuat": 0, "between": [0, 9, 11], "overal": [0, 2, 4, 5, 7, 8, 9], "magnitud": 0, "ftimer": 0, "g": [0, 2, 8, 9, 11, 12], "138537": 0, "readi": [0, 2, 9], "summar": [0, 6, 9], "diagram": 0, "mod_linear": 0, "from_expr": 0, "with_attr": [0, 6, 9], "33": 0, "200832": 0, "18": [0, 2], "323621": 0, "999727": 0, "230641": 0, "031305": 0, "314151": 0, "031299": 0, "314168": 0, "51": 0, "30": [0, 2], "03130": 0, "31417": 0, "5321": 0, "23": 0, "5383": 0, "global_var": [0, 6], "pointer": 0, "insid": [0, 1, 5, 7, 8], "update_func": [0, 6], "mymodulewithparams2": 0, "new_func": 0, "gv": [0, 1, 6, 7], "get_global_var": 0, "i_0": [0, 4, 5, 12], "serial": [0, 12], "pragma_auto_unroll_max_step": 0, "pragma_unroll_explicit": 0, "y_init": [0, 9, 12], "112": 0, "7": [0, 2], "y_updat": [0, 9, 12], "ax1": [0, 4, 5, 6, 7], "metadata": [0, 2, 6, 7], "constant": [0, 2, 6, 7], "out_sinfo": [0, 2, 6, 7], "omit": [0, 2, 6, 7], "show_meta": [0, 2, 6, 7], "method": [0, 2, 6, 7, 8], "been": [0, 1, 4, 7, 8], "again": [0, 1, 2, 7, 9], "amount": [0, 1, 6, 8], "reduct": [0, 1, 5, 9, 12], "mainli": [0, 2, 9], "thank": 0, "0718722": 0, "notic": [0, 2, 8, 9], "focus": [0, 5, 9], "start": [0, 1, 2, 4, 5, 6, 8, 9, 13], "focu": [0, 1, 2, 7, 9, 12], "possibli": [0, 2], "without": [0, 2, 6], "nail": 0, "down": [0, 6, 8], "importantli": [0, 4, 8, 9, 11], "flow": [0, 2, 4, 5, 6, 7, 8, 9], "matter": [0, 8], "inform": [0, 6, 11], "below": [0, 1, 2, 4, 9, 11], "lectur": [0, 1, 2, 5, 7, 8, 9, 12], "introduc": [0, 1, 2, 4, 6, 7, 9, 12], "kind": [0, 2, 4, 7, 8, 13], "compos": [0, 1, 6, 7], "togeth": [0, 1, 2, 4, 5, 6, 7, 8, 9], "deploy": [0, 7, 8, 9, 11, 13], "ones": [0, 11], "familiar": [1, 2, 8], "manipul": 1, "classif": [1, 8], "command": [1, 4, 9], "instal": [1, 3, 10], "packag": [1, 3, 10], "python3": [1, 4, 9], "pip": [1, 4, 9], "nightli": [1, 4, 9], "f": [1, 4, 7, 9], "wheel": [1, 4, 9], "torchaudio": 1, "torchsummari": 1, "extra": [1, 6, 10], "url": 1, "org": 1, "whl": 1, "nn": [1, 6, 7, 12], "function": [1, 5, 8, 10, 12, 13], "test": [1, 4, 5, 9, 12], "topi": [1, 6, 7], "te": [1, 7, 9], "accept": 1, "batch": 1, "through": [1, 2, 5, 6, 8, 9, 11, 13], "convolut": [1, 4], "activ": [1, 2, 8, 9, 13], "pool": 1, "fulli": 1, "order": [1, 2, 7, 9, 11], "input_shap": [1, 7], "nchw": [1, 12], "layout": [1, 5, 12], "pytorch_model": 1, "list": [1, 2, 6, 7], "append": [1, 7], "conv2d": [1, 12], "in_channel": 1, "out_channel": 1, "kernel_s": 1, "bia": [1, 7], "maxpool2d": 1, "flatten": [1, 8], "in_featur": 1, "5408": 1, "out_featur": 1, "softmax": [1, 2, 11], "dim": 1, "sequenti": [1, 11], "name_map": 1, "conv2d_weight": 1, "conv2d_bia": 1, "linear0_weight": 1, "linear0_bia": 1, "6": 1, "linear1_weight": 1, "linear1_bia": 1, "param": [1, 6, 7], "named_paramet": 1, "from_numpi": [1, 7], "weight_map": 1, "map": [1, 4, 5, 8, 9, 11, 13], "fasionmnist_mlp_assignment_param": 1, "accuraci": 1, "84": 1, "file": 1, "around": [1, 2, 9], "83": 1, "eval": 1, "test_loss": 1, "no_grad": 1, "print_img": 1, "sum": [1, 3, 7, 9], "loss": 1, "nll_loss": 1, "max": [1, 2, 6, 7, 9, 12], "probabl": 1, "pred": 1, "keepdim": 1, "format": [1, 2, 7], "eq": 1, "view_a": 1, "len": 1, "ntest": 1, "averag": 1, "4f": 1, "0f": 1, "npimg": 1, "transpos": [1, 6, 7], "tvmscript": [1, 6, 7, 13], "hard": [1, 2], "manual": [1, 2, 4, 9, 12], "experienc": 1, "exercis": [1, 2, 10, 13], "primit": [1, 2, 5, 6, 7, 9, 10, 12, 13], "tensor": [1, 2, 3, 6, 8, 12, 13], "requir": [1, 9], "massiv": [1, 4], "engin": [1, 8], "effort": [1, 8], "moreov": 1, "error": [1, 9], "prone": 1, "imagin": 1, "dozen": 1, "exist": [1, 6, 7, 8, 9, 13], "tini": [1, 8], "bug": 1, "your": [1, 9, 12], "could": [1, 2, 8, 11], "annoi": 1, "fortun": 1, "simpler": 1, "blockbuild": [1, 6], "construct": [1, 5, 6, 7, 9, 13], "recal": [1, 2], "design": [1, 2, 6], "And": [1, 6, 7, 9, 12], "stand": 1, "emit_t": [1, 7], "convert": [1, 2], "descript": [1, 5, 8], "which": [1, 2, 4, 5, 6, 7, 8, 9, 11, 12], "wa": 1, "call_tir": [1, 6, 7], "well": [1, 2, 4, 8], "less": 1, "mistak": 1, "signatur": [1, 7, 9], "func": [1, 6, 12], "emit_te_exampl": 1, "instanc": [1, 2, 9], "bb": [1, 5, 6, 7], "dimension": [1, 4, 5, 9, 11], "128x128": 1, "serv": [1, 2, 8, 13], "contain": [1, 2, 4, 5, 6, 7, 8, 9, 11, 13], "got": 1, "shape": [1, 2, 6, 7, 8, 9], "fcomput": [1, 7, 9], "lambda": [1, 7, 9], "dyntensortyp": 1, "emit_output": [1, 6, 7], "emit_func_output": [1, 6, 7], "languag": [1, 2, 7, 8, 9, 11, 12], "python": [1, 2, 5, 6, 8, 9, 11, 12], "equival": [1, 2, 8, 9], "yourself": 1, "short": 1, "inventori": 1, "wrap": 1, "variou": 1, "encourag": [1, 4], "document": 1, "check": [1, 2, 9, 12], "easili": [1, 5], "should": [1, 2, 9, 11, 12], "reflect": 1, "create_model_via_emit_t": 1, "const": [1, 6, 7], "todo": [1, 5, 12], "build_mod": 1, "exec": 1, "dev": [1, 4], "check_equival": 1, "torch_model": 1, "rt_mod": [1, 4], "output_from_pytorch": 1, "output_from_relax": 1, "assert_allclos": [1, 5, 9, 12], "rtol": [1, 5, 9, 12], "1e": [1, 5, 9, 12], "talk": [1, 2, 6, 8, 9, 11], "integr": [1, 8, 13], "regist": [1, 5], "extern": [1, 2, 5, 9], "runtim": [1, 8, 9, 13], "matmul": [1, 5, 6, 7, 9], "mymodulewithexterncal": [1, 2], "pleas": [1, 12], "occur": 1, "emit": [1, 6, 7], "directli": [1, 2, 6, 7, 9, 12], "create_model_with_torch_func": 1, "similar": [1, 2, 7], "program": [1, 2, 3, 5, 6, 13], "challeng": [1, 8, 12, 13], "compute_inlin": 1, "inlin": [1, 5], "memori": [1, 2, 3, 8, 9, 13], "usag": [1, 8], "opposit": 1, "ax": [1, 4, 6, 7], "increas": [1, 4, 8], "before_inlin": 1, "handl": [1, 2, 5, 6, 9], "match_buff": [1, 2, 5], "schedul": [1, 2, 4, 5, 9, 12, 13], "before_fus": 1, "But": [1, 2, 12], "NOT": 1, "standard": [1, 2, 8, 9, 12], "answer": [1, 2, 8, 9], "sever": [1, 8, 11], "reason": [1, 2, 8, 9], "perform": [1, 2, 4, 5, 6, 7, 9, 11, 12, 13], "vari": 1, "accord": [1, 12], "describ": [1, 2, 7, 9], "along": [1, 8], "calcul": 1, "separ": [1, 6, 9], "target_func": 1, "rxplacehold": 1, "rxplaceholder_1": 1, "conv2d_nchw": 1, "26": 1, "bodi": [1, 6, 9], "i0_0_i1_0_i2_0_i3_0_fus": 1, "2704": 1, "i0_1_i1_1_fused_init": 1, "i2_1_i3_1_fused_init": 1, "conv2d_nchw_init": 1, "1352": 1, "ff": 1, "169": 1, "yy": 1, "xx": 1, "i4": 1, "i5": 1, "i6": 1, "i0_1_i1_1_fus": 1, "i2_1_i3_1_fus": 1, "conv2d_nchw_upd": 1, "rc": 1, "ry": 1, "rx": 1, "rrr": 1, "unlik": 1, "therefor": 1, "concret": [1, 2], "shown": [1, 2, 8, 9, 11], "your_block_nam": 1, "your_function_nam": 1, "pad": [1, 12], "decompos": [1, 5, 12], "transform": [2, 4, 6, 7, 8, 10, 11, 13], "thing": [2, 5, 7, 8, 9, 13], "aim": [2, 6, 8, 9, 12], "repres": [2, 6, 7, 8, 9, 11, 13], "plot": 2, "neural": [2, 8, 9], "network": [2, 8, 9], "consist": [2, 7], "score": 2, "un": 2, "normal": [2, 7], "still": [2, 4, 5, 6, 8, 9], "numpy_mlp": 2, "lv2": [2, 6, 7], "188178": 2, "21": 2, "545786": 2, "469866": 2, "1992307": 2, "854906": 2, "29": 2, "403358": 2, "41": 2, "75774": 2, "794556": 2, "952915": 2, "313015": 2, "pov": 2, "illustr": [2, 9], "low": [2, 4, 5, 6, 9, 11, 12], "demonstr": [2, 9], "alwai": [2, 9, 12], "explicitli": [2, 9], "alloc": [2, 7, 9], "lnumpy_linear0": 2, "lnumpy_relu0": 2, "lnumpy_linear1": 2, "lnumpy_mlp": 2, "With": [2, 8, 9], "relu0": 2, "int64": [2, 6, 7, 12], "saw": 2, "walk": [2, 9], "alreadi": [2, 6, 9], "usual": [2, 4, 5, 7, 8, 9, 11], "visual": [2, 9], "box": [2, 8], "arrow": 2, "seen": [2, 8, 9], "earlier": 2, "itself": [2, 8, 9], "commonli": [2, 4, 6, 9], "known": [2, 9], "machin": [2, 4, 5, 8, 9, 11], "framework": [2, 4, 8, 9, 11, 13], "bring": [2, 4, 5, 7, 8, 9, 13], "explain": [2, 6], "lnumpy_call_dps_pack": 2, "popul": [2, 13], "optim": [2, 3, 8, 9, 11, 13], "choos": [2, 6, 11], "ahead": [2, 6, 9], "why": [2, 13], "convent": [2, 9], "low_level_prim_func": 2, "in0": 2, "in1": 2, "destin": 2, "outsid": [2, 7], "higher": [2, 7, 9], "present": 2, "whose": 2, "nevertheless": [2, 8, 9], "common": [2, 4, 6, 8, 9, 12, 13], "assembl": [2, 8], "certainli": [2, 6, 9], "fail": 2, "fit": 2, "simpli": [2, 6, 8], "lost": 2, "nice": 2, "properti": [2, 5, 11], "edg": [2, 7], "outgo": 2, "arbitrarili": 2, "topolog": [2, 7], "definit": [2, 12], "complic": [2, 4], "associ": [2, 8], "back": [2, 6, 9, 12, 13], "insight": 2, "explicit": [2, 9], "formal": [2, 9], "term": [2, 8, 9], "pure": 2, "free": [2, 9], "increment": [2, 7, 9], "counter": [2, 6], "expos": 2, "rewrit": [2, 9, 13], "lnumpy_mlp_with_call_dps_pack": 2, "calltir": 2, "lowest": 2, "continu": [2, 6, 9], "actual": 2, "scope": [2, 4, 7], "had": 2, "ideal": 2, "mark": [2, 7, 9], "region": [2, 4, 5, 9], "gv0": [2, 6], "deal": 2, "gone": [2, 9], "compil": [2, 4, 5, 7, 9, 11], "concept": [2, 4, 8, 9], "encount": [2, 9], "later": [2, 8, 9], "enabl": [2, 5, 8, 9], "develop": [2, 4, 7, 8, 9, 13], "though": 2, "vm_build": 2, "virtual": 2, "executor": 2, "addition": [2, 5, 6, 13], "second": [2, 9], "argument": [2, 7, 9], "indic": [2, 4, 5, 7, 8, 9, 12], "545788": 2, "46987": 2, "199234": 2, "854907": 2, "403357": 2, "757736": 2, "79456": 2, "952917": 2, "313023": 2, "both": [2, 4, 8, 13], "expect": [2, 5, 7, 8, 9], "zero": [2, 4], "copi": [2, 5, 9], "convers": 2, "share": [2, 3], "dlpack": 2, "exchang": 2, "particular": [2, 4, 6, 8, 9, 11], "piggyback": 2, "pytorch": [2, 8, 13], "world": [2, 5, 6, 8, 9, 13], "redirect": [2, 5], "onto": [2, 4, 7, 8], "cudnn": 2, "realiti": 2, "verifi": [2, 9], "dispatch": 2, "sometim": [2, 7, 11], "rest": [2, 6], "valid": [2, 9, 12], "attach": 2, "match": [2, 5, 13], "relai": 2, "implicit": 2, "dictionari": 2, "store": [2, 4, 7, 9], "invok": [2, 8], "theme": [2, 5, 6, 8, 13], "invoc": 2, "stage": [2, 4, 5, 7, 8, 9], "e": [2, 8, 9, 11, 12], "taught": 2, "spend": 2, "think": [2, 8], "hand": [2, 6, 8, 11], "hundr": 2, "infeas": 2, "peek": 2, "interact": [2, 5, 10], "episod": 2, "programmat": [2, 7, 9], "stitch": 2, "emb": 2, "prepar": [3, 13], "architectur": [3, 8], "window": [3, 12], "matrix": [3, 5, 7, 9], "leverag": [3, 6, 8, 9, 11, 13], "automat": [3, 8, 13], "summari": [3, 10, 13], "special": [3, 4, 8, 9, 11], "trend": [3, 8], "discuss": [3, 4, 8, 10, 13], "cuda": 4, "terminologi": 4, "appli": [4, 7, 8, 9], "ongo": [4, 13], "sourc": [4, 5, 8, 9], "notebook": 4, "cu110": 4, "typic": [4, 8, 9, 11], "collect": [4, 6, 7, 8, 9], "stream": 4, "multi": [4, 5, 9, 11], "processor": [4, 5], "execut": [4, 5, 7, 8, 9, 11, 12, 13], "concurr": 4, "multiprocessor": 4, "wise": 4, "mymodulevecadd": 4, "1024": [4, 5], "i0": [4, 5, 6, 7, 12], "i1": [4, 5, 6, 7, 12], "parameter": 4, "threadidx": 4, "blockidx": 4, "dimens": [4, 7, 9], "thread_bind": 4, "uniform": 4, "move": [4, 9], "forward": [4, 7], "basic": [4, 5, 6, 9], "predefin": 4, "slide": 4, "over": [4, 5, 7, 9, 13], "neighbor": 4, "mymodulewindowsum": 4, "1027": 4, "nthread": 4, "opportun": [4, 5, 7, 8], "rememb": 4, "cache_read": [4, 5], "segment": 4, "green": 4, "a_shar": 4, "read_buffer_index": 4, "compute_at": [4, 5, 12], "130": 4, "inner": [4, 9], "fetch": [4, 7, 9], "techniqu": [4, 8, 13], "cooper": 4, "ax0_0": 4, "ax0_1": 4, "lt": 4, "host": 4, "driver": 4, "kernel": [4, 5], "quick": [4, 9], "notabl": [4, 6, 7, 8, 9, 11, 12], "compact": [4, 8], "minimum": [4, 6], "imported_modul": 4, "get_sourc": 4, "metal": [4, 8], "model": [4, 5, 6, 8, 11, 12, 13], "someth": [4, 8, 12], "slightli": [4, 9], "mymodulematmul": 4, "stripe": 4, "onc": [4, 7], "pressur": 4, "tile_local_i": 4, "tile_local_x": 4, "tile_block_i": 4, "tile_block_x": 4, "tile_k": 4, "c_local": 4, "i2": 4, "j0": [4, 5, 9], "j1": [4, 5, 9], "j2": 4, "k0": [4, 5, 12], "k1": [4, 5, 12], "256": 4, "num_flop": 4, "gemm": 4, "1e9": 4, "consid": 4, "sit": 4, "piec": [4, 9], "cache_read_and_coop_fetch": 4, "read_idx": 4, "read_loc": 4, "read_cach": 4, "inner0": 4, "inner1": 4, "_": 4, "tx": 4, "vec": 4, "blocking_with_shar": 4, "b_share": 4, "i_1_j_1_fus": 4, "ax0_ax1_fused_0": 4, "ax0_ax1_fused_1": 4, "ax0_ax1_fused_2": 4, "min": 4, "nvidia": 4, "tesla": 4, "p100": 4, "studi": [4, 5, 6, 7, 10, 12, 13], "acceler": [4, 5, 8, 9, 13], "bridg": 4, "toward": [4, 5, 12], "visit": [4, 6, 11], "incom": 4, "hierarchi": [4, 5], "gpu": [5, 8, 13], "conceptu": 5, "backend": 5, "landscap": 5, "emerg": [5, 8, 13], "recent": 5, "tradition": 5, "scalar": 5, "float": 5, "instruct": [5, 8, 9, 12], "avx": 5, "arm": [5, 8], "neon": 5, "complex": [5, 8], "better": [5, 9], "resembl": [5, 9], "accel_fill_zero": 5, "accel_tmm_add": 5, "accel_dma_copi": 5, "reg": 5, "dram": 5, "lnumpy_tmm": 5, "accumul": 5, "c_accumul": 5, "a_reg": 5, "b_reg": 5, "16x16x16": 5, "limit": 5, "effici": [5, 8], "vendor": 5, "confirm": 5, "correctli": 5, "c_tmm": 5, "c_np": [5, 9, 12], "structur": [5, 6, 10], "group": [5, 6], "relev": [5, 9], "matmulblockmodul": 5, "tmm": 5, "16x16": 5, "vi0": 5, "vj0": 5, "vk0": 5, "tmm_init": 5, "vi1": 5, "vj1": 5, "vk1": 5, "closer": [5, 9], "content": [5, 8, 9], "subregion": 5, "span": 5, "sub": 5, "produc": [5, 9], "surround": 5, "variant": [5, 8, 12], "block_mm": 5, "i0_0": 5, "i0_1": 5, "sr": 5, "matmulmodul": 5, "ii": 5, "ji": 5, "ki": 5, "matmul_o": 5, "vi_o": 5, "vj_o": 5, "vk_o": 5, "matmul_init": 5, "vi_i_init": 5, "vj_i_init": 5, "vi_i": 5, "vj_i": 5, "vk_i": 5, "write_back_block": 5, "a_global_a_reg": 5, "b_global_b_reg": 5, "c_global_accumul": 5, "a_glob": 5, "b_global": 5, "thread": [5, 9], "tag": 5, "intrins": 5, "tensorintrin": 5, "tmm16_desc": 5, "offset_factor": 5, "vii": 5, "vjj": 5, "vkk": 5, "tmm16_impl": 5, "sa": 5, "int32": 5, "sb": 5, "sc": 5, "stride": [5, 12], "call_extern": 5, "tmm16": 5, "access_ptr": 5, "matmul_o_init": 5, "matmul_o_upd": 5, "a_1": 5, "a_s0": 5, "b_1": 5, "b_s0": 5, "c_1": 5, "c_s0": 5, "tvm_access_ptr": 5, "type_annot": 5, "elem_offset": 5, "downstream": 5, "micro": 5, "embed": [5, 8, 9], "assembli": [5, 8, 11], "tmm_kernel": 5, "cc_code": 5, "int": 5, "aa": 5, "stride_a": 5, "stride_b": 5, "stride_c": 5, "contrib": 5, "clang": 5, "temp": [5, 7], "tempdir": 5, "ll_path": 5, "relpath": 5, "ll": 5, "ll_code": 5, "create_llvm": 5, "pragma_import_llvm": 5, "custom": [5, 6, 7, 8], "ci": [5, 12], "section": [5, 6, 7, 8, 10, 11], "alongsid": 5, "foundat": 5, "constraint": 5, "don": [5, 9], "enough": [5, 12], "option": [5, 12], "individu": 6, "defer": 6, "pars": 6, "op": [6, 7], "multipli": 6, "ewise_fma": 6, "dive": [6, 8], "examin": 6, "syntax": [6, 9], "tree": 6, "ast": [6, 9], "relax_func": 6, "node": [6, 7, 8], "func_bodi": 6, "seqexpr": 6, "sequenc": [6, 7, 8, 9, 11, 13], "dataflow_block": 6, "left": [6, 8], "right": [6, 8, 11], "travers": 6, "recurs": 6, "avail": [6, 7], "tool": [6, 7, 8, 9], "simplifi": [6, 9], "visitor": 6, "expr_functor": 6, "mutat": 6, "ewisefmarewrit": 6, "pyexprmut": 6, "visit_call_": 6, "self": [6, 7, 9], "visit_expr_post_ord": 6, "add_op": 6, "multiply_op": 6, "ewise_fma_op": 6, "lookup_bind": 6, "arg": [6, 7], "isinst": [6, 7], "fma_cal": 6, "updated_fn": 6, "visit_expr": 6, "remove_all_unus": 6, "tast": [6, 9], "end": [6, 7, 8, 9, 13], "reconstruct": 6, "create_model": 6, "tensorstructinfo": [6, 7], "permute_dim": [6, 7], "lv3": [6, 7], "lv4": [6, 7], "mlpmodel": 6, "lv": [6, 7], "out_dtyp": [6, 7], "void": [6, 7], "lv5": [6, 7], "lv6": [6, 7], "dens": [6, 7], "identifi": 6, "matmuladdfusor": 6, "__init__": [6, 7], "super": [6, 7], "mod_": 6, "matmul_op": 6, "attr": [6, 7], "updated_func": 6, "builder_": 6, "match_cal": 6, "param_x": 6, "struct_info": 6, "param_w": 6, "param_b": 6, "fn_name": 6, "fused_matmul_add": 6, "attribut": 6, "funtion": 6, "fused_fn": 6, "add_func": 6, "module_pass": 6, "opt_level": 6, "matmuladdfus": 6, "fusedenseaddpass": 6, "wrapper": 6, "lowertensorir": 6, "transform_modul": 6, "ctx": 6, "mlpfuse": 6, "fused_matmul_add0": 6, "fused_matmul_add1": 6, "cl": [6, 7], "prefix": 6, "fuse_matmul_add": 6, "exponenti": 6, "combin": [6, 7, 9, 13], "lower": [6, 9], "dedic": 6, "fusion": [6, 7], "translat": [6, 8, 12, 13], "intern": 6, "call_t": 6, "lowertotensorir": 6, "op_map": 6, "map_matmul": [6, 7], "map_add": 6, "map_relu": [6, 7], "map_transpos": 6, "lowertotensorirpass": 6, "mlpmodeltir": 6, "privat": [6, 7], "t_add": [6, 7], "v_ax0": [6, 7], "v_ax1": [6, 7], "add1": [6, 7], "t_matmul_nn": 6, "layout_free_buff": [6, 7], "v_i0": [6, 7], "v_i1": [6, 7], "v_k": [6, 7, 9], "matmul1": 6, "t_transpos": 6, "transpose1": 6, "phase": [6, 8], "mlpmodelfin": 6, "fusetir": 6, "t_add_intermedi": 6, "t_matmul_nn_intermedi": 6, "pictur": 6, "mlpmodul": [6, 7], "power": [6, 8, 9], "robust": 6, "duplic": 6, "referenc": [6, 7], "detect": 6, "skip": [6, 7, 9], "rule": [6, 7], "fusor": 6, "explor": [6, 8], "feed": [6, 9], "ml": [7, 13], "fx": 7, "larger": [7, 13], "domain": [7, 9], "placehold": [7, 9], "object": [7, 9], "output_shap": [7, 9], "given": [7, 9, 12], "te_matmul": 7, "assert": 7, "reduce_axi": [7, 9], "create_prim_func": [7, 9], "v_i": [7, 9], "v_j": [7, 9], "te_relu": 7, "x1": 7, "y1": 7, "x2": 7, "20": 7, "y2": 7, "caus": 7, "caller": 7, "recommend": [7, 8], "advanc": 7, "put": [7, 13], "belong": [7, 8], "dataflowvar": 7, "emiss": 7, "fly": 7, "obtain": [7, 8, 9], "nativ": [7, 8, 9], "mymodel": 7, "randn": 7, "trace": 7, "fx_modul": 7, "symbolic_trac": 7, "graph_modul": 7, "__new__": 7, "local": [7, 8], "graphmoduleimpl": 7, "tabular": 7, "print_tabular": 7, "opcod": 7, "kwarg": 7, "get_attr": 7, "call_funct": 7, "0x7f28e7c5c480": 7, "logic": 7, "node_map": 7, "map_param": 7, "fetch_attr": 7, "fx_mod": 7, "str": 7, "target_atom": 7, "attr_itr": 7, "atom": 7, "enumer": 7, "hasattr": 7, "rais": 7, "runtimeerror": 7, "nonexist": 7, "join": 7, "getattr": 7, "from_fx": 7, "call_function_map": 7, "call_module_map": 7, "input_index": 7, "named_modul": 7, "dict": 7, "fn_input": 7, "fn_output": 7, "input_var": 7, "elif": 7, "call_modul": 7, "suppli": 7, "linear1": 7, "mlp_model": 7, "torch_r": 7, "detach": 7, "broadcast": 7, "map_nn_linear": 7, "nn_mod": 7, "map_nn_relu": 7, "t_matmul_nt": 7, "dense1": 7, "map_nn_relu_op": 7, "map_nn_linear_op": 7, "mlpmodulehighlevel": 7, "either": [7, 9], "builtin": 7, "cannot": [7, 11], "tradeoff": 7, "briefli": 7, "upon": 7, "applic": [8, 13], "undoubtedli": 8, "becom": [8, 13], "ubiquit": [8, 13], "smart": 8, "home": 8, "speech": 8, "recognit": 8, "vision": 8, "backbon": 8, "autonom": 8, "discov": 8, "rich": 8, "app": 8, "quit": 8, "fun": 8, "deploi": [8, 9, 13], "cloud": 8, "platform": [8, 9], "compani": 8, "servic": 8, "pop": 8, "head": 8, "vehicl": 8, "intellig": 8, "phone": [8, 9], "recogn": 8, "flower": 8, "garden": 8, "tend": 8, "iot": 8, "sensor": 8, "chip": 8, "drill": 8, "even": [8, 9], "greater": 8, "divers": 8, "categori": 8, "x86": 8, "heavi": 8, "lift": 8, "product": [8, 13], "extend": 8, "non": 8, "relat": [8, 9, 11], "infer": [8, 13], "themselv": [8, 11], "user": 8, "privaci": 8, "protect": 8, "scale": [8, 13], "distribut": 8, "cluster": 8, "scenario": 8, "productionis": 8, "gap": 8, "facilit": [8, 11], "commun": 8, "appear": [8, 12], "written": 8, "tensorflow": 8, "jax": 8, "routin": 8, "manag": 8, "resourc": 8, "interfac": 8, "java": 8, "android": [8, 9], "analogi": 8, "tradit": 8, "necessarili": 8, "That": 8, "worthwhil": 8, "independ": [8, 9, 11], "minim": [8, 13], "camera": 8, "classifi": 8, "tabl": 8, "lookup": 8, "nlp": 8, "abil": [8, 9, 11], "especi": 8, "tensorcor": 8, "strict": 8, "boundari": 8, "pair": 8, "stabl": 8, "fact": [8, 9], "collabor": 8, "background": 8, "grow": 8, "scientist": 8, "teach": 8, "methodologi": 8, "who": 8, "wild": 8, "bread": 8, "butter": 8, "solv": [8, 13], "problem": [8, 13], "principl": [8, 9], "experi": 8, "depth": 8, "hidden": 8, "incorpor": [8, 11], "novel": 8, "push": 8, "rational": 8, "my": 8, "isn": 8, "fast": [8, 9], "softwar": 8, "stack": 8, "isol": 8, "plai": 8, "increasingli": 8, "role": 8, "ecosystem": 8, "modern": [8, 9], "bare": 8, "realli": 8, "pixel": 8, "project": 8, "length": [8, 9, 11], "200": 8, "foremost": 8, "multidimension": 8, "knowledg": 8, "encod": 8, "entir": 8, "fold": 8, "linear_relu": 8, "nest": [8, 9, 11], "represent": 8, "denot": 8, "perhap": 8, "keyword": 8, "interpret": [8, 9], "todai": 9, "recap": [9, 13], "apach": 9, "explan": 9, "motiv": 9, "matric": 9, "y_": [9, 12], "sum_k": [9, 12], "a_": [9, 12], "b_": [9, 12], "c_": [9, 12], "mathbb": [9, 12], "c_mm_relu": 9, "openbla": 9, "restrict": 9, "subset": 9, "arithmet": 9, "primarili": 9, "lnumpy_mm_relu": 9, "mm_relu": 9, "storag": 9, "revisit": 9, "tutori": [9, 13], "slower": 9, "statement": [9, 11], "dialect": 9, "direct": 9, "similarli": 9, "syntact": 9, "coresspond": 9, "plain": 9, "declar": 9, "block_axi": 9, "axis_typ": 9, "axis_rang": 9, "mapped_valu": 9, "suppos": 9, "impli": 9, "strictli": 9, "speak": 9, "fix": 9, "locat": 9, "crucial": 9, "carri": [9, 11], "127": 9, "wrong": 9, "due": [9, 12], "mismatch": 9, "strategi": 9, "situat": 9, "outer": 9, "range_of_i": 9, "range_of_j": 9, "range_of_k": 9, "mymodulewithaxisremapsugar": 9, "area": 9, "overlap": 9, "feel": 9, "safe": [9, 11], "affect": 9, "hold": 9, "primfunc": 9, "mymodulewithtwofunct": 9, "temporari": 9, "went": 9, "journei": 9, "ingredi": 9, "lnumpy_mm_relu_v2": [9, 12], "variat": 9, "edit": 9, "pragmat": [9, 11], "ourselv": 9, "block_i": 9, "procedur": [9, 11], "accident": 9, "twice": 9, "longer": 9, "kept": 9, "conveni": 9, "sync": 9, "implicitli": 9, "lnumpy_mm_relu_v3": 9, "takeawai": 9, "paradigm": 9, "auxiliari": 9, "runnabl": 9, "intel": 9, "skylak": 9, "adjust": 9, "accordingli": 9, "rt_lib": [9, 12], "func_mm_relu": 9, "rt_lib_aft": 9, "sec": 9, "00252932": 9, "000657921": 9, "uniformli": 9, "behavior": 9, "lead": 9, "mod_transform": 9, "rt_lib_transform": 9, "f_timer_transform": 9, "000380994": 9, "certain": 9, "shorten": 9, "middl": 9, "bake": 9, "ij": 9, "ik": 9, "kj": 9, "te_func": 9, "mymodulefromt": 9, "repeat": 9, "veri": 9, "equal": 9, "lot": 9, "expertis": 9, "prelud": [10, 13], "introductori": 11, "overview": 11, "pseudo": 11, "f32x4": 11, "arbitrari": 11, "luckili": 11, "handi": 11, "know": 11, "long": 11, "arang": 12, "conv": 12, "lnumpy_add": 12, "c_lnumpi": 12, "myadd": 12, "a_tvm": 12, "b_tvm": 12, "c_tvm": 12, "mathemat": 12, "sum_": 12, "di": 12, "dj": 12, "q": 12, "channel": 12, "hight": 12, "width": 12, "filter": 12, "h": 12, "co": 12, "out_h": 12, "out_w": 12, "data_torch": 12, "weight_torch": 12, "conv_torch": 12, "474": 12, "510": 12, "546": 12, "582": 12, "618": 12, "654": 12, "762": 12, "798": 12, "834": 12, "870": 12, "906": 12, "942": 12, "1050": 12, "1086": 12, "1122": 12, "1158": 12, "1194": 12, "1230": 12, "1338": 12, "1374": 12, "1410": 12, "1446": 12, "1482": 12, "1518": 12, "1626": 12, "1662": 12, "1698": 12, "1734": 12, "1770": 12, "1806": 12, "1914": 12, "1950": 12, "1986": 12, "2022": 12, "2058": 12, "2094": 12, "1203": 12, "1320": 12, "1437": 12, "1554": 12, "1671": 12, "1788": 12, "2139": 12, "2256": 12, "2373": 12, "2490": 12, "2607": 12, "2724": 12, "3075": 12, "3192": 12, "3309": 12, "3426": 12, "3543": 12, "3660": 12, "4011": 12, "4128": 12, "4245": 12, "4362": 12, "4479": 12, "4596": 12, "4947": 12, "5064": 12, "5181": 12, "5298": 12, "5415": 12, "5532": 12, "5883": 12, "6000": 12, "6117": 12, "6234": 12, "6351": 12, "6468": 12, "myconv": 12, "data_tvm": 12, "weight_tvm": 12, "conv_tvm": 12, "bmm_relu": 12, "batched_matmul_relu": 12, "bmm": 12, "lnumpi": 12, "hint": 12, "mybmmrelu": 12, "student": 12, "targetmodul": 12, "i2_0": 12, "ax0_init": 12, "ax1_0": 12, "ax1_1": 12, "i2_1": 12, "sure": 12, "assert_structural_equ": 12, "before_rt_lib": 12, "after_rt_lib": 12, "before_tim": 12, "f_timer": 12, "innov": 13, "daili": 13, "live": 13, "great": 13, "combinatori": 13, "multitud": 13, "broader": 13, "coverag": 13, "footprint": 13, "driven": 13, "search": 13, "comprehens": 13, "treatment": 13, "systemat": 13, "materi": 13, "progress": 13, "introduct": 13, "mix": 13, "stochast": 13, "remark": 13}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"automat": [0, 4], "program": [0, 4, 9, 10, 11, 12], "optim": [0, 4, 6], "prelud": [0, 2, 6, 7, 9], "prepar": [0, 1, 2, 4, 5, 6, 7], "recap": 0, "transform": [0, 1, 5, 9, 12], "primit": [0, 11], "tensor": [0, 5, 7, 9, 10, 11], "function": [0, 2, 4, 6, 7, 9, 11], "trace": 0, "stochast": 0, "schedul": 0, "deep": [0, 7], "dive": [0, 7], "search": 0, "over": 0, "leverag": [0, 4], "default": 0, "autoschedul": 0, "section": [0, 1, 2, 9, 12], "checkpoint": [0, 2, 9], "put": 0, "thing": 0, "back": [0, 7], "end": [0, 1, 2], "model": [0, 1, 2, 7], "execut": [0, 1, 2], "discuss": [0, 2, 5, 6, 7, 9], "summari": [0, 2, 4, 5, 6, 7, 8, 9, 11], "mlc": 1, "assign": 1, "1": [1, 4, 12], "2": [1, 5, 12], "ingest": 1, "from": [1, 7], "pytorch": [1, 7], "3": [1, 12], "us": [1, 7, 9], "vendor": 1, "librari": [1, 2], "4": 1, "load": 2, "dataset": 2, "download": 2, "paramet": [2, 9], "integr": [2, 7], "construct": 2, "an": [2, 7], "irmodul": [2, 7], "tvmscript": [2, 9], "comput": [2, 5, 6, 9], "graph": [2, 6], "view": 2, "call_dps_pack": 2, "dataflow": 2, "block": [2, 4, 5, 7, 9], "build": [2, 4, 6, 7, 9, 12], "run": [2, 4, 6, 9], "exist": 2, "environ": 2, "regist": 2, "runtim": 2, "mix": 2, "tensorir": [2, 4, 5, 6, 7, 9, 12], "code": [2, 4, 5, 9], "bind": [2, 9], "gpu": [3, 4], "hardwar": [3, 5], "acceler": 3, "part": [4, 5], "instal": [4, 9], "packag": [4, 9], "architectur": 4, "thread": 4, "window": 4, "sum": 4, "exampl": [4, 7, 12], "other": 4, "platform": 4, "matrix": 4, "multipl": 4, "local": 4, "share": 4, "memori": [4, 5], "special": 5, "trend": 5, "kei": [5, 8], "element": [5, 8, 12], "A": 5, "loop": [5, 9], "around": 5, "blockiz": 5, "creat": [5, 6, 7, 9], "introduc": 5, "scope": 5, "pattern": 6, "match": 6, "rewrit": 6, "fuse": 6, "linear": 6, "relu": 6, "why": [6, 8, 9], "sub": 6, "map": [6, 7], "call": 6, "machin": [7, 13], "learn": [7, 9, 13], "framework": 7, "through": 7, "builder": 7, "express": [7, 9], "creation": 7, "blockbuild": 7, "api": 7, "import": 7, "torchfx": 7, "graphmodul": 7, "come": 7, "fashionmnist": 7, "remark": [7, 8], "translat": 7, "high": 7, "level": 7, "oper": 7, "introduct": 8, "what": 8, "i": 8, "ml": 8, "compil": [8, 13], "studi": [8, 9], "abstract": [8, 9, 10, 11], "implement": 8, "case": 9, "one": 9, "buffer": 9, "For": 9, "iter": 9, "axi": 9, "properti": 9, "extra": [9, 11], "inform": 9, "sugar": 9, "ax": 9, "attribut": 9, "decor": 9, "get": 9, "anoth": 9, "variant": 9, "exercis": [9, 12], "wai": 9, "interact": 9, "via": 9, "gener": 9, "result": 9, "structur": 11, "how": 12, "write": 12, "wise": 12, "add": 12, "broadcast": 12, "2d": 12, "convolut": 12, "parallel": 12, "vector": 12, "unrol": 12, "batch": 12, "matmul": 12, "evalu": 12}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinxcontrib.bibtex": 9, "sphinx.ext.viewcode": 1, "sphinx": 57}, "alltitles": {"GPU and Hardware Acceleration": [[3, "gpu-and-hardware-acceleration"]], "Part 2": [[5, "part-2"]], "Preparations": [[5, "preparations"], [4, "preparations"], [6, "preparations"], [7, "preparations"], [2, "preparations"], [0, "preparations"]], "Hardware Specialization Trend": [[5, "hardware-specialization-trend"]], "Key Elements of Specialized Code": [[5, "key-elements-of-specialized-code"]], "A Block with Tensorized Computation": [[5, "a-block-with-tensorized-computation"]], "Transforming Loops Around Tensorized Block": [[5, "transforming-loops-around-tensorized-block"]], "Blockization \u2013 Creating Tensorized Blocks": [[5, "blockization-creating-tensorized-blocks"]], "Transforming TensorIR to Introduce Special Memory Scope": [[5, "transforming-tensorir-to-introduce-special-memory-scope"]], "Tensorization": [[5, "tensorization"]], "Discussions": [[5, "discussions"], [7, "discussions"], [9, "discussions"], [2, "discussions"], [0, "discussions"]], "Summary": [[5, "summary"], [4, "summary"], [6, "summary"], [8, "summary"], [7, "summary"], [9, "summary"], [11, "summary"], [2, "summary"], [0, "summary"]], "Part 1": [[4, "part-1"]], "Install packages": [[4, "install-packages"]], "GPU Architecture": [[4, "gpu-architecture"]], "GPU Thread Blocks": [[4, "gpu-thread-blocks"]], "Build and Run the TensorIR Function on GPU": [[4, "build-and-run-the-tensorir-function-on-gpu"]], "Window Sum Example": [[4, "window-sum-example"]], "Build Code for Other GPU Platforms": [[4, "build-code-for-other-gpu-platforms"]], "Matrix Multiplication": [[4, "matrix-multiplication"]], "Local Blocking": [[4, "local-blocking"]], "Shared Memory Blocking": [[4, "shared-memory-blocking"]], "Leveraging Automatic Program Optimization": [[4, "leveraging-automatic-program-optimization"]], "Computational Graph Optimization": [[6, "computational-graph-optimization"]], "Prelude": [[6, "prelude"], [7, "prelude"], [9, "prelude"], [2, "prelude"], [0, "prelude"]], "Pattern Match and Rewriting": [[6, "pattern-match-and-rewriting"]], "Fuse Linear and ReLU": [[6, "fuse-linear-and-relu"]], "Why Creating a Sub-function": [[6, "why-creating-a-sub-function"]], "Map to TensorIR Calls": [[6, "map-to-tensorir-calls"]], "Build and Run": [[6, "build-and-run"], [9, "build-and-run"], [2, "build-and-run"]], "Discussion": [[6, "discussion"]], "Introduction": [[8, "introduction"]], "What is ML Compilation": [[8, "what-is-ml-compilation"]], "Why Study ML Compilation": [[8, "why-study-ml-compilation"]], "Key Elements of ML Compilation": [[8, "key-elements-of-ml-compilation"]], "Remark: Abstraction and Implementations": [[8, "remark-abstraction-and-implementations"]], "Integration with Machine Learning Frameworks": [[7, "integration-with-machine-learning-frameworks"]], "Build an IRModule Through a Builder": [[7, "build-an-irmodule-through-a-builder"]], "Tensor Expression for TensorIR Creation": [[7, "tensor-expression-for-tensorir-creation"]], "Use BlockBuilder to Create an IRModule": [[7, "use-blockbuilder-to-create-an-irmodule"]], "Deep Dive into Block Builder APIs": [[7, "deep-dive-into-block-builder-apis"]], "Import Model From PyTorch": [[7, "import-model-from-pytorch"]], "Create TorchFX GraphModule": [[7, "create-torchfx-graphmodule"]], "Create Map Function": [[7, "create-map-function"]], "Coming back to FashionMNIST Example": [[7, "coming-back-to-fashionmnist-example"]], "Remark: Translating into High-level Operators": [[7, "remark-translating-into-high-level-operators"]], "TensorIR: Tensor Program Abstraction Case Study": [[9, "tensorir-tensor-program-abstraction-case-study"]], "Install Packages": [[9, "install-packages"]], "Learning one Tensor Program Abstraction \u2013 TensorIR": [[9, "learning-one-tensor-program-abstraction-tensorir"]], "Function Parameters and Buffers": [[9, "function-parameters-and-buffers"]], "For Loop Iterations": [[9, "for-loop-iterations"]], "Computational Block": [[9, "computational-block"]], "Block Axis Properties": [[9, "block-axis-properties"]], "Why Extra Information in Block": [[9, "why-extra-information-in-block"]], "Sugar for Block Axes Binding": [[9, "sugar-for-block-axes-binding"]], "Function Attributes and Decorators": [[9, "function-attributes-and-decorators"]], "Section Checkpoint": [[9, "section-checkpoint"], [2, "section-checkpoint"], [0, "section-checkpoint"]], "Transformation": [[9, "transformation"]], "Getting to Another Variant": [[9, "getting-to-another-variant"]], "Section Summary and Discussions": [[9, "section-summary-and-discussions"]], "Exercise": [[9, "exercise"]], "Ways to Create and Interact with TensorIR": [[9, "ways-to-create-and-interact-with-tensorir"]], "Create TensorIR via TVMScript": [[9, "create-tensorir-via-tvmscript"]], "Generate TensorIR code using Tensor Expression": [[9, "generate-tensorir-code-using-tensor-expression"]], "TensorIR Functions as Results of Transformations": [[9, "tensorir-functions-as-results-of-transformations"]], "Primitive Tensor Function": [[11, "primitive-tensor-function"]], "Tensor Program Abstraction": [[11, "tensor-program-abstraction"], [10, "tensor-program-abstraction"]], "Extra Structure in Tensor Program Abstraction": [[11, "extra-structure-in-tensor-program-abstraction"]], "Machine Learning Compiler": [[13, "machine-learning-compiler"]], "Exercises for TensorIR": [[12, "exercises-for-tensorir"]], "Section 1: How to Write TensorIR": [[12, "section-1-how-to-write-tensorir"]], "Example: Element-wise Add": [[12, "example-element-wise-add"]], "Exercise 1: Broadcast Add": [[12, "exercise-1-broadcast-add"]], "Exercise 2: 2D Convolution": [[12, "exercise-2-2d-convolution"]], "Section 2: How to Transform TensorIR": [[12, "section-2-how-to-transform-tensorir"]], "Parallel, Vectorize and Unroll": [[12, "parallel-vectorize-and-unroll"]], "Exercise 3: Transform a batch matmul program": [[12, "exercise-3-transform-a-batch-matmul-program"]], "Build and Evaluate": [[12, "build-and-evaluate"]], "End to End Model Execution": [[2, "end-to-end-model-execution"]], "Load the Dataset": [[2, "load-the-dataset"]], "Download Model Parameters": [[2, "download-model-parameters"]], "End to End Model Integration": [[2, "end-to-end-model-integration"]], "Constructing an End to End IRModule in TVMScript": [[2, "constructing-an-end-to-end-irmodule-in-tvmscript"]], "Computational Graph View": [[2, "computational-graph-view"]], "call_dps_packed Construct": [[2, "call-dps-packed-construct"]], "Dataflow Block": [[2, "dataflow-block"]], "Build and Run the Model": [[2, "build-and-run-the-model"]], "Integrate Existing Libraries in the Environment": [[2, "integrate-existing-libraries-in-the-environment"]], "Registering Runtime Function": [[2, "registering-runtime-function"]], "Mixing TensorIR Code and Libraries": [[2, "mixing-tensorir-code-and-libraries"]], "Bind Parameters to IRModule": [[2, "bind-parameters-to-irmodule"]], "Automatic Program Optimization": [[0, "automatic-program-optimization"]], "Recap: Transform a Primitive Tensor Function.": [[0, "recap-transform-a-primitive-tensor-function"]], "Transformation Trace": [[0, "transformation-trace"]], "Stochastic Schedule Transformation": [[0, "stochastic-schedule-transformation"]], "Deep Dive into Stochastic Transformation": [[0, "deep-dive-into-stochastic-transformation"]], "Search Over Stochastic Transformations": [[0, "search-over-stochastic-transformations"]], "Leverage Default AutoScheduling": [[0, "leverage-default-autoscheduling"]], "Putting Things Back to End to End Model Execution": [[0, "putting-things-back-to-end-to-end-model-execution"]], "MLC Assignment 1: End-to-End Model Execution": [[1, "mlc-assignment-1-end-to-end-model-execution"]], "Section 1: Model Preparation": [[1, "section-1-model-preparation"]], "Section 2. Ingest Model From Pytorch": [[1, "section-2-ingest-model-from-pytorch"]], "Section 3. Use of Vendor Library": [[1, "section-3-use-of-vendor-library"]], "Section 4. Transformation in End-to\u2013End Models": [[1, "section-4-transformation-in-end-toend-models"]]}, "indexentries": {}})