# Dependencies

In [3]:
from manim import *
import numpy as np
import math
import matplotlib.pyplot as plt

from manim.mobject.graphing.scale import LogBase

In [4]:
def remove_invisible_chars(mobject: SVGMobject) -> SVGMobject:
    """Strip ``Dot`` placeholders out of a text-like mobject.

    Parameters
    ----------
    mobject
        Any SVGMobject from which we want to remove unwanted invisible
        characters. An object whose class is named ``"Text"`` is first sliced
        into a group of its children; an object whose class is named
        ``"Code"`` is processed through its ``code`` attribute.

    Returns
    -------
    :class:`~.SVGMobject`
        For a ``Code`` input, the same object with its ``code`` attribute
        replaced by the filtered group. Otherwise a new ``VGroup`` holding the
        children that are not exactly of class ``Dot``. An input without
        children yields an empty ``VGroup`` instead of raising.
    """
    # The class is matched by name, so only classes literally called
    # "Text" / "Code" take the special paths below.
    kind = mobject.__class__.__name__
    code = None
    if kind == "Text":
        mobject = mobject[:]
    elif kind == "Code":
        code = mobject
        mobject = mobject.code

    mobject_without_dots = VGroup()
    # Peek at the first child only when there is one; indexing an empty
    # mobject would raise IndexError.
    if len(mobject) > 0 and mobject[0].__class__ == VGroup:
        # Nested layout: filter each child group separately so the
        # per-group structure is kept.
        for line in mobject:
            mobject_without_dots.add(
                VGroup(*(k for k in line if k.__class__ != Dot))
            )
    else:
        mobject_without_dots.add(*(k for k in mobject if k.__class__ != Dot))

    if code is not None:
        code.code = mobject_without_dots
        return code
    return mobject_without_dots

# Motivation

In [4]:
# Matrix Multiplications
# Random inputs/weights for the animations. Every array is rounded to one
# decimal, and each product is rounded again before feeding the next one.
X0 = np.random.rand(4, 4).round(1)
W0 = np.random.rand(4, 4).round(1)
X1 = (X0 @ W0).round(1)
W1 = np.random.rand(4, 4).round(1)
X2 = (X1 @ W1).round(1)
W2 = np.random.rand(4, 4).round(1)
X3 = (X2 @ W2).round(1)
W3 = np.random.rand(4, 4).round(1)
X4 = (X3 @ W3).round(1)
W4 = np.random.rand(4, 4).round(1)
X5 = (X4 @ W4).round(1)


# Single 10x10 product used by the "size of matrices" part of the scene.
X0_ = np.random.rand(10, 10).round(1)
W0_ = np.random.rand(10, 10).round(1)
X1_ = (X0_ @ W0_).round(1)

In [207]:
%%manim -qk -v WARNING Motivation

class Motivation(Scene):
    """Motivating scene: chained matmuls, matrix size, CPU-vs-GPU cost, four questions.

    Reads the module-level arrays ``X0``..``X5``, ``W0``..``W4``, ``X0_``,
    ``W0_`` and ``X1_``.
    """

    @staticmethod
    def _tex_matrix(values):
        """Build a square-bracketed ``Matrix`` of MathTex entries with unit spacing."""
        return Matrix(
            values,
            left_bracket="[",
            right_bracket="]",
            element_to_mobject=MathTex,
            element_alignment_corner=ORIGIN,
            v_buff=1,
            h_buff=1,
        )

    def _fade_out_everything(self):
        """Fade out every mobject currently in the scene, then hold for one second."""
        self.play(*[FadeOut(mob) for mob in self.mobjects])
        self.wait(1)

    def construct(self):
        """Play the four sections of the scene in order."""
        self.wait(2)
        self._show_matmul_chain()
        self._show_matrix_size()
        self._show_computational_cost()
        self._show_questions()

    def _show_matmul_chain(self):
        """Animate X0 -> X5 as five successive products with green weight matrices."""
        greens = [GREEN] * 4

        # x_mats[k] is laid out left to right; w_mats[k] sits above x_mats[k + 1],
        # i.e. above the product it contributes to.
        x_mats = [
            self._tex_matrix(X0)
            .scale(0.42)
            .to_edge(LEFT)
            .shift(0.25 * RIGHT + 0.5 * DOWN)
        ]
        w_mats = []
        for x_values, w_values in zip((X1, X2, X3, X4, X5), (W0, W1, W2, W3, W4)):
            x_mat = self._tex_matrix(x_values).scale(0.42).next_to(x_mats[-1], RIGHT)
            w_mat = (
                self._tex_matrix(w_values)
                .set_row_colors(*greens)
                .set_column_colors(*greens)
                .scale(0.42)
                .next_to(x_mat, UP)
            )
            x_mats.append(x_mat)
            w_mats.append(w_mat)

        title = Title("Neural Network = Series of Matrix Multiplications")
        for step, w_mat in enumerate(w_mats):
            if step == 0:
                self.play(Write(title), Create(VGroup(x_mats[0], w_mat)))
            else:
                # Bring in the next weights while dimming the previous pair.
                self.play(
                    Create(w_mat),
                    VGroup(x_mats[step - 1], w_mats[step - 1]).animate.set_opacity(0.25),
                )
            product = x_mats[step + 1]
            # Brackets first (quick), then the entries (slow).
            self.play(Create(product[1:]), run_time=0.5)
            self.play(Create(product[0]), run_time=3)

        self.wait(1)
        self._fade_out_everything()

    def _show_matrix_size(self):
        """Show one product of 10x10 matrices."""
        title = Title("Power of Neural Net = Size of Matrices")
        text_1 = Tex(r"Matrices of size 10").next_to(title, DOWN)

        X0_mat = (
            self._tex_matrix(X0_)
            .scale(0.25)
            .to_edge(DOWN)
            .shift(1.5 * LEFT + 0.25 * UP)
        )
        X1_mat = self._tex_matrix(X1_).scale(0.25).next_to(X0_mat, RIGHT)
        W0_mat = (
            self._tex_matrix(W0_)
            .set_row_colors(*([GREEN] * 10))
            .scale(0.25)
            .next_to(X1_mat, UP)
        )

        self.play(Write(title), Create(VGroup(X0_mat, W0_mat, X1_mat[1:])))
        self.wait(1)
        self.play(Write(text_1), Write(X1_mat[0]))
        self.wait(1)
        self._fade_out_everything()

    def _show_computational_cost(self):
        """Plot CPU and GPU timings against matrix size, then compare the last point."""
        title = Title("Computational Cost")
        self.play(Write(title))
        self.wait(1)

        N = list(range(1000, 8001, 500))
        # Raw timings are scaled by 1e-6 and plotted on an axis labelled
        # "Time (in seconds)" -- presumably recorded in microseconds.
        cpu_times = np.array([2365712, 7728049, 19107392, 48998530, 85209923, 155137907, 194911457, 352963378, 440474379, 652339182, 713283468, 1071591118, 1260187076, 1610661108, 1680444971])/10**6
        gpu_times = np.array([332321, 10930, 35959, 57451, 65592, 88862, 118317, 183036, 196658, 247056, 316205, 421784, 469413, 548058, 627282])/10**6

        axes = Axes(
            x_range=[0, max(N), 1000],
            y_range=[0, max(cpu_times) + 1, 500],
            axis_config={"color": WHITE},
            tips=False,
        ).scale(0.75).shift(0.5 * DOWN)
        axes.add_coordinates()
        plot_cpu = axes.plot_line_graph(
            x_values=N,
            y_values=cpu_times,
            line_color=RED,
            vertex_dot_style=dict(stroke_width=3, fill_color=RED),
            stroke_width=5,
        )
        plot_gpu = axes.plot_line_graph(
            x_values=N,
            y_values=gpu_times,
            line_color=GREEN,
            vertex_dot_style=dict(stroke_width=3, fill_color=GREEN),
            stroke_width=10,
        )
        x_label = axes.get_x_axis_label(r"\text{Matrix Size}")
        y_label = axes.get_y_axis_label(r"\text{Time (in seconds)}")

        self.play(Create(axes), Create(x_label), Create(y_label))
        self.play(Write(plot_cpu), run_time=1)
        self.wait(1)

        text_2 = Tex(r"CPU", color=RED).scale(0.85).next_to(plot_cpu, UP).shift(2 * RIGHT + DOWN)
        text_3 = Tex(r"GPU", color=GREEN).scale(0.85).next_to(plot_gpu, UP).shift(2 * RIGHT)
        self.play(Write(plot_gpu), Write(text_2), Write(text_3))
        self.wait(1)

        title_ = Title("Computational Cost for Matrix of size 8000")
        self.play(
            ReplacementTransform(title, title_),
            FadeOut(axes, plot_cpu, plot_gpu, x_label, y_label, text_2, text_3),
        )
        self.wait(1)

        chart = BarChart(
            values=[cpu_times[-1], gpu_times[-1]],
            bar_names=["CPU", "GPU"],
            y_range=[0, 2000, 100],
            y_length=6,
            x_length=10,
            x_axis_config={"font_size": 36},
        ).scale(0.9).to_edge(DOWN)

        # Only these parts of the chart are animated; the chart itself is
        # never added to the scene.
        bar_names = chart.x_axis.labels
        bars = chart.bars
        bars[0].color = RED
        bars[1].color = GREEN
        c_bar_lbls = chart.get_bar_labels(font_size=48)

        self.play(Create(bar_names[0]), Create(bar_names[1]))
        self.play(Create(bars[0]))
        self.play(Write(c_bar_lbls[0]))
        self.play(Create(bars[1]))
        self.play(Write(c_bar_lbls[1]))
        self.wait(1)

        # Derive the headline number from the plotted data so the two cannot
        # drift apart (truncates to 2678 for the timings above).
        speedup = int(cpu_times[-1] / gpu_times[-1])
        text_5 = Tex(rf"GPU is {speedup} times faster than CPU!")
        self.play(ReplacementTransform(VGroup(bar_names, bars, c_bar_lbls), text_5))
        self.wait(1)
        self._fade_out_everything()

    def _show_questions(self):
        """Write the four guiding questions one at a time."""
        # 4 Questions
        title = Title("4 simple questions...")
        self.play(Write(title))
        self.wait(1)

        questions = BulletedList(
            "What is the big-picture difference between a CPU and a GPU?",
            "Why do applications run faster on a GPU, and can a GPU speed up anything?",
            "How can we write our own matrix multiplication code that runs on a GPU?",
            "Are we getting the best out of the GPU?",
        ).scale(0.75)
        for index in range(4):
            self.play(Write(questions[index]))
            self.wait(1)

        self._fade_out_everything()

                                                                                                                                                                                                               

# Introduction

In [209]:
%%manim -qk -v WARNING Matmul_Algorithm

class Matmul_Algorithm(Scene):
    """Explain matrix indexing, then how each entry of C is formed from A and B."""

    def construct(self):
        def bracket_matrix(rows, h_buff=1.5):
            """Square-bracketed ``Matrix`` of MathTex entries with ``v_buff=1``."""
            return Matrix(
                rows,
                left_bracket="[",
                right_bracket="]",
                element_to_mobject=MathTex,
                element_alignment_corner=ORIGIN,
                v_buff=1,
                h_buff=h_buff,
            )

        def symbolic(letter):
            """4x4 symbolic matrix with indices 0, 1, dots, n-1 in each direction."""
            labels = ("0", "1", None, "n-1")  # None marks the elided row/column
            rows = []
            for r in labels:
                if r is None:
                    rows.append(["\\vdots", "\\vdots", "\\ddots", "\\vdots"])
                else:
                    rows.append(
                        ["\\cdots" if c is None else f"{letter}_{{{r},{c}}}" for c in labels]
                    )
            return bracket_matrix(rows)

        def fresh_rects(row, col):
            """Blue rectangle on row ``row`` of A and green one on column ``col`` of B."""
            return (
                SurroundingRectangle(A.get_rows()[row], color=BLUE),
                SurroundingRectangle(B.get_columns()[col], color=GREEN),
            )

        # Explain matrix
        A = symbolic("a")
        self.play(Write(A))
        self.wait(1)

        entries = A.get_entries()
        # Row labels sit to the left of the first entry of each row.
        row_marks = [
            Tex(text).scale(0.75).next_to(entries[first].get_center(), LEFT).shift(LEFT)
            for text, first in ((r"0", 0), (r"1", 4), (r"$\vdots$", 8), (r"n-1", 12))
        ]
        text_1 = Tex(r"Row Index").scale(0.75).next_to(VGroup(*row_marks), LEFT)
        self.play(Write(text_1), *[Write(mark) for mark in row_marks])
        self.wait(1)

        # Column labels sit above the entries of the first row.
        col_marks = [
            Tex(text).scale(0.75).next_to(entries[top].get_center(), UP).shift(0.5 * UP)
            for text, top in ((r"0", 0), (r"1", 1), (r"$\cdots$", 2), (r"n-1", 3))
        ]
        # NOTE(review): this label is scaled by 0.75 twice while "Row Index" is
        # scaled once -- kept as-is; confirm whether the smaller size is intended.
        text_2 = Tex(r"Column Index").scale(0.75).scale(0.75).next_to(VGroup(*col_marks), UP)
        self.play(Write(text_2), *[Write(mark) for mark in col_marks])
        self.wait(1)

        self.play(FadeOut(text_1, *row_marks, text_2, *col_marks))

        # Matrix Multiplication
        self.play(A.animate.scale(0.75).to_edge(DOWN).to_edge(LEFT).shift(0.5 * (UP + RIGHT)))

        # C and B are positioned relative to A's final position.
        C = symbolic("c").scale(0.75).next_to(A, RIGHT).shift(0.5 * RIGHT)
        B = symbolic("b").scale(0.75).next_to(C, UP).shift(0.5 * UP)

        self.play(Write(B))
        self.wait(1)
        self.play(Write(C[1:]))
        self.wait(1)

        rect_0 = SurroundingRectangle(C.get_entries()[0])
        rect_1, rect_2 = fresh_rects(0, 0)
        for rect in (rect_0, rect_1, rect_2):
            self.play(Create(rect))
            self.wait(1)

        # Row 0 of A and column 0 of B, both drawn as column vectors above A.
        A_row0 = bracket_matrix(
            [["a_{0,0}"], ["a_{0,1}"], ["\\vdots"], ["a_{0,n-1}"]]
        ).scale(0.75).next_to(A, UP).shift(0.5 * UP + LEFT)
        A_row0[1:].color = BLUE

        B_col0 = bracket_matrix(
            [["b_{0,0}"], ["b_{1,0}"], ["\\vdots"], ["b_{n-1,0}"]]
        ).scale(0.75).next_to(A, UP).shift(0.5 * UP + RIGHT)
        B_col0[1:].color = GREEN
        self.play(ReplacementTransform(rect_1, A_row0), ReplacementTransform(rect_2, B_col0))
        self.wait(1)

        # Element-wise products.
        AB = bracket_matrix(
            [
                ["a_{0,0} \\cdot b_{0,0}"],
                ["a_{0,1} \\cdot b_{1,0}"],
                ["\\vdots"],
                ["a_{0,n-1} \\cdot b_{n-1,0}"],
            ]
        ).scale(0.75).next_to(A, UP).shift(0.5 * UP)
        AB[1:].color = YELLOW

        self.play(ReplacementTransform(VGroup(A_row0, B_col0), AB))
        self.wait(1)

        # Sum of the products, laid out as a single row.
        C_val = bracket_matrix(
            [["a_{0,0} \\cdot b_{0,0}", "+", "\\cdots", "+", "a_{0,n-1} \\cdot b_{n-1,0}"]],
            h_buff=2,
        ).scale(0.55).next_to(A, UP).shift(2 * UP)
        C_val[1:].color = YELLOW

        self.play(ReplacementTransform(AB, C_val))
        self.wait(1)

        self.play(FadeOut(rect_0), ReplacementTransform(C_val, C[0][0]))

        # Remaining 15 entries in row-major order; (0, 0) was handled above.
        for flat in range(1, 16):
            i, j = divmod(flat, 4)
            rect_1, rect_2 = fresh_rects(i, j)
            self.play(Create(rect_1), Create(rect_2))
            self.play(ReplacementTransform(VGroup(rect_1, rect_2), C[0][flat]))

        self.play(*[FadeOut(mob) for mob in self.mobjects])
        self.wait(1)

                                                                                                               

In [210]:
%%manim -qk -v WARNING Matmul_Code

class Matmul_Code(Scene):
    """Reveal the C matmul source next to an animation of one CPU thread doing the work."""

    def construct(self):
        def symbolic(letter):
            """4x4 symbolic matrix with indices 0, 1, dots, n-1 in each direction."""
            labels = ("0", "1", None, "n-1")  # None marks the elided row/column
            rows = []
            for r in labels:
                if r is None:
                    rows.append(["\\vdots", "\\vdots", "\\ddots", "\\vdots"])
                else:
                    rows.append(
                        ["\\cdots" if c is None else f"{letter}_{{{r},{c}}}" for c in labels]
                    )
            return Matrix(
                rows,
                left_bracket="[",
                right_bracket="]",
                element_to_mobject=MathTex,
                element_alignment_corner=ORIGIN,
                v_buff=1,
                h_buff=1.5,
            )

        # Code
        matmul_code = Code(
            file_name="cpu_mat_mul.c",
            language="C",
            font="Monospace",
            insert_line_no=False,
            style="dracula",
            line_spacing=1,
        ).scale(0.4).to_edge(RIGHT)
        matmul_code.code = remove_invisible_chars(matmul_code.code)
        listing = matmul_code.code
        self.play(Create(matmul_code[0]))
        self.wait(1)

        A = symbolic("a").scale(0.5).to_edge(DOWN).to_edge(LEFT).shift(0.5 * (UP + RIGHT) + 0.75 * UP)
        C = symbolic("c").scale(0.5).next_to(A, RIGHT).shift(0.5 * RIGHT)
        B = symbolic("b").scale(0.5).next_to(C, UP).shift(0.5 * UP)

        self.play(Write(A), Write(B), Write(C[1:]), Write(listing[:2]))
        self.wait(1)

        # CPU sketch: a 2x2 grid of squares, each placed relative to the
        # square to its left, or the one above it for the first column.
        CPU_cores = VGroup()
        for slot in range(4):
            row, col = divmod(slot, 2)
            core = Square(side_length=0.5, color=BLUE)
            if col > 0:
                core.next_to(CPU_cores[slot - 1], RIGHT)
            elif row > 0:
                core.next_to(CPU_cores[slot - 2], DOWN)
            CPU_cores.add(core)
        box = SurroundingRectangle(CPU_cores, color=GREEN, buff=0.2)
        CPU = VGroup(CPU_cores, box).next_to(A, UP).shift(1 * UP)
        text_1 = Tex(r"CPU Cores", color=BLUE).scale(0.75).next_to(CPU, DOWN)
        self.play(Create(CPU), Write(text_1))
        self.wait(1)

        CPU_thread = Arrow(start=0.9 * UP, end=0.9 * DOWN, color=BLUE_A).move_to(box)
        text_2 = Tex(r"CPU Thread", color=BLUE_A).scale(0.75).next_to(CPU, DOWN)
        self.play(ReplacementTransform(CPU_cores, CPU_thread), ReplacementTransform(text_1, text_2))
        self.wait(1)

        # Small arrow marking which entry of C is being computed.
        cpu_thread = Arrow(start=0.2 * UP, end=0.2 * DOWN, color=BLUE_A).move_to(C.get_entries()[0])
        self.play(Create(cpu_thread))
        self.wait(1)

        rect_1 = SurroundingRectangle(A.get_rows()[0], color=BLUE)
        rect_2 = SurroundingRectangle(B.get_columns()[0], color=GREEN)
        self.play(Write(listing[2:8]), Create(rect_1), Create(rect_2))
        self.wait(1)

        # First entry: circles walk along row 0 of A and column 0 of B.
        circ_k1 = Circle(radius=0.25).move_to(A.get_entries()[0])
        circ_k2 = Circle(radius=0.25).move_to(B.get_entries()[0])
        self.play(Write(listing[8:16]), Create(circ_k1), Create(circ_k2))
        self.wait(1)
        for k in range(1, 4):
            self.play(
                circ_k1.animate.move_to(A.get_entries()[k]),
                circ_k2.animate.move_to(B.get_entries()[k * 4]),
            )
        self.play(ReplacementTransform(VGroup(circ_k1, circ_k2), C[0][0]), Write(listing[16:]))
        self.wait(1)

        # Remaining 15 entries in row-major order; (0, 0) was handled above.
        for flat in range(1, 16):
            i, j = divmod(flat, 4)
            self.play(
                cpu_thread.animate.move_to(C.get_entries()[flat]),
                rect_1.animate.move_to(A.get_rows()[i]),
                rect_2.animate.move_to(B.get_columns()[j]),
            )
            circ_k1 = Circle(radius=0.25).move_to(A.get_entries()[i * 4])
            circ_k2 = Circle(radius=0.25).move_to(B.get_entries()[j])
            self.play(Create(circ_k1), Create(circ_k2))
            # The k = 0 step targets the circles' current positions, so it
            # plays as a one-step hold before the walk starts.
            for k in range(4):
                self.play(
                    circ_k1.animate.move_to(A.get_entries()[i * 4 + k]),
                    circ_k2.animate.move_to(B.get_entries()[k * 4 + j]),
                )

            self.play(ReplacementTransform(VGroup(circ_k1, circ_k2), C[0][flat]))

        self.play(*[FadeOut(mob) for mob in self.mobjects])
        self.wait(1)

                                                                                                                    

In [6]:
%%manim -qk -v WARNING Matmul_Analysis_P1

class Matmul_Analysis_P1(Scene):
    """Plot the CPU timings against matrix size and call out the largest one."""

    def construct(self):
        sizes = list(range(1000, 8001, 500))
        cpu_times = [
            2.365712, 7.728049, 19.107392, 48.998530, 85.209923,
            155.137907, 194.911457, 352.963378, 440.474379, 652.339182,
            713.283468, 1071.591118, 1260.187076, 1610.661108, 1680.444971,
        ]

        axes = Axes(
            x_range=[0, max(sizes), 1000],
            y_range=[0, 2000, 500],
            axis_config={"color": WHITE},
            tips=False,
        )
        axes.scale(0.95)
        axes.add_coordinates()

        plot_cpu = axes.plot_line_graph(
            x_values=sizes,
            y_values=cpu_times,
            line_color=RED,
            vertex_dot_style=dict(stroke_width=3, fill_color=RED),
            stroke_width=5,
        )
        x_label = axes.get_x_axis_label(r"\text{Matrix Size}")
        y_label = axes.get_y_axis_label(r"\text{Time (in seconds)}")

        self.play(Create(axes), Create(x_label), Create(y_label))
        self.play(Write(plot_cpu), run_time=1)
        self.wait(1)

        # Callout placed above the curve and pushed to the right-hand end.
        text_1 = Tex(r"30 minutes!", color=RED).scale(0.75)
        text_1.next_to(plot_cpu, UP).shift(5 * RIGHT)
        self.play(Write(text_1))
        self.wait(1)

        fade_outs = [FadeOut(mob) for mob in self.mobjects]
        self.play(*fade_outs)
        self.wait(1)

                                                                                                                                                                                                      

In [7]:
%%manim -qk -v WARNING Matmul_Analysis_P2

class Matmul_Analysis_P2(Scene):
    """Show the entries of C being produced in batches from rows of A and columns of B.

    Phase one fills the four entries of row 0 of C at once; phase two fills
    rows 1-3 of C at once.
    """

    def construct(self):
        def symbolic(letter):
            """4x4 symbolic matrix with indices 0, 1, dots, n-1 in each direction."""
            labels = ("0", "1", None, "n-1")  # None marks the elided row/column
            rows = []
            for r in labels:
                if r is None:
                    rows.append(["\\vdots", "\\vdots", "\\ddots", "\\vdots"])
                else:
                    rows.append(
                        ["\\cdots" if c is None else f"{letter}_{{{r},{c}}}" for c in labels]
                    )
            return Matrix(
                rows,
                left_bracket="[",
                right_bracket="]",
                element_to_mobject=MathTex,
                element_alignment_corner=ORIGIN,
                v_buff=1,
                h_buff=1.5,
            )

        def column_rects():
            """One green rectangle around each of B's four displayed columns."""
            return [SurroundingRectangle(B.get_columns()[j], color=GREEN) for j in range(4)]

        A = symbolic("a").scale(0.75).to_edge(DOWN).to_edge(LEFT).shift(0.5 * (UP + RIGHT))
        C = symbolic("c").scale(0.75).next_to(A, RIGHT).shift(0.5 * RIGHT)
        B = symbolic("b").scale(0.75).next_to(C, UP).shift(0.5 * UP)

        self.play(Write(A), Write(B), Write(C[1:]))
        self.wait(1)

        rect_0 = SurroundingRectangle(C.get_rows()[0])
        self.play(Create(rect_0))
        self.wait(1)

        # Phase 1: four overlapping copies of the row-0 rectangle, one per
        # column of B, so each (row, column) pair can morph into its own entry.
        row0_rects = [SurroundingRectangle(A.get_rows()[0], color=BLUE) for _ in range(4)]
        col_rects = column_rects()
        self.play(Create(VGroup(*row0_rects, *col_rects)), FadeOut(rect_0))
        self.wait(1)
        self.play(
            *[
                ReplacementTransform(VGroup(row_rect, col_rect), C[0][j])
                for j, (row_rect, col_rect) in enumerate(zip(row0_rects, col_rects))
            ]
        )
        self.wait(1)

        # Phase 2: one rectangle per remaining row of A, plus all columns of B.
        row_rects = [SurroundingRectangle(A.get_rows()[i], color=BLUE) for i in (1, 2, 3)]
        col_rects = column_rects()
        self.play(Create(VGroup(*row_rects, *col_rects)))
        self.wait(1)

        # C[0] holds the 16 entries in row-major order, so row i of C is
        # C[0][4*i : 4*i + 4]. Each row rectangle morphs into its own row of C.
        # The four column rectangles are spread over the three targets so
        # that no transform ends on an empty slice.
        c_entries = C[0]
        self.play(
            ReplacementTransform(VGroup(row_rects[0], col_rects[0]), c_entries[4:8]),
            ReplacementTransform(VGroup(row_rects[1], col_rects[1]), c_entries[8:12]),
            ReplacementTransform(
                VGroup(row_rects[2], col_rects[2], col_rects[3]), c_entries[12:16]
            ),
        )
        self.wait(1)

        self.play(*[FadeOut(mob) for mob in self.mobjects])
        self.wait(1)

                                                                                                                  

In [9]:
%%manim -qk -v WARNING Matmul_Analysis_P3

class Matmul_Analysis_P3(Scene):
    """Sketch GPU cores and threads, then give each displayed entry of C its own thread."""

    def construct(self):
        def symbolic(letter):
            """4x4 symbolic matrix with indices 0, 1, dots, 999 in each direction."""
            labels = ("0", "1", None, "999")  # None marks the elided row/column
            rows = []
            for r in labels:
                if r is None:
                    rows.append(["\\vdots", "\\vdots", "\\ddots", "\\vdots"])
                else:
                    rows.append(
                        ["\\cdots" if c is None else f"{letter}_{{{r},{c}}}" for c in labels]
                    )
            return Matrix(
                rows,
                left_bracket="[",
                right_bracket="]",
                element_to_mobject=MathTex,
                element_alignment_corner=ORIGIN,
                v_buff=1,
                h_buff=1.5,
            )

        title = Title("GPU")
        self.play(Write(title))
        self.wait(1)

        # GPU sketch: 10x10 grid, each square placed relative to its left
        # neighbour, or the square above it for the first column.
        GPU_cores = VGroup()
        for slot in range(10 * 10):
            row, col = divmod(slot, 10)
            core = Square(side_length=0.2, color=RED)
            if col > 0:
                core.next_to(GPU_cores[slot - 1], RIGHT)
            elif row > 0:
                core.next_to(GPU_cores[slot - 10], DOWN)
            GPU_cores.add(core)
        box = SurroundingRectangle(GPU_cores, color=GREEN, buff=0.3)
        GPU = VGroup(GPU_cores, box).move_to(ORIGIN)
        text_1 = Tex(r"GPU Cores", color=RED).next_to(GPU, DOWN)
        self.play(Create(GPU), Write(text_1))
        self.wait(1)
        self.play(VGroup(GPU, text_1).animate.to_edge(RIGHT), FadeOut(title))
        self.wait(1)

        # GPU Threads: 19x46 grid of small arrows anchored at the first core;
        # the shifts after next_to pull neighbouring arrows closer together.
        GPU_threads = VGroup()
        for slot in range(19 * 46):
            row, col = divmod(slot, 46)
            thread = Arrow(start=0.1 * UP, end=0.1 * DOWN, color=ORANGE)
            if col > 0:
                thread.next_to(GPU_threads[slot - 1], RIGHT).shift(0.2 * LEFT)
            elif row > 0:
                thread.next_to(GPU_threads[slot - 46], DOWN).shift(0.2 * UP)
            else:
                thread.move_to(GPU_cores[0]).shift(0.22 * LEFT + 0.23 * UP)
            GPU_threads.add(thread)
        title = Title("GPU can spawn millions of threads")
        text_3 = Tex(r"GPU Threads", color=ORANGE).next_to(GPU, DOWN)
        self.play(
            Write(title),
            ReplacementTransform(GPU_cores, GPU_threads),
            ReplacementTransform(text_1, text_3),
        )
        self.wait(1)
        text_2 = Tex(r"Threads work independently in parallel").scale(0.9).to_edge(LEFT)
        self.play(Write(text_2))
        self.wait(1)
        self.play(FadeOut(text_2))
        self.wait(1)

        # Parallel matmul algo
        A = symbolic("a").scale(0.5).to_edge(DOWN).to_edge(LEFT).shift(0.5 * (UP + RIGHT) + 0.75 * UP)
        C = symbolic("c").scale(0.5).next_to(A, RIGHT).shift(0.5 * RIGHT)
        B = symbolic("b").scale(0.5).next_to(C, UP).shift(0.5 * UP)

        self.play(Write(A), Write(B), Write(C[1:]))
        self.wait(1)

        c_entries = C.get_entries()
        # One arrow per concrete entry of C; row/column 2 hold the dots and
        # are written as plain text instead.
        matrix_threads = VGroup(
            *[
                Arrow(start=0.15 * UP, end=0.15 * DOWN, color=ORANGE).move_to(c_entries[i * 4 + j])
                for i in range(4)
                for j in range(4)
                if i != 2 and j != 2
            ]
        )
        dots = VGroup(c_entries[2], c_entries[6], c_entries[8:12], c_entries[14])
        self.play(Create(matrix_threads), Write(dots))
        self.wait(1)

        # A single thread first ...
        rect_1 = SurroundingRectangle(A.get_rows()[0], color=BLUE)
        rect_2 = SurroundingRectangle(B.get_columns()[0], color=GREEN)
        self.play(Create(rect_1), Create(rect_2))
        self.wait(1)
        self.play(ReplacementTransform(VGroup(rect_1, rect_2, matrix_threads[0]), c_entries[0]))
        self.wait(1)

        # ... then every remaining thread at once.
        rect_1 = SurroundingRectangle(A.get_rows(), color=BLUE)
        rect_2 = SurroundingRectangle(B.get_columns(), color=GREEN)
        self.play(Create(rect_1), Create(rect_2))
        self.wait(1)
        self.play(ReplacementTransform(VGroup(rect_1, rect_2, matrix_threads[1:]), c_entries[1:]))
        self.wait(1)

        self.play(*[FadeOut(mob) for mob in self.mobjects])
        self.wait(1)

                                                                                                                       

In [10]:
%%manim -qk -v WARNING CPU_v_GPU

class CPU_v_GPU(Scene):
    """Scene showing a CPU/GPU image underneath a title that is swapped once."""

    def construct(self):
        """Write the title, fade in the image, replace the title, then clear the scene."""
        heading = Title("CPU vs GPU")
        self.play(Write(heading))
        self.wait(1)

        # The picture is read from "GPUvCPU.png" and displayed at 75% size.
        picture = ImageMobject("GPUvCPU.png")
        picture.scale(0.75)
        self.play(FadeIn(picture))
        self.wait(1)

        takeaway = Title("CPU moves data and assigns tasks to the GPU")
        self.play(ReplacementTransform(heading, takeaway))
        self.wait(1)

        # Fade out everything that is still registered with the scene.
        fade_everything = [FadeOut(mob) for mob in self.mobjects]
        self.play(*fade_everything)
        self.wait(1)

                                                                                                                                 

# Parallel Matrix Multiplication

In [11]:
%%manim -qk -v WARNING CUDA

class CUDA(Scene):
    """Overview scene pairing a CPU/GPU block diagram with the ``main.cu`` listing."""

    def construct(self):
        """Draw the host and device diagrams and reveal ``main.cu`` in stages.

        Each stage writes a range of listing lines and then animates a change
        in the diagram. The CPU/GPU boxes, the host A/B boxes, the host C frame
        and the CPU thread arrow are left on screen when the scene ends.
        """

        def boxed_letter(letter, colour, home, offset=None):
            """Build a framed letter centred on ``home`` and optionally shifted.

            Returns the letter, its frame, and the group holding both.
            """
            glyph = Tex(letter, color=colour).scale(0.5)
            frame = SurroundingRectangle(glyph, color=colour, buff=0.1)
            pair = VGroup(glyph, frame).move_to(home)
            if offset is not None:
                pair.shift(offset)
            return glyph, frame, pair

        def gpu_arrow():
            """Return one short downward arrow standing for a GPU thread."""
            return Arrow(start=0.5*UP, end=0.5*DOWN, color=GREEN_B)

        # Title
        heading = Title("Compute Unified Device Architecture")
        self.play(Write(heading))

        # CPU: a core box with a RAM box underneath, both captioned
        cpu_cores = Rectangle(height=3, width=3, color=BLUE).shift(4*LEFT)
        cpu_caption = Tex(r"CPU", color=BLUE).scale(0.75).next_to(cpu_cores, UP)
        RAM = Rectangle(height=1, width=3, color=BLUE_A).next_to(cpu_cores, DOWN)
        ram_caption = Tex(r"RAM", color=BLUE_A).scale(0.75).next_to(RAM, DOWN)
        self.play(Create(VGroup(cpu_cores, RAM, cpu_caption, ram_caption)))
        self.wait(1)

        # GPU: same layout, placed to the right of the CPU
        gpu_cores = Rectangle(height=3, width=3, color=GREEN).next_to(cpu_cores, RIGHT)
        gpu_caption = Tex(r"GPU", color=GREEN).scale(0.75).next_to(gpu_cores, UP)
        VRAM = Rectangle(height=1, width=3, color=GREEN_A).next_to(gpu_cores, DOWN)
        vram_caption = Tex(r"VRAM", color=GREEN_A).scale(0.75).next_to(VRAM, DOWN)
        self.play(Create(VGroup(gpu_cores, VRAM, gpu_caption, vram_caption)))
        self.wait(1)

        # Code listing, docked under the title at the right edge
        main_code = Code(
            file_name="main.cu",
            language="CUDA",
            font="Monospace",
            insert_line_no=False,
            style="dracula",
            line_spacing=1,
        ).scale(0.35).next_to(heading, DOWN).to_edge(RIGHT)
        listing = remove_invisible_chars(main_code.code)
        main_code.code = listing
        cpu_thread = Arrow(start=1*UP, end=1*DOWN, color=BLUE_B).move_to(cpu_cores)
        self.play(Create(main_code[0]), Write(listing[:10]), Create(cpu_thread))
        self.wait(1)

        # Host matrices inside the RAM box (only the frame of C is drawn for now)
        _, h_A_box, A_h = boxed_letter(r"A", BLUE_A, RAM, LEFT)
        _, h_B_box, B_h = boxed_letter(r"B", BLUE_A, RAM)
        h_C, h_C_box, _ = boxed_letter(r"C", BLUE_A, RAM, RIGHT)
        self.play(Write(listing[10:25]))
        self.play(Create(A_h), Create(B_h), Create(h_C_box))
        self.wait(1)

        # Device matrices inside the VRAM box: frames first, letters after the copy
        d_A, d_A_box, A_d = boxed_letter(r"A", GREEN_A, VRAM, LEFT)
        d_B, d_B_box, B_d = boxed_letter(r"B", GREEN_A, VRAM)
        self.play(Write(listing[25:27]))
        self.play(Create(d_A_box), Create(d_B_box))
        self.wait(1)

        # Curved arrows from each host frame to the matching device frame
        copy_arrows = [
            CurvedArrow(src.get_edge_center(DOWN), dst.get_edge_center(DOWN), color=BLUE_A)
            for src, dst in ((h_A_box, d_A_box), (h_B_box, d_B_box))
        ]
        self.play(Write(listing[27:29]))
        self.play(*[Create(arrow) for arrow in copy_arrows])
        self.play(Create(d_A), Create(d_B))
        self.play(*[FadeOut(arrow) for arrow in copy_arrows])
        self.wait(1)

        # Three rows of 23 thread arrows; every row starts below the head of the
        # previous row and grows to the right.
        gpu_threads = VGroup()
        row_head = None
        for _ in range(3):
            if row_head is None:
                row_head = gpu_arrow().move_to(gpu_cores).shift(1.4*LEFT)
            else:
                row_head = gpu_arrow().next_to(row_head, DOWN).shift(0.2*UP)
            gpu_threads.add(row_head)
            for _ in range(22):
                gpu_threads.add(gpu_arrow().next_to(gpu_threads[-1], RIGHT).shift(0.25*LEFT))

        kernel_caption = Tex(r"Parallel Matrix Multiplication").scale(0.4).next_to(gpu_threads, UP)
        kernel_note = Tex(r"(Kernel Function)").scale(0.5).next_to(kernel_caption, UP)
        self.play(Write(listing[29:31]))
        self.play(
            Create(gpu_threads),
            Write(VGroup(kernel_caption, kernel_note)),
            cpu_thread.animate.set_opacity(0.3),
        )
        self.wait(1)

        # The captions morph into the device copy of C
        _, d_C_box, C_d = boxed_letter(r"C", GREEN_A, VRAM, RIGHT)
        self.play(ReplacementTransform(VGroup(kernel_caption, kernel_note), C_d))
        self.wait(1)

        # Copy back: arrow from device C to host C, and the host letter appears
        self.play(Write(listing[31:]))
        vram_ram = CurvedArrow(d_C_box.get_edge_center(UP), h_C_box.get_edge_center(UP), color=BLUE_A)
        self.play(
            cpu_thread.animate.set_opacity(1),
            gpu_threads.animate.set_opacity(0.3),
            Create(vram_ram),
            Create(h_C),
        )
        self.wait(1)

        self.play(FadeOut(heading, main_code, vram_ram, h_C, gpu_threads, A_d, B_d, C_d))
        self.wait(1)

                                                                                                                   

In [12]:
%%manim -qk -v WARNING Mem_Alloc

class Mem_Alloc(Scene):
    """Scene that reveals ``gpu_mem_alloc.cu`` piece by piece next to the CPU/GPU diagram."""

    def construct(self):
        """Add the static diagram, then write the listing and two bullet notes in stages.

        The slices taken from line 15 of the listing are disjoint, so every
        glyph of that line is written by exactly one animation.
        """
        # CPU
        cpu_cores = Rectangle(height=3, width=3, color=BLUE).shift(4*LEFT)
        text_1 = Tex(r"CPU", color=BLUE).scale(0.75).next_to(cpu_cores, UP)
        RAM = Rectangle(height=1, width=3, color=BLUE_A).next_to(cpu_cores, DOWN)
        text_2 = Tex(r"RAM", color=BLUE_A).scale(0.75).next_to(RAM, DOWN)
        CPU = VGroup(cpu_cores, RAM, text_1, text_2)
        h_A = Tex(r"A", color=BLUE_A).scale(0.5)
        h_A_box = SurroundingRectangle(h_A, color=BLUE_A, buff=0.1)
        A_h = VGroup(h_A, h_A_box).move_to(RAM).shift(LEFT)
        h_B = Tex(r"B", color=BLUE_A).scale(0.5)
        h_B_box = SurroundingRectangle(h_B, color=BLUE_A, buff=0.1)
        B_h = VGroup(h_B, h_B_box).move_to(RAM)
        h_C = Tex(r"C", color=BLUE_A).scale(0.5)
        h_C_box = SurroundingRectangle(h_C, color=BLUE_A, buff=0.1)
        # Grouping is only needed to position the C letter and its frame together.
        VGroup(h_C, h_C_box).move_to(RAM).shift(RIGHT)

        cpu_thread = Arrow(start=1*UP, end=1*DOWN, color=BLUE_B).move_to(cpu_cores)

        # GPU
        gpu_cores = Rectangle(height=3, width=3, color=GREEN).next_to(cpu_cores, RIGHT)
        text_3 = Tex(r"GPU", color=GREEN).scale(0.75).next_to(gpu_cores, UP)
        VRAM = Rectangle(height=1, width=3, color=GREEN_A).next_to(gpu_cores, DOWN)
        text_4 = Tex(r"VRAM", color=GREEN_A).scale(0.75).next_to(VRAM, DOWN)
        GPU = VGroup(gpu_cores, VRAM, text_3, text_4)
        # Only the device frames are drawn in this scene; the letters exist so
        # that each frame gets its size and position.
        d_A = Tex(r"A", color=GREEN_A).scale(0.5)
        d_A_box = SurroundingRectangle(d_A, color=GREEN_A, buff=0.1)
        VGroup(d_A, d_A_box).move_to(VRAM).shift(LEFT)
        d_B = Tex(r"B", color=GREEN_A).scale(0.5)
        d_B_box = SurroundingRectangle(d_B, color=GREEN_A, buff=0.1)
        VGroup(d_B, d_B_box).move_to(VRAM)
        d_C = Tex(r"C", color=GREEN_A).scale(0.5)
        d_C_box = SurroundingRectangle(d_C, color=GREEN_A, buff=0.1)
        VGroup(d_C, d_C_box).move_to(VRAM).shift(RIGHT)

        # Static starting picture (no animation)
        self.add(CPU, GPU, A_h, B_h, h_C_box, cpu_thread)
        self.wait(1)

        # Title
        title = Title("GPU Memory Allocation")
        self.play(Write(title))
        self.wait(1)

        # Code
        mem_alloc_code = Code(file_name="gpu_mem_alloc.cu", language="CUDA", font="Monospace", insert_line_no=False,
                            style="dracula", line_spacing=1).scale(0.3).next_to(title, DOWN).to_edge(RIGHT)
        mem_alloc_code.code = remove_invisible_chars(mem_alloc_code.code)
        listing = mem_alloc_code.code
        call_line = listing[15]
        bp = BulletedList("1st parameter: The address of the pointer variable for the select matrix.",
                        "2nd parameter: The size of the data to be allocated (in the number of bytes).").scale(0.35).next_to(mem_alloc_code, DOWN)

        # Glyphs 17..26 of the call line first.
        self.play(Create(mem_alloc_code[0]), Write(listing[14]), Write(call_line[17:27]))
        self.wait(1)
        # Then the isolated glyphs at 27 and 40 plus the last two glyphs
        # (presumably the call's delimiters; verify against gpu_mem_alloc.cu).
        self.play(Write(call_line[27]), Write(call_line[-2:]), Write(call_line[40]))
        self.wait(1)
        # Glyph 27 is already on screen, so the first argument starts at 28;
        # including 27 again would redraw a glyph that was just written.
        self.play(Write(listing[9:11]), Write(call_line[28:40]), Write(bp[0]))
        self.wait(1)
        # Likewise glyph 40 is already on screen, so the second argument starts at 41.
        self.play(Write(call_line[41:-2]), Write(bp[1]))
        self.wait(1)
        self.play(Write(call_line[:17]))
        self.wait(1)
        self.play(Write(listing[16]), Write(listing[:8]))
        self.wait(1)
        self.play(Write(listing[11:13]), Write(listing[17:]))
        self.play(Create(d_A_box), Create(d_B_box), Create(d_C_box))
        self.wait(1)

        self.play(FadeOut(title, mem_alloc_code, bp))
        self.wait(1)

                                                                                                      

In [13]:
%%manim -qk -v WARNING Data_Copy

class Data_Copy(Scene):
    """Scene that reveals ``data_tr.cu`` in stages beside the CPU/GPU diagram."""

    def construct(self):
        """Rebuild the static diagram, then animate the listing, bullets and copy arrows."""

        def boxed_letter(letter, colour, home, offset=None):
            """Build a framed letter centred on ``home`` and optionally shifted.

            Returns the letter, its frame, and the group holding both.
            """
            glyph = Tex(letter, color=colour).scale(0.5)
            frame = SurroundingRectangle(glyph, color=colour, buff=0.1)
            pair = VGroup(glyph, frame).move_to(home)
            if offset is not None:
                pair.shift(offset)
            return glyph, frame, pair

        def gpu_arrow():
            """Return one short downward arrow standing for a GPU thread."""
            return Arrow(start=0.5*UP, end=0.5*DOWN, color=GREEN_B)

        # CPU
        cpu_cores = Rectangle(height=3, width=3, color=BLUE).shift(4*LEFT)
        cpu_caption = Tex(r"CPU", color=BLUE).scale(0.75).next_to(cpu_cores, UP)
        RAM = Rectangle(height=1, width=3, color=BLUE_A).next_to(cpu_cores, DOWN)
        ram_caption = Tex(r"RAM", color=BLUE_A).scale(0.75).next_to(RAM, DOWN)
        CPU = VGroup(cpu_cores, RAM, cpu_caption, ram_caption)
        _, h_A_box, A_h = boxed_letter(r"A", BLUE_A, RAM, LEFT)
        _, h_B_box, B_h = boxed_letter(r"B", BLUE_A, RAM)
        h_C, h_C_box, _ = boxed_letter(r"C", BLUE_A, RAM, RIGHT)

        cpu_thread = Arrow(start=1*UP, end=1*DOWN, color=BLUE_B).move_to(cpu_cores)

        # GPU
        gpu_cores = Rectangle(height=3, width=3, color=GREEN).next_to(cpu_cores, RIGHT)
        gpu_caption = Tex(r"GPU", color=GREEN).scale(0.75).next_to(gpu_cores, UP)
        VRAM = Rectangle(height=1, width=3, color=GREEN_A).next_to(gpu_cores, DOWN)
        vram_caption = Tex(r"VRAM", color=GREEN_A).scale(0.75).next_to(VRAM, DOWN)
        GPU = VGroup(gpu_cores, VRAM, gpu_caption, vram_caption)
        d_A, d_A_box, _ = boxed_letter(r"A", GREEN_A, VRAM, LEFT)
        d_B, d_B_box, _ = boxed_letter(r"B", GREEN_A, VRAM)
        d_C, d_C_box, _ = boxed_letter(r"C", GREEN_A, VRAM, RIGHT)

        # Three rows of 23 thread arrows; every row starts below the head of the
        # previous row and grows to the right.
        gpu_threads = VGroup()
        row_head = None
        for _ in range(3):
            if row_head is None:
                row_head = gpu_arrow().move_to(gpu_cores).shift(1.4*LEFT)
            else:
                row_head = gpu_arrow().next_to(row_head, DOWN).shift(0.2*UP)
            gpu_threads.add(row_head)
            for _ in range(22):
                gpu_threads.add(gpu_arrow().next_to(gpu_threads[-1], RIGHT).shift(0.25*LEFT))
        kernel_caption = Tex(r"Parallel Matrix Multiplication").scale(0.4).next_to(gpu_threads, UP)
        kernel_note = Tex(r"(Kernel Function)").scale(0.5).next_to(kernel_caption, UP)

        # Static starting picture (no animation)
        self.add(CPU, GPU, A_h, B_h, h_C_box, cpu_thread, d_A_box, d_B_box, d_C_box)
        self.wait(1)

        # Title
        heading = Title("Data Transfer")
        self.play(Write(heading))
        self.wait(1)

        # Code
        data_tr_code = Code(
            file_name="data_tr.cu",
            language="CUDA",
            font="Monospace",
            insert_line_no=False,
            style="dracula",
            line_spacing=1,
        ).scale(0.3).next_to(heading, DOWN).to_edge(RIGHT)
        listing = remove_invisible_chars(data_tr_code.code)
        data_tr_code.code = listing
        bp = BulletedList("1st parameter: Pointer to destination.",
                        "2nd parameter: Pointer to source.",
                        "3rd parameter: Number of bytes to be copied",
                        "4th parameter: Transfer direction.").scale(0.5).next_to(data_tr_code, DOWN)

        # Listing lines 1 and 4 are always animated with identical glyph ranges.
        call_lines = (listing[1], listing[4])
        self.play(
            Create(data_tr_code[0]),
            Write(listing[0]),
            *[Write(line[18:28]) for line in call_lines],
        )
        self.wait(1)

        # Isolated glyphs at 28, 32, 34, 52 and the final two glyphs of each line
        isolated = []
        for line in call_lines:
            isolated.extend(Write(line[pos]) for pos in (28, 32, 34, 52))
            isolated.append(Write(line[-2:]))
        self.play(*isolated)
        self.wait(1)

        # One bullet per stage, followed by the glyph range lying between the
        # isolated glyphs written above.
        stage_spans = (slice(29, 32), slice(33, 34), slice(35, 52), slice(53, -2))
        for number, span in enumerate(stage_spans):
            self.play(Write(bp[number]))
            self.play(*[Write(line[span]) for line in call_lines])
            self.wait(1)

        self.play(
            *[Write(line[:18]) for line in call_lines],
            Write(listing[2]),
            Write(listing[5]),
        )
        self.wait(1)

        # Curved arrows from each host frame to the matching device frame
        copy_arrows = [
            CurvedArrow(src.get_edge_center(DOWN), dst.get_edge_center(DOWN), color=BLUE_A)
            for src, dst in ((h_A_box, d_A_box), (h_B_box, d_B_box))
        ]
        self.play(*[Create(arrow) for arrow in copy_arrows])
        self.play(Create(d_A), Create(d_B))
        self.play(*[FadeOut(arrow) for arrow in copy_arrows])
        self.wait(1)

        # Thread arrows appear, and the captions turn into the device C letter
        self.play(Write(listing[6:9]))
        self.play(
            Create(gpu_threads),
            Write(VGroup(kernel_caption, kernel_note)),
            cpu_thread.animate.set_opacity(0.3),
        )
        self.play(ReplacementTransform(VGroup(kernel_caption, kernel_note), d_C))
        self.wait(1)

        # Copy back from device C to host C
        self.play(Write(listing[9:]))
        self.wait(1)
        vram_ram = CurvedArrow(d_C_box.get_edge_center(UP), h_C_box.get_edge_center(UP), color=BLUE_A)
        self.play(cpu_thread.animate.set_opacity(1), gpu_threads.animate.set_opacity(0.3), Create(vram_ram))
        self.play(Create(h_C))
        self.wait(1)

        # Fade out everything that is still registered with the scene.
        fade_everything = [FadeOut(mob) for mob in self.mobjects]
        self.play(*fade_everything)
        self.wait(1)

                                                                                                                                              

In [14]:
# Random 10x10 operands rounded to one decimal, and their rounded product.
X0 = np.random.rand(10, 10).round(1)
W0 = np.random.rand(10, 10).round(1)
X1 = (X0 @ W0).round(1)

In [15]:
%%manim -qk -v WARNING Parallel_Execution

class Parallel_Execution(Scene):
    """Scene pairing a GPU diagram with matrices A, B and C, one thread arrow per entry of C."""

    def construct(self):
        """Draw the GPU and matrices, turn per-entry arrows into C's entries, then pose a question."""

        def small_matrix(values):
            """Return a bracketed MathTex matrix with unit buffers, scaled to 35%."""
            return Matrix(
                values,
                left_bracket="[",
                right_bracket="]",
                element_to_mobject=MathTex,
                element_alignment_corner=ORIGIN,
                v_buff=1,
                h_buff=1,
            ).scale(0.35)

        def gpu_arrow():
            """Return one short downward arrow standing for a GPU thread."""
            return Arrow(start=0.5*UP, end=0.5*DOWN, color=GREEN_B)

        # C sits in the bottom-right corner, A to its left and B above it.
        C = small_matrix(X1).to_edge(RIGHT).to_edge(DOWN)
        A = small_matrix(X0).next_to(C, LEFT)
        B = small_matrix(W0).next_to(C, UP)

        # GPU
        gpu_cores = Rectangle(height=3, width=3, color=GREEN)
        gpu_caption = Tex(r"GPU", color=GREEN).scale(0.75).next_to(gpu_cores, UP)
        VRAM = Rectangle(height=1, width=3, color=GREEN_A).next_to(gpu_cores, DOWN)
        vram_caption = Tex(r"VRAM", color=GREEN_A).scale(0.75).next_to(VRAM, DOWN)
        GPU = VGroup(gpu_cores, VRAM, gpu_caption, vram_caption).next_to(A, UP).shift(5*LEFT+2*DOWN)

        # Three rows of 23 thread arrows; every row starts below the head of the
        # previous row and grows to the right.
        gpu_threads = VGroup()
        row_head = None
        for _ in range(3):
            if row_head is None:
                row_head = gpu_arrow().move_to(gpu_cores).shift(1.4*LEFT)
            else:
                row_head = gpu_arrow().next_to(row_head, DOWN).shift(0.2*UP)
            gpu_threads.add(row_head)
            for _ in range(22):
                gpu_threads.add(gpu_arrow().next_to(gpu_threads[-1], RIGHT).shift(0.25*LEFT))

        # One small arrow on top of each of the first 100 entries of C
        entries = C.get_entries()
        C_threads = VGroup(*[
            Arrow(start=0.1*UP, end=0.1*DOWN, color=GREEN_B).move_to(entries[k])
            for k in range(100)
        ])

        self.play(Create(GPU))
        self.wait(1)
        # C[1:] leaves out C's entries; they arrive through the transform below.
        self.play(Create(A), Create(B), Create(C[1:]))
        self.wait(1)
        self.play(Create(gpu_threads))
        self.play(Create(C_threads))
        self.wait(1)
        self.play(ReplacementTransform(C_threads, C.get_entries()))
        self.wait(1)

        question = Tex(r"Which thread works on which element?").scale(0.6).next_to(gpu_cores, RIGHT)
        self.play(Write(question))
        self.wait(1)
        self.play(FadeOut(GPU, A, B, C, gpu_threads))
        # The question morphs into the title.
        heading = Title("Thread Organization")
        self.play(ReplacementTransform(question, heading))
        self.wait(1)

                                                                                                                                

In [16]:
# Random 12x12 matrix rounded to one decimal.
X1_ = np.random.rand(12, 12).round(1)

In [17]:
%%manim -qk -v WARNING Thread_Organization

class Thread_Organization(Scene):
    """Scene on grid/block/thread layouts and on covering a matrix with a grid of thread blocks."""

    def construct(self):
        """Animate 1D and 2D block/thread layouts, then map a 4x3 block grid onto matrix C.

        Thread and block labels are written as "(z, y, x)" triples, as the
        scene's own title states.
        """

        def two_row_threads(block, half_len, first_shift, second_row_shift):
            """Build a 2x3 set of thread arrows inside ``block`` together with their labels.

            The first arrow is centred on ``block`` and moved by ``first_shift``;
            the second row starts below the first arrow, moved by
            ``second_row_shift``. First-row labels sit above their arrows and
            second-row labels below. Returns ``(threads, indices)``.
            """
            def arrow():
                return Arrow(start=half_len*UP, end=half_len*DOWN, color=GREEN_A)

            threads = VGroup(arrow().move_to(block).shift(first_shift))
            indices = VGroup(Tex(r"(0,0,{})".format(0)).scale(0.5).next_to(threads[-1], UP))
            for x in range(1, 3):
                threads.add(arrow().next_to(threads[-1], RIGHT).shift(0.5*RIGHT))
                indices.add(Tex(r"(0,0,{})".format(x)).scale(0.5).next_to(threads[-1], UP))
            # The second row restarts at x = 0. Reusing the previous loop's
            # variable here would label this thread "(0,1,2)".
            threads.add(arrow().next_to(threads[0], DOWN).shift(second_row_shift))
            indices.add(Tex(r"(0,1,{})".format(0)).scale(0.5).next_to(threads[-1], DOWN))
            for x in range(1, 3):
                threads.add(arrow().next_to(threads[-1], RIGHT).shift(0.5*RIGHT))
                indices.add(Tex(r"(0,1,{})".format(x)).scale(0.5).next_to(threads[-1], DOWN))
            return threads, indices

        def small_matrix(values):
            """Return a bracketed MathTex matrix with unit buffers, scaled to 30%."""
            return Matrix(
                values,
                left_bracket="[",
                right_bracket="]",
                element_to_mobject=MathTex,
                element_alignment_corner=ORIGIN,
                v_buff=1,
                h_buff=1,
            ).scale(0.3)

        title = Title("Thread Organization")
        self.add(title)

        # 1D Grid and Block: four blocks in a row, three threads per block
        grid_1d = Rectangle(width=13, height=6, color=GREEN).shift(0.5*DOWN)
        blocks_1d = VGroup(Rectangle(width=3, height=3, color=RED))
        for _ in range(3):
            blocks_1d.add(Rectangle(width=3, height=3, color=RED).next_to(blocks_1d[-1], RIGHT))

        blocks_1d.move_to(grid_1d)
        blocks_1d_idx = VGroup()
        threads_1d = VGroup()
        threads_1d_idx = VGroup()
        for (i, block) in enumerate(blocks_1d):
            blocks_1d_idx.add(Tex(r"(0,0,{})".format(i)).scale(0.75).next_to(block, UP))
            threads = VGroup(Arrow(start=1*UP, end=1*DOWN, color=GREEN_A).move_to(block).shift(0.9*LEFT))
            indices = VGroup(Tex(r"(0,0,{})".format(0)).scale(0.5).next_to(threads[-1], UP))
            for j in range(1, 3):
                threads.add(Arrow(start=1*UP, end=1*DOWN, color=GREEN_A).next_to(threads[-1], RIGHT).shift(0.25*RIGHT))
                indices.add(Tex(r"(0,0,{})".format(j)).scale(0.5).next_to(threads[-1], UP))
            threads_1d.add(threads)
            threads_1d_idx.add(indices)

        self.play(Create(grid_1d))
        self.wait(1)
        self.play(Create(blocks_1d))
        self.wait(1)
        self.play(Write(blocks_1d_idx))
        self.wait(1)
        self.play(Write(threads_1d[0]))
        self.play(Write(threads_1d_idx[0]))
        self.wait(1)
        self.play(Write(threads_1d[1:]))
        self.play(Write(threads_1d_idx[1:]))
        self.wait(1)

        # 1D Grid and 2D Block: the same four blocks, now with 2x3 threads each
        threads_2d = VGroup()
        threads_2d_idx = VGroup()
        for block in blocks_1d:
            threads, indices = two_row_threads(block, 0.5, LEFT+0.5*UP, 0.25*DOWN)
            threads_2d.add(threads)
            threads_2d_idx.add(indices)

        self.play(ReplacementTransform(VGroup(threads_1d[0], threads_1d_idx[0]), VGroup(threads_2d[0], threads_2d_idx[0])))
        self.wait(1)
        self.play(ReplacementTransform(VGroup(threads_1d[1:], threads_1d_idx[1:]), VGroup(threads_2d[1:], threads_2d_idx[1:])))
        self.wait(1)

        # 2D Grid and Block: two rows of four blocks
        blocks_2d_ = VGroup(Rectangle(width=3, height=2, color=RED))
        for _ in range(3):
            blocks_2d_.add(Rectangle(width=3, height=2, color=RED).next_to(blocks_2d_[-1], RIGHT))
        blocks_2d_.add(Rectangle(width=3, height=2, color=RED).next_to(blocks_2d_[0], DOWN))
        for _ in range(3):
            blocks_2d_.add(Rectangle(width=3, height=2, color=RED).next_to(blocks_2d_[-1], RIGHT))
        blocks_2d_.move_to(grid_1d)

        blocks_2d__idx = VGroup()
        threads_2d_ = VGroup()
        threads_2d__idx = VGroup()
        for n, block in enumerate(blocks_2d_):
            grid_row, grid_col = divmod(n, 4)
            # Top-row block labels go above the block, bottom-row labels below.
            label_side = UP if grid_row == 0 else DOWN
            blocks_2d__idx.add(Tex(r"(0,{},{})".format(grid_row, grid_col)).scale(0.75).next_to(block, label_side))
            threads, indices = two_row_threads(block, 0.45, 0.9*LEFT+0.2*UP, 0.2*UP)
            threads_2d_.add(threads)
            threads_2d__idx.add(indices)

        self.play(ReplacementTransform(VGroup(blocks_1d, blocks_1d_idx, threads_2d, threads_2d_idx), VGroup(blocks_2d_, blocks_2d__idx, threads_2d_, threads_2d__idx)))
        self.wait(1)

        title_ = Title("Indices: (z, y, x)")
        self.play(ReplacementTransform(title, title_))
        self.wait(1)

        self.play(
            *[FadeOut(mob) for mob in self.mobjects]
        )
        self.wait(1)

        # Matrix to thread mapping
        C = small_matrix(X1).to_edge(DOWN).shift(1.5*RIGHT+UP)
        # C_ is never drawn; its 12x12 entries only provide positions for the
        # thread arrows. The small shift offsets it relative to C.
        C_ = small_matrix(X1_).move_to(C).shift(0.31*(RIGHT+DOWN))
        A = small_matrix(X0).next_to(C, LEFT)
        B = small_matrix(W0).next_to(C, UP)

        self.play(Create(VGroup(A, B, C[1:])))
        self.wait(1)

        blocks = VGroup()
        threads = VGroup()

        # 4 rows x 3 columns of blocks, each holding 3 rows x 4 columns of threads
        for b0 in range(4):
            for b1 in range(3):
                thread = VGroup()
                for t0 in range(3):
                    for t1 in range(4):
                        C_idx_0 = b0*3 + t0
                        C_idx_1 = b1*4 + t1
                        thread.add(Arrow(start=0.32*UP, end=0.32*DOWN, color=GREEN_B).move_to(C_.get_entries()[C_idx_0*12+C_idx_1]))
                blocks.add(SurroundingRectangle(thread, buff=0.05, color=RED))
                threads.add(thread)

        self.play(Create(blocks[0]), Create(threads[0]))
        self.wait(1)
        self.play(Create(blocks[1:]), Create(threads[1:]))
        self.wait(1)

        # Code
        grid_code = Code(file_name="grid_specs.cu", language="CUDA", font="Monospace", insert_line_no=False,
                            style="dracula", line_spacing=1).scale(0.5).next_to(B, LEFT)
        self.play(Create(grid_code[0]))
        self.play(Write(grid_code.code[0]))
        self.play(Write(grid_code.code[1][:15]), Write(grid_code.code[1][-2:]))
        self.wait(1)
        # Extent arrows along the last block: 4 threads wide, 3 threads tall
        a1 = Arrow(start=0.7*LEFT, end=0.7*RIGHT).move_to(blocks[-1].get_edge_center(DOWN)+0.15*DOWN)
        a1_text = Tex(r"x=4").scale(0.5).next_to(a1, DOWN).shift(0.15*UP)
        a2 = Arrow(start=0.7*UP, end=0.7*DOWN).move_to(blocks[-1].get_edge_center(RIGHT)+0.15*RIGHT)
        a2_text = Tex(r"y=3").scale(0.5).next_to(a2, RIGHT).shift(0.15*LEFT)
        self.play(Create(a1), Create(a2), Write(a1_text), Write(a2_text))
        self.wait(1)
        self.play(ReplacementTransform(VGroup(a1, a1_text), grid_code.code[1][15:17]))
        self.play(ReplacementTransform(VGroup(a2, a2_text), grid_code.code[1][17:20]))
        self.play(Write(grid_code.code[1][20:-2]))
        self.wait(1)

        # Extent arrows along the whole grid: 3 blocks wide, 4 blocks tall
        a1 = Arrow(start=2*LEFT, end=2*RIGHT).move_to(C_.get_edge_center(DOWN)+0.2*DOWN)
        a1_text = Tex(r"x=3").scale(0.75).next_to(a1, DOWN).shift(0.15*UP)
        a2 = Arrow(start=2*UP, end=2*DOWN).move_to(C_.get_edge_center(RIGHT)+0.15*RIGHT)
        a2_text = Tex(r"y=4").scale(0.75).next_to(a2, RIGHT).shift(0.15*LEFT)
        self.play(Create(a1), Create(a2), Write(a1_text), Write(a2_text))
        self.wait(1)
        self.play(ReplacementTransform(VGroup(a1, a1_text, a2, a2_text), grid_code.code[2]))
        self.wait(1)

        # Fade out the block frames and every arrow in thread rows/columns 0..9,
        # leaving only the arrows in rows 10-11 and columns 10-11.
        self.play(FadeOut(blocks, threads[0:2], threads[3:5], threads[6:8], 
                        threads[2][0:2], threads[2][4:6], threads[2][8:10],
                        threads[5][0:2], threads[5][4:6], threads[5][8:10],
                        threads[8][0:2], threads[8][4:6], threads[8][8:10],
                        threads[11][0:2],threads[9][:4], threads[10][:4]))
        self.wait(1)
        self.play(Write(grid_code.code[-2:]))
        self.wait(1)
        self.play(
            *[FadeOut(mob) for mob in self.mobjects]
        )
        self.wait(1)

                                                                                                                 

In [18]:
%%manim -qk -v WARNING Kernel

class Kernel(Scene):
    """Scene that walks through ``kernel.cu`` while following one highlighted thread."""

    def construct(self):
        """Show a 4x3 grid of 3x4 thread blocks, track one thread, and fill matrix C.

        The highlighted thread is ``threads[4][6]``; it is moved onto entry
        (4, 6) of the 10-column matrix C, matching the two index equations
        written on screen.
        """

        def small_matrix(values):
            """Return a bracketed MathTex matrix with unit buffers, scaled to 30%."""
            return Matrix(
                values,
                left_bracket="[",
                right_bracket="]",
                element_to_mobject=MathTex,
                element_alignment_corner=ORIGIN,
                v_buff=1,
                h_buff=1,
            ).scale(0.3)

        # Code
        grid_code = Code(file_name="kernel.cu", language="CUDA", font="Monospace", insert_line_no=False,
                            style="dracula", line_spacing=1).scale(0.35).to_edge(LEFT).to_edge(DOWN)

        # C in the bottom-right corner, A to its left, B above it
        C = small_matrix(X1).to_edge(DOWN).to_edge(RIGHT).shift(0.25*LEFT+0.25*UP)
        A = small_matrix(X0).next_to(C, LEFT)
        B = small_matrix(W0).next_to(C, UP)

        # C__ is never drawn; its 12x12 entries only provide positions for the
        # thread arrows, and its outline is used for the grid frame.
        C__ = small_matrix(X1_).next_to(A, UP).shift(0.3*LEFT+0.25*UP)

        grid = SurroundingRectangle(C__, color=GREEN)

        blocks = VGroup()
        threads = VGroup()

        # 4 rows x 3 columns of blocks, each holding 3 rows x 4 columns of threads
        for b0 in range(4):
            for b1 in range(3):
                thread = VGroup()
                for t0 in range(3):
                    for t1 in range(4):
                        C_idx_0 = b0*3 + t0
                        C_idx_1 = b1*4 + t1
                        thread.add(Arrow(start=0.32*UP, end=0.32*DOWN, color=GREEN_B).move_to(C__.get_entries()[C_idx_0*12+C_idx_1]))
                blocks.add(SurroundingRectangle(thread, buff=0.05, color=RED))
                threads.add(thread)

        self.play(Create(threads))
        self.play(Create(blocks))
        self.play(Create(grid))
        self.play(Create(grid_code[0]))
        self.play(Write(grid_code.code[0]))
        self.wait(1)

        # Highlight block 4 (b0=1, b1=1), thread 6 (t0=1, t1=2)
        c = Circle(radius=0.12, color=YELLOW).move_to(threads[4][6])
        self.play(Create(c))
        self.wait(1)

        self.play(Write(grid_code.code[1:7]), Create(C[1:]))
        self.wait(1)
        # Index equations for the highlighted thread, written in three parts each
        i_eq = Tex(r"$i=3 \cdot 1 + 1=4$").scale(0.75).next_to(grid_code, UP).shift(UP)
        j_eq = Tex(r"$j=4 \cdot 1 + 2=6$").scale(0.75).next_to(i_eq, DOWN)
        self.play(Write(i_eq[0][:3]))
        self.play(Write(j_eq[0][:3]))
        self.wait(1)
        self.play(Write(i_eq[0][3:5]))
        self.play(Write(j_eq[0][3:5]))
        self.wait(1)
        self.play(Write(i_eq[0][5:]))
        self.play(Write(j_eq[0][5:]))
        self.wait(1)
        self.play(Write(grid_code.code[7:9]))
        self.wait(1)
        # Move the highlighted thread onto entry (4, 6) of C and frame the
        # matching row of A and column of B.
        self.play(VGroup(threads[4][6], c).animate.move_to(C.get_entries()[4*10+6]))
        r1 = SurroundingRectangle(A.get_rows()[4], color=YELLOW)
        r2 = SurroundingRectangle(B.get_columns()[6], color=YELLOW)
        self.play(Create(A), Create(B), Create(r1), Create(r2))
        self.wait(1)
        self.play(Write(grid_code.code[9:]))
        self.play(ReplacementTransform(VGroup(threads[4][6], c, r1, r2), C.get_entries()[4*10+6]))
        self.wait(1)
        self.play(FadeOut(blocks, grid, i_eq, j_eq), ReplacementTransform(threads, C.get_entries()))
        self.wait(1)
        self.play(
            *[FadeOut(mob) for mob in self.mobjects]
        )
        self.wait(1)

                                                                                                            

# Conclusion

In [19]:
%%manim -qk -v WARNING Conclusion

class Conclusion(Scene):
    """Closing scene with a single call-to-action sentence."""

    def construct(self):
        """Write the sentence with two coloured parts, hold, then fade everything out."""
        dialogue = Tex("Like", " and please do leave a", " Comment", "!")
        dialogue.scale(1.25)

        # First part in red, second-to-last part in green.
        for part, colour in ((dialogue[0], RED), (dialogue[-2], GREEN)):
            part.set_color(colour)

        self.play(Write(dialogue))
        self.wait(2)

        # Fade out everything that is still registered with the scene.
        fade_everything = [FadeOut(mob) for mob in self.mobjects]
        self.play(*fade_everything)
        self.wait(2)

                                                                                                                       