## Helper functions

In [36]:
import pickle


def load_loss_values(
    folder_name: str, is_train: bool, base_dir: str = "../loss_values"
) -> list[float]:
    """Load the pickled loss curve of one experiment.

    The file read is ``{base_dir}/{folder_name}/train_losses.pkl`` when
    ``is_train`` is true and ``{base_dir}/{folder_name}/val_losses.pkl``
    otherwise.

    Args:
        folder_name: Name of the experiment folder inside ``base_dir``.
        is_train: ``True`` to load the training losses, ``False`` to load the
            validation losses.
        base_dir: Directory that contains the experiment folders. Defaults to
            the location used throughout this notebook.

    Returns:
        The unpickled object, returned as-is. It is presumably a list of
        floats (one loss per logged iteration); this is not validated here.

    Raises:
        FileNotFoundError: If the expected pickle file does not exist.
    """
    file_prefix = "train" if is_train else "val"
    file_path = f"{base_dir}/{folder_name}/{file_prefix}_losses.pkl"
    with open(file_path, "rb") as f:
        # SECURITY: pickle.load can execute arbitrary code; only point this
        # function at files produced by your own training runs.
        loss_values = pickle.load(f)
    return loss_values

In [88]:
from typing import Optional

import numpy as np
import plotly.graph_objects as go

from plotly.graph_objs._figure import Figure
from plotly.subplots import make_subplots


def plot_losses(
    line_configs: list[dict],
    iterations: np.ndarray,
    title: str,
    xaxis_title: str,
    yaxis_title: str,
    width: int = 1200,
    height: int = 700,
    yaxis_range: Optional[list[float]] = None,
    save_figure: bool = False,
    legend_y_anchor: str = "top",
    legend_y: float = 1,
    legend_x_anchor: str = "right",
    legend_x: float = 1,
    save_path: str = "plot.svg",
) -> Figure:
    """Draw several loss curves on one plotly figure.

    Args:
        line_configs: One dict per curve. ``"name"`` (legend label) and
            ``"data"`` (y values) are required. ``"color"``, ``"width"``,
            ``"dash"`` and ``"marker_symbol"`` are optional and fall back to
            plotly's default colour, ``2``, ``"solid"`` and ``0``.
        iterations: x values shared by every curve.
        title: Figure title, centred above the plot.
        xaxis_title: Label of the x axis.
        yaxis_title: Label of the y axis.
        width: Figure width in pixels.
        height: Figure height in pixels.
        yaxis_range: ``[min, max]`` for the y axis, or ``None`` to let plotly
            pick the range automatically.
        save_figure: When true, the figure is also written to ``save_path``.
        legend_y_anchor: Which edge of the legend ``legend_y`` refers to.
        legend_y: Vertical legend position.
        legend_x_anchor: Which edge of the legend ``legend_x`` refers to.
        legend_x: Horizontal legend position.
        save_path: Destination file used when ``save_figure`` is true. The
            image format is inferred from the file extension, so the default
            still produces an SVG. Pass a distinct path per figure to avoid
            overwriting earlier exports.

    Returns:
        The configured figure; the caller decides whether to show it.
    """
    # Created through make_subplots so that add_trace accepts `secondary_y`;
    # every curve is placed on the primary y axis.
    figure = make_subplots(specs=[[{"secondary_y": True}]])

    for config in line_configs:
        # A colour of None leaves the choice to plotly's default colour cycle.
        color = config.get("color")
        figure.add_trace(
            go.Scatter(
                x=iterations,
                y=config["data"],
                mode="lines+markers",
                name=config["name"],
                hovertemplate="<b>Iteration</b>: %{x}<br><b>Loss</b>: %{y:.4f}<br>",
                line=dict(
                    color=color,
                    width=config.get("width", 2),
                    dash=config.get("dash", "solid"),
                ),
                marker=dict(
                    symbol=config.get("marker_symbol", 0),
                    size=8,
                    color=color,
                    line=dict(width=1, color="black"),
                ),
            ),
            secondary_y=False,
        )

    figure.update_layout(
        font=dict(family="Arial", size=14, color="black"),
        title={
            "text": title,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=18),
        },
        autosize=False,
        width=width,
        height=height,
        margin=dict(l=80, r=50, b=150, t=100, pad=10),
        showlegend=True,
        xaxis=dict(
            title=xaxis_title,
            showgrid=True,
            gridcolor="lightgrey",
            tickmode="linear",
            tick0=0,
            # Tick spacing grows with the number of points and never drops
            # below 1, which would otherwise happen for fewer than 10 points.
            dtick=max(1, len(iterations) // 10),
        ),
        yaxis=dict(
            title=yaxis_title,
            showgrid=True,
            gridcolor="lightgrey",
            zeroline=True,
            zerolinecolor="black",
            zerolinewidth=1,
            range=yaxis_range,
        ),
        legend=dict(
            yanchor=legend_y_anchor,
            y=legend_y,
            xanchor=legend_x_anchor,
            x=legend_x,
            bgcolor="rgba(255, 255, 255, 0.7)",
            bordercolor="lightgrey",
            borderwidth=1,
        ),
        plot_bgcolor="white",
    )

    if save_figure:
        figure.write_image(save_path)
    return figure

## Comparisons

### 1. Positional encoding

#### 1.1. Load the data

In [38]:
# Train/validation loss curves for every positional-encoding run, keyed by a
# short label; each label maps to its folder under the loss-values directory.
positional_encodings = {
    label: {
        "train": load_loss_values(folder, True),
        "val": load_loss_values(folder, False),
    }
    for label, folder in [
        ("absolute", "absolute_positional_encoding"),
        ("relative", "relative_positional_encoding"),
        ("sinusoidal", "sinusoidal_positional_encoding"),
        ("none", "no_positional_encoding"),
        ("rope", "rope"),
    ]
}

#### 1.2. Plot the training data

In [39]:
# One curve per positional encoding; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": positional_encodings[key]["train"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("Absolute positional encoding", "absolute", "#1f77b4"),
            ("Sinusoidal positional encoding", "sinusoidal", "#ff7f0e"),
            ("Relative positional encoding", "relative", "#2ca02c"),
            ("No positional encoding", "none", "#d62728"),
            ("Rotary positional encoding (RoPE)", "rope", "#9467bd"),
        ]
    )
]
# 1-based x axis, sized from the "none" run.
iterations = np.arange(len(positional_encodings["none"]["train"])) + 1

figure = plot_losses(
    title="Training losses for different positional encodings",
    xaxis_title="Iterations",
    yaxis_title="Training loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.5, 7],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 1.3. Plot the validation data

In [40]:
# One curve per positional encoding; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": positional_encodings[key]["val"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("Absolute positional encoding", "absolute", "#1f77b4"),
            ("Sinusoidal positional encoding", "sinusoidal", "#ff7f0e"),
            ("Relative positional encoding", "relative", "#2ca02c"),
            ("No positional encoding", "none", "#d62728"),
            ("Rotary positional encoding (RoPE)", "rope", "#9467bd"),
        ]
    )
]
# 1-based x axis, sized from the "none" run.
iterations = np.arange(len(positional_encodings["none"]["val"])) + 1

figure = plot_losses(
    title="Validation losses for different positional encodings",
    xaxis_title="Iterations",
    yaxis_title="Validation loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4, 6],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 1.4. Conclusion

Rotary Positional Encoding (RoPE) and Relative Positional Encoding (RPE) showed strong performance on this benchmark. RoPE slightly outperformed RPE on the validation set, but both significantly outperformed the other approaches.

In contrast, Sinusoidal Positional Encoding and the absence of any positional encoding yielded the weakest results, with Sinusoidal Positional Encoding performing the worst overall.

### 2. Attention mechanisms

#### 2.1. Load the data

In [41]:
# Train/validation loss curves for every attention-mechanism run, keyed by a
# short label; each label maps to its folder under the loss-values directory.
attention_mechanisms = {
    label: {
        "train": load_loss_values(folder, True),
        "val": load_loss_values(folder, False),
    }
    for label, folder in [
        ("local", "local_attention"),
        ("multi_query", "multi_query_attention"),
        ("grouped", "grouped_query_attention"),
        ("linear", "linear_attention"),
        ("big_bird", "big_bird"),
        ("latent", "multi_head_latent_attention"),
    ]
}

#### 2.2. Plot the training data

In [51]:
# One curve per attention mechanism; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": attention_mechanisms[key]["train"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("Local attention", "local", "#1f77b4"),
            ("Multi Query attention", "multi_query", "#ff7f0e"),
            ("Grouped query attention", "grouped", "#2ca02c"),
            ("Linear attention", "linear", "#d62728"),
            ("Sparse attention (Big Bird)", "big_bird", "#9467bd"),
            ("Multi-head latent attention", "latent", "#8c564b"),
        ]
    )
]
# 1-based x axis, sized from the "local" run.
iterations = np.arange(len(attention_mechanisms["local"]["train"])) + 1

# No y-axis range is passed here, so plotly chooses it automatically.
figure = plot_losses(
    title="Training losses for different attention mechanisms",
    xaxis_title="Iterations",
    yaxis_title="Training loss",
    line_configs=line_configs,
    iterations=iterations,
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 2.3. Plot the validation data

In [53]:
# One curve per attention mechanism; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": attention_mechanisms[key]["val"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("Local attention", "local", "#1f77b4"),
            ("Multi Query attention", "multi_query", "#ff7f0e"),
            ("Grouped query attention", "grouped", "#2ca02c"),
            ("Linear attention", "linear", "#d62728"),
            ("Sparse attention (Big Bird)", "big_bird", "#9467bd"),
            ("Multi-head latent attention", "latent", "#8c564b"),
        ]
    )
]
# 1-based x axis, sized from the "local" run.
iterations = np.arange(len(attention_mechanisms["local"]["val"])) + 1

# No y-axis range is passed here, so plotly chooses it automatically.
figure = plot_losses(
    title="Validation losses for different attention mechanisms",
    xaxis_title="Iterations",
    yaxis_title="Validation loss",
    line_configs=line_configs,
    iterations=iterations,
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 2.4. Conclusion

The results indicate that Grouped Query Attention (GQA) delivers the strongest performance on both the training and validation sets, surpassing all other attention mechanisms. Local Attention comes in second, showing a notable improvement over Multi Query Attention (MQA).

The substantial performance gap between GQA and the other mechanisms suggests that GQA is especially well-suited for the dataset used in this training. However, it's important to note that this may not generalize to all datasets.

### 3. Activation functions

#### 3.1. Load the data

In [54]:
# Train/validation loss curves per activation function. The "relu" entry
# reads the plain multi-head latent attention run.
activation_functions = {
    label: {
        "train": load_loss_values(folder, True),
        "val": load_loss_values(folder, False),
    }
    for label, folder in [
        ("gelu", "multi_head_latent_attention_gelu"),
        ("swiglu", "multi_head_latent_attention_swiglu"),
        ("relu", "multi_head_latent_attention"),
    ]
}

#### 3.2. Plot the training data

In [57]:
# One curve per activation function; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": activation_functions[key]["train"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("ReLU", "relu", "#1f77b4"),
            ("GeLU", "gelu", "#ff7f0e"),
            ("SwiGLU", "swiglu", "#2ca02c"),
        ]
    )
]
# 1-based x axis, sized from the "relu" run.
iterations = np.arange(len(activation_functions["relu"]["train"])) + 1

figure = plot_losses(
    title="Training losses for different activation functions",
    xaxis_title="Iterations",
    yaxis_title="Training loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.8, 5.5],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 3.3. Plot the validation data

In [60]:
# One curve per activation function; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": activation_functions[key]["val"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("ReLU", "relu", "#1f77b4"),
            ("GeLU", "gelu", "#ff7f0e"),
            ("SwiGLU", "swiglu", "#2ca02c"),
        ]
    )
]
# 1-based x axis, sized from the "relu" run.
iterations = np.arange(len(activation_functions["relu"]["val"])) + 1

figure = plot_losses(
    title="Validation losses for different activation functions",
    xaxis_title="Iterations",
    yaxis_title="Validation loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.5, 5],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 3.4. Conclusion

SwiGLU outperformed both ReLU and GELU on the training and validation sets.

### 4. Normalization methods

#### 4.1. Load the data

In [68]:
# Train/validation loss curves per normalization method. The "layer_norm"
# entry reads the MLA + SwiGLU run.
normalization_methods = {
    label: {
        "train": load_loss_values(folder, True),
        "val": load_loss_values(folder, False),
    }
    for label, folder in [
        ("rms_norm", "multi_head_latent_attention_swiglu_rms_norm"),
        ("layer_norm", "multi_head_latent_attention_swiglu"),
    ]
}

#### 4.2. Plot the training data

In [69]:
# One curve per normalization method; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": normalization_methods[key]["train"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("RMSNorm", "rms_norm", "#1f77b4"),
            ("LayerNorm", "layer_norm", "#ff7f0e"),
        ]
    )
]
# 1-based x axis, sized from the "rms_norm" run.
iterations = np.arange(len(normalization_methods["rms_norm"]["train"])) + 1

figure = plot_losses(
    title="Training losses for different normalization methods",
    xaxis_title="Iterations",
    yaxis_title="Training loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.9, 5.5],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 4.3. Plot the validation data

In [70]:
# One curve per normalization method; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": normalization_methods[key]["val"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("RMSNorm", "rms_norm", "#1f77b4"),
            ("LayerNorm", "layer_norm", "#ff7f0e"),
        ]
    )
]
# 1-based x axis, sized from the "rms_norm" run.
iterations = np.arange(len(normalization_methods["rms_norm"]["val"])) + 1

figure = plot_losses(
    title="Validation losses for different normalization methods",
    xaxis_title="Iterations",
    yaxis_title="Validation loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.5, 4.8],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 4.4. Conclusion

LayerNorm slightly outperforms RMSNorm in this case, but the difference is not significant.

### 5. Normalization placement

#### 5.1. Load the data

In [71]:
# Train/validation loss curves per normalization placement. The
# "pre_layer_norm" entry reads the MLA + SwiGLU run.
normalization_placement = {
    label: {
        "train": load_loss_values(folder, True),
        "val": load_loss_values(folder, False),
    }
    for label, folder in [
        (
            "post_layer_norm",
            "multi_head_latent_attention_swiglu_post_normalization",
        ),
        ("pre_layer_norm", "multi_head_latent_attention_swiglu"),
    ]
}

#### 5.2. Plot the training data

In [73]:
# One curve per normalization placement; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": normalization_placement[key]["train"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("Post LayerNorm", "post_layer_norm", "#1f77b4"),
            ("Pre LayerNorm", "pre_layer_norm", "#ff7f0e"),
        ]
    )
]
# 1-based x axis, sized from the "post_layer_norm" run.
iterations = np.arange(len(normalization_placement["post_layer_norm"]["train"])) + 1

figure = plot_losses(
    title="Training losses for different normalization placements",
    xaxis_title="Iterations",
    yaxis_title="Training loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.7, 5.3],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 5.3. Plot the validation data

In [76]:
# One curve per normalization placement; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": normalization_placement[key]["val"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("Post LayerNorm", "post_layer_norm", "#1f77b4"),
            ("Pre LayerNorm", "pre_layer_norm", "#ff7f0e"),
        ]
    )
]
# 1-based x axis, sized from the "post_layer_norm" run.
iterations = np.arange(len(normalization_placement["post_layer_norm"]["val"])) + 1

figure = plot_losses(
    title="Validation losses for different normalization placements",
    xaxis_title="Iterations",
    yaxis_title="Validation loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.5, 5],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 5.4. Conclusion

Post LayerNorm outperforms Pre LayerNorm on both training and validation sets, indicating that the placement of normalization has a significant impact on model performance.

### 6. Dropout or not

#### 6.1. Load the data

In [78]:
# Train/validation loss curves with and without dropout.
#
# NOTE(review): judging by its folder name, the no-dropout run was trained
# with post-normalization. The dropout baseline therefore has to be the
# post-normalization run as well; comparing against the pre-norm
# "multi_head_latent_attention_swiglu" run would mix the effect of the
# normalization placement into the dropout comparison. Confirm against the
# training configs.
dropout_or_not = {
    "no_dropout": {
        "train": load_loss_values(
            "multi_head_latent_attention_swiglu_post_normalization_no_dropout", True
        ),
        "val": load_loss_values(
            "multi_head_latent_attention_swiglu_post_normalization_no_dropout", False
        ),
    },
    "dropout": {
        "train": load_loss_values(
            "multi_head_latent_attention_swiglu_post_normalization", True
        ),
        "val": load_loss_values(
            "multi_head_latent_attention_swiglu_post_normalization", False
        ),
    },
}

#### 6.2. Plot the training data

In [79]:
# One curve per dropout setting; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": dropout_or_not[key]["train"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("No dropout", "no_dropout", "#1f77b4"),
            ("Dropout", "dropout", "#ff7f0e"),
        ]
    )
]
# 1-based x axis, sized from the "dropout" run.
iterations = np.arange(len(dropout_or_not["dropout"]["train"])) + 1

figure = plot_losses(
    title="Training losses for different dropout settings",
    xaxis_title="Iterations",
    yaxis_title="Training loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.7, 5.3],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 6.3. Plot the validation data

In [81]:
# One curve per dropout setting; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": dropout_or_not[key]["val"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            ("No dropout", "no_dropout", "#1f77b4"),
            ("Dropout", "dropout", "#ff7f0e"),
        ]
    )
]
# 1-based x axis, sized from the "dropout" run.
iterations = np.arange(len(dropout_or_not["dropout"]["val"])) + 1

figure = plot_losses(
    title="Validation losses for different dropout settings",
    xaxis_title="Iterations",
    yaxis_title="Validation loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.3, 5],
    width=1200,
    height=700,
    save_figure=False,
)
figure.show()

#### 6.4. Conclusion

Not using dropout yielded the best performance on both training and validation sets, suggesting that dropout may not be beneficial for LLMs when training for one epoch.

### 7. Putting everything together

#### 7.1. Load the data

In [84]:
# Train/validation loss curves for each step of the architecture evolution.
# Phase 0 reads the plain multi-head latent attention run.
architecture_evolution = {
    phase: {
        "train": load_loss_values(folder, True),
        "val": load_loss_values(folder, False),
    }
    for phase, folder in [
        ("phase_0", "multi_head_latent_attention"),
        ("phase_1", "final_model_phase_1"),
        ("phase_2", "final_model_phase_2"),
        ("phase_3", "final_model_phase_3"),
        ("phase_4", "final_model_phase_4"),
    ]
}

#### 7.2. Plot the training data

In [96]:
# One curve per architecture phase; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": architecture_evolution[key]["train"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            (
                "Phase 0: Positional Encoding + MLA + ReLU + Pre-Norm + LayerNorm",
                "phase_0",
                "#1f77b4",
            ),
            (
                "Phase 1: RoPE + MLA + ReLU + Pre-Norm + LayerNorm",
                "phase_1",
                "#ff7f0e",
            ),
            (
                "Phase 2: RoPE + MLA + ReLU + Post-Norm + LayerNorm",
                "phase_2",
                "#009E73",
            ),
            (
                "Phase 3: RoPE + MLA + Swiglu + Post-Norm + LayerNorm",
                "phase_3",
                "#D55E00",
            ),
            (
                "Phase 4: RoPE + MLA + Swiglu + Post-Norm + LayerNorm + No Dropout",
                "phase_4",
                "#CC79A7",
            ),
        ]
    )
]
# 1-based x axis, sized from the "phase_1" run.
iterations = np.arange(len(architecture_evolution["phase_1"]["train"])) + 1

# The legend is anchored at the bottom-left corner of the plot area.
figure = plot_losses(
    title="Training losses for different architecture phases",
    xaxis_title="Iterations",
    yaxis_title="Training loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.5, 5.3],
    width=1200,
    height=700,
    save_figure=False,
    legend_x_anchor="left",
    legend_x=0.025,
    legend_y_anchor="bottom",
    legend_y=0.025,
)
figure.show()

#### 7.3. Plot the validation data

In [98]:
# One curve per architecture phase; the marker symbol is the curve's index.
line_configs = [
    {
        "name": label,
        "data": architecture_evolution[key]["val"],
        "color": color,
        "dash": "solid",
        "width": 2,
        "marker_symbol": symbol,
    }
    for symbol, (label, key, color) in enumerate(
        [
            (
                "Phase 0: Positional Encoding + MLA + ReLU + Pre-Norm + LayerNorm",
                "phase_0",
                "#1f77b4",
            ),
            (
                "Phase 1: RoPE + MLA + ReLU + Pre-Norm + LayerNorm",
                "phase_1",
                "#ff7f0e",
            ),
            (
                "Phase 2: RoPE + MLA + ReLU + Post-Norm + LayerNorm",
                "phase_2",
                "#009E73",
            ),
            (
                "Phase 3: RoPE + MLA + Swiglu + Post-Norm + LayerNorm",
                "phase_3",
                "#D55E00",
            ),
            (
                "Phase 4: RoPE + MLA + Swiglu + Post-Norm + LayerNorm + No Dropout",
                "phase_4",
                "#CC79A7",
            ),
        ]
    )
]
# 1-based x axis, sized from the "phase_1" run.
iterations = np.arange(len(architecture_evolution["phase_1"]["val"])) + 1

# The legend is anchored at the bottom-left corner of the plot area.
figure = plot_losses(
    title="Validation losses for different architecture phases",
    xaxis_title="Iterations",
    yaxis_title="Validation loss",
    line_configs=line_configs,
    iterations=iterations,
    yaxis_range=[4.1, 4.8],
    width=1200,
    height=700,
    save_figure=False,
    legend_x_anchor="left",
    legend_x=0.025,
    legend_y_anchor="bottom",
    legend_y=0.025,
)
figure.show()

#### 7.4. Conclusion

We clearly see that performance keeps improving with each change made to the model. The final model, which combines RoPE, MLA, SwiGLU, LayerNorm applied in the post-norm position, and no dropout, achieves the best performance on both the training and validation sets.