In [120]:
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
import numpy as numpy

In [140]:
class MLP(nn.Module):
    def __init__(
            self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
    ):
        super().__init__()
        print("input_dim:", input_dim)
        print("input_dim + hidden_dim:", [input_dim] + [hidden_dim])
        print("input_dim + hidden_dim * num_layers", [input_dim] + [hidden_dim] * num_layers)
        print("output_dim: "[output_dim])
        layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
        print(layer_sizes)
        self.layers = [
            nn.Linear(idim, odim)
            for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
        ]
        print("layers:\n", self.layers)
        print(layer_sizes[:-1]),
        print(layer_sizes[1:])
    def __call__(self, x):
        for l in self.layers[:-1]:
            x = mx.maximum(l(x), 0.0)
        return self.layers[-1](x)
    
mlp = MLP(4, 1, 1, 1)


input_dim: 1
input_dim + hidden_dim: [1, 1]
input_dim + hidden_dim * num_layers [1, 1, 1, 1, 1]
u
[1, 1, 1, 1, 1, 1]
layers:
 [Linear(input_dims=1, output_dims=1, bias=True), Linear(input_dims=1, output_dims=1, bias=True), Linear(input_dims=1, output_dims=1, bias=True), Linear(input_dims=1, output_dims=1, bias=True), Linear(input_dims=1, output_dims=1, bias=True)]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]


In [90]:
c = a + b
mx.eval(c)

In [91]:
c = a + b
print(c)

array([2, 4, 6, 8], dtype=float32)


In [92]:
c = a + b
import numpy as np
np.array(c)


array([2., 4., 6., 8.], dtype=float32)

In [93]:
a = mx.random.normal((100,))
b = mx.random.normal((100,))

In [94]:
c = mx.add(a, b, stream=mx.cpu)
d = mx.add(a, c, stream=mx.gpu)
c

array([0.117236, -3.15298, -0.786867, ..., -0.17244, 1.42161, -1.90438], dtype=float32)

In [95]:
def fun(a, b, d1, d2):
  x = mx.matmul(a, b, stream=d1)
  for _ in range(500):
      b = mx.exp(b, stream=d2)
  return x, b


a = mx.random.uniform(shape=(4096, 512))
b = mx.random.uniform(shape=(512, 4))

In [96]:
import time

start_time = time.time()
x_gpu, b_gpu = fun(a, b, mx.gpu, mx.gpu)
gpu_time = (time.time() - start_time) * 1000  # Convert to milliseconds
print(f"Time for computation fully on GPU: {gpu_time:.2f} milliseconds")

Time for computation fully on GPU: 0.33 milliseconds


In [97]:
# Measure time for computation with GPU and CPU
start_time = time.time()
x_cpu_gpu, b_cpu_gpu = fun(a, b, mx.gpu, mx.cpu)
cpu_gpu_time = (time.time() - start_time) * 1000  # Convert to milliseconds
print(f"Time for computation with GPU and CPU: {cpu_gpu_time:.2f} milliseconds")

Time for computation with GPU and CPU: 0.36 milliseconds


In [98]:
arr = mx.arange(10)
arr

array([0, 1, 2, ..., 7, 8, 9], dtype=int32)

In [99]:
arr[3]

array(3, dtype=int32)

In [100]:
arr[-2]

array(8, dtype=int32)

In [101]:
arr[2:8:2]

array([2, 4, 6], dtype=int32)

In [102]:
arr= mx.arange(8).reshape(2, 2, 2)

arr

array([[[0, 1],
        [2, 3]],
       [[4, 5],
        [6, 7]]], dtype=int32)

In [103]:
arr[:, :, 0]

array([[0, 2],
       [4, 6]], dtype=int32)

In [104]:
arr[..., 0]

array([[0, 2],
       [4, 6]], dtype=int32)

In [105]:
arr = mx.arange(8)
print(arr.shape)
arr[None].shape

(8,)


(1, 8)

In [106]:
import mlx.core as mx

num_features = 100
num_examples = 1_000
num_iters = 10_000  # iterations of SGD
lr = 0.01  # learning rate for SGD

In [107]:
w_star = mx.random.normal((num_features,))
w_star

array([-0.727584, 0.796034, -0.0982778, ..., -0.0917062, -0.884138, -0.462956], dtype=float32)

In [108]:
X = mx.random.normal((num_examples, num_features))
len(X)

1000

In [109]:
# Noisy labels
eps = 1e-2 * mx.random.normal((num_examples,))
y = X @ w_star + eps
y
len(y)

1000

In [110]:
def loss_fn(w):
    return 0.5 * mx.mean(mx.square(X @ w - y))

grad_fn = mx.grad(loss_fn)

In [111]:
import mlx.core as mx
import mlx.nn as nn

class MLP(nn.Module):
    def __init__(self, in_dims: int, out_dims: int):
        super().__init__()

        self.layers = [
            nn.Linear(in_dims, 128),
            nn.Linear(128, 128),
            nn.Linear(128, out_dims),
        ]

    def __call__(self, x):
        for i, l in enumerate(self.layers):
            x = mx.maximum(x, 0) if i > 0 else x
            x = l(x)
        return x

In [112]:
mlp = MLP(2, 10)

In [113]:
# We can access its parameters by calling mlp.parameters()
params = mlp.parameters()
print(params["layers"][0]["weight"].shape)


(128, 2)


In [114]:
# Printing a parameter will cause it to be evaluated and thus initialized
print(params["layers"][0])

# We can also force evaluate all parameters to initialize the model
mx.eval(mlp.parameters())

{'weight': array([[0.549127, -0.294675],
       [-0.509866, -0.687971],
       [-0.284932, 0.482793],
       ...,
       [-0.208546, 0.471682],
       [-0.294317, -0.128217],
       [0.0661728, -0.32373]], dtype=float32), 'bias': array([0.145026, 0.255807, -0.00748521, ..., 0.48975, -0.127781, 0.456474], dtype=float32)}


In [115]:
# A simple loss function.
# NOTE: It doesn't matter how it uses the mlp model. It currently captures
#       it from the local scope. It could be a positional argument or a
#       keyword argument.
def l2_loss(x, y):
    y_hat = mlp(x)
    return (y_hat - y).square().mean()

In [116]:
# Calling `nn.value_and_grad` instead of `mx.value_and_grad` returns the
# gradient with respect to `mlp.trainable_parameters()`
loss_and_grad = nn.value_and_grad(mlp, l2_loss)

In [117]:
print(mlp)

MLP(
  (layers.0): Linear(input_dims=2, output_dims=128, bias=True)
  (layers.1): Linear(input_dims=128, output_dims=128, bias=True)
  (layers.2): Linear(input_dims=128, output_dims=10, bias=True)
)


In [118]:
from mlx.utils import tree_map
shapes = tree_map(lambda p: p.shape, mlp.parameters())
shapes

{'layers': [{'weight': (128, 2), 'bias': (128,)},
  {'weight': (128, 128), 'bias': (128,)},
  {'weight': (10, 128), 'bias': (10,)}]}

In [119]:
from mlx.utils import tree_flatten
num_params = sum(v.size for _, v in tree_flatten(mlp.parameters()))
num_params

18186