## Measure inference performance of PyTorch model on CPU

First, we are going to measure the inference performance of an already-trained PyTorch model on CPU. 

In [9]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
import time
import numpy as np


In [None]:
from utilities import get_max_item_id, build_model_from_ckpt
from utilities import SSEPT

DEVICE = torch.device("cpu")
max_item_id = get_max_item_id("/mnt/data/evaluation/movielens_192m_eval.txt")

model = build_model_from_ckpt("SSE_PT10kemb.pth", DEVICE, item_num=max_item_id)
model = torch.compile(model)
model.eval()

<pre style="font-size:84%; line-height:1.3em; font-family:monospace;">
OptimizedModule(
  (_orig_mod): SSEPT(
    (user_embedding): Embedding(10001, 100)
    (item_embedding): Embedding(84433, 100, padding_idx=0)
    (position_embedding): Embedding(100, 100)
    (dropout): Dropout(p=0.5, inplace=False)
    (blocks): ModuleList(
      (0-1): 2 x TransformerBlock(
        (attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (ffn): PointWiseFeedForward(
          (conv1): Conv1d(100, 100, kernel_size=(1,), stride=(1,))
          (conv2): Conv1d(100, 100, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.5, inplace=False)
          (activation): ReLU()
        )
      )
    )
    (final_linear): Linear(in_features=200, out_features=100, bias=True)
    (output_layer): Linear(in_features=100, out_features=84432, bias=True)
  )
)
</pre>

and also prepare our test dataset:

In [11]:
from utilities import pad_or_truncate

class SequentialEvalDataset(Dataset):
    def __init__(self, filepath, seq_max_len):
        self.user_sequences = {}
        with open(filepath, "r") as f:
            for line in f:
                uid, iid = map(int, line.strip().split("\t"))
                self.user_sequences.setdefault(uid, []).append(iid)

        # 构造 (user_id, sequence, label)
        self.samples = []
        self.seq_max_len = seq_max_len
        for uid, seq in self.user_sequences.items():
            if len(seq) < 2:
                continue
            self.samples.append((uid, seq[:-1], seq[-1]))

        print(f"Loaded {len(self.samples)} valid sequences from {filepath}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        uid, seq, label = self.samples[idx]
        seq_tensor = torch.tensor(pad_or_truncate(seq, self.seq_max_len), dtype=torch.long)
        return torch.tensor(uid, dtype=torch.long), seq_tensor, torch.tensor(label, dtype=torch.long)

# 加载数据
seq_max_len = model.seq_max_len
dataset = SequentialEvalDataset("/mnt/data/evaluation/movielens_192m_eval.txt", seq_max_len)
loader = DataLoader(dataset, batch_size=64, shuffle=False)


Loaded 1374159 valid sequences from /mnt/data/evaluation/movielens_192m_eval.txt


In [12]:
user_batch, seq_batch, _ = next(iter(loader))

print("Max item id in batch:", seq_batch.max().item())
print("Embedding size:", model.item_embedding.num_embeddings)


Max item id in batch: 77051
Embedding size: 84433


We will measure:

-   the size of the model on disk
-   the latency when doing inference on single samples
-   the throughput when doing inference on batches of data
-   and the test accuracy

#### Model size

We’ll start with model size. 

In [13]:
model_size = os.path.getsize(MODEL_PATH) 
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")

Model Size on Disk: 5.56 MB


#### Inference latency

Now, we’ll measure how long it takes the model to return a prediction for a single sample. We will run 100 trials, and then compute aggregate statistics.

In [None]:
uid, seq_tensor, _ = dataset[0]  
user_tensor = uid.unsqueeze(0).to(DEVICE)
seq_tensor = seq_tensor.unsqueeze(0).to(DEVICE)

latencies = []
for _ in range(100):
    start = time.time()
    with torch.no_grad():
        _ = model(user_tensor, seq_tensor)
    latencies.append(time.time() - start)
lat_ms = [t * 1000 for t in latencies]
print(f"Single sample latency median: {torch.median(torch.tensor(lat_ms)):.2f} ms")

print(f"Inference Latency (median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (95th): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (99th): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Throughput (single sample): {100 / np.sum(latencies):.2f} FPS")

W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] Graph break from `Tensor.item()`, consider setting:
W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0]     torch._dynamo.config.capture_scalar_outputs = True
W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] or:
W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0]     env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] to include these operations in the captured graph.
W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] 
W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] Graph break: from user code at:
W0513 10:08:59.206000 44643 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0]   File "/home/jovyan/work/utilities.py", line 92, in forward
W05

Single sample latency median: 3.19 ms
Inference Latency (median): 3.19 ms
Inference Latency (95th): 3.63 ms
Inference Latency (99th): 112.37 ms
Throughput (single sample): 9.86 FPS


#### Batch throughput

Finally, we’ll measure the rate at which the model can return predictions for batches of data.

In [None]:
num_batches = 50 

# 从 evaluation loader 中取一个 batch
user_batch, seq_batch, _ = next(iter(loader))
user_batch = user_batch.to(DEVICE)
seq_batch = seq_batch.to(DEVICE)

# Warm-up run 
with torch.no_grad():
    _ = model(user_batch, seq_batch)

batch_times = []
with torch.no_grad():
    for _ in range(num_batches):
        start_time = time.time()
        _ = model(user_batch, seq_batch)
        batch_times.append(time.time() - start_time)

batch_fps = (user_batch.shape[0] * num_batches) / np.sum(batch_times) 
print(f"Batch Throughput: {batch_fps:.2f} FPS")


Batch Throughput: 5381.72 FPS


When you are done, download the fully executed notebook from the Jupyter container environment for later reference. (Note: because it is an executable file, and you are downloading it from a site that is not secured with HTTPS, you may have to explicitly confirm the download in some browsers.)

### Eager mode execution vs compiled model

We had just evaluated a model in eager mode. However, in some (although, not all) cases we may get better performance from compiling the model into a graph, and executing it as a graph.

Go back to the cell where the model is loaded, and add

``` python
model.compile()
```

just below the call to `torch.load`. Then, run the notebook again (“Run \> Run All Cells”).

When you are done, download the fully executed notebook **again** from the Jupyter container environment for later reference.

<!-- 

compute_gigaio 

  Model name:             AMD EPYC 7763 64-Core Processor
    CPU family:           25
    Model:                1
    Thread(s) per core:   2
    Core(s) per socket:   64

-->
<!-- summary for mobilenet model

Model Size on Disk: 9.23 MB
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 60.16 ms
Inference Latency (single sample, 95th percentile): 77.22 ms
Inference Latency (single sample, 99th percentile): 77.37 ms
Inference Throughput (single sample): 15.82 FPS
Batch Throughput: 83.66 FPS


Model Size on Disk: 9.23 MB
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 73.97 ms
Inference Latency (single sample, 95th percentile): 83.16 ms
Inference Latency (single sample, 99th percentile): 83.94 ms
Inference Throughput (single sample): 13.34 FPS
Batch Throughput: 98.80 FPS

-->
<!-- summary for mobilenet compiled model

Model Size on Disk: 9.23 MB
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 26.92 ms
Inference Latency (single sample, 95th percentile): 49.79 ms
Inference Latency (single sample, 99th percentile): 64.55 ms
Inference Throughput (single sample): 32.35 FPS
Batch Throughput: 249.08 FPS

Model Size on Disk: 9.23 MB
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 34.14 ms
Inference Latency (single sample, 95th percentile): 53.85 ms
Inference Latency (single sample, 99th percentile): 60.23 ms
Inference Throughput (single sample): 27.39 FPS
Batch Throughput: 281.65 FPS

-->
<!-- 

(Intel CPU)

Model Size on Disk: 9.23 MB
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 12.69 ms
Inference Latency (single sample, 95th percentile): 12.83 ms
Inference Latency (single sample, 99th percentile): 12.97 ms
Inference Throughput (single sample): 78.73 FPS
Batch Throughput: 161.27 FPS

With compiling

Model Size on Disk: 9.23 MB
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 8.47 ms
Inference Latency (single sample, 95th percentile): 8.58 ms
Inference Latency (single sample, 99th percentile): 8.79 ms
Inference Throughput (single sample): 117.86 FPS
Batch Throughput: 474.67 FPS



-->