In [80]:
import time
import torch
import torch.nn as nn
import numpy as np
import plotly.express as px
import pandas as pd

from tqdm import tqdm


import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch import tensor
from datasets import load_dataset
from transformers import BertTokenizer

Testing the low-rank property of matrix P, before & after applying softmax

In [81]:
def svd(matrix):
    return np.linalg.svd(matrix, compute_uv=False)


def plot(M, line=None):
    svd_list = svd(M)
    cumsum = np.cumsum(svd_list / sum(svd_list))
    cumsum = np.insert(cumsum, 0, 0)
    df = pd.DataFrame({'Eigenvalue Index': range(len(cumsum)), 'Normalized Cumulative Eigenvalue': cumsum})
    fig = px.line(df, x='Eigenvalue Index', y='Normalized Cumulative Eigenvalue', title='SVD Plot')
    if line is not None:
        fig.add_vline(x=line, line_dash="dash", line_color="red")
    fig.show()

In [82]:
high_rank_matrix = np.random.normal(size=(512, 512))

In [83]:
plot(high_rank_matrix)

In [84]:
A = np.random.normal(size=(512, 128))
B = np.random.normal(size=(128, 512))
low_rank_matrix = A @ B  #rank=128

In [85]:
plot(low_rank_matrix, line=128)

In [86]:
plot(torch.softmax(torch.tensor(low_rank_matrix), dim=1))

We remark that after applying the softmax function, the yielded matrix isn't as low rank as before.

Now we'll study the performance of Linformer against a basic transformer model using random generated data as inputs.

Implementation of Linformer and of a basic Transformer model:


In [87]:
class FeedForward(nn.Module):
    def __init__(self, d_embed):
        super().__init__()
        self.d_embed = d_embed
        self.linear_1 = nn.Linear(d_embed, 2 * d_embed)
        self.linear_2 = nn.Linear(2 * d_embed, d_embed)
        self.activation = nn.ReLU()

    def forward(self, x):
        y = self.linear_1(x)
        y = self.activation(y)
        y = self.linear_2(y)
        return y

In [88]:
class LinformerAttention(nn.Module):
    def __init__(self, d_embed: int, seq_len: int, dropout: float, k_value: int):
        super().__init__()

        self.q_proj = nn.Linear(d_embed, d_embed, bias=False)
        self.k_proj = nn.Linear(d_embed, d_embed, bias=False)
        self.v_proj = nn.Linear(d_embed, d_embed, bias=False)
        self.k_value = k_value
        self.E = nn.Linear(seq_len, k_value, bias=False)
        self.F = nn.Linear(seq_len, k_value, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.d_embed = d_embed
        self.seq_len = seq_len

    def forward(self, x):
        # q shape : batch size (1) x seq_length x embedding_dim
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Projection
        k = self.E(k.transpose(-2, -1)).transpose(-2, -1)
        v = self.F(v.transpose(-2, -1)).transpose(-2, -1)

        s_ij = torch.bmm(q, k.mT)
        alpha_ij = nn.functional.softmax(s_ij, dim=-1)
        z = torch.bmm(alpha_ij, v)
        z = self.dropout(z)
        return z

In [89]:
class Linformer(nn.Module):
    def __init__(self, d_embed, seq_len, dropout, k):
        super().__init__()
        self.attention = LinformerAttention(d_embed, seq_len, dropout, k)
        self.feed_forward = FeedForward(d_embed)
        self.layer_norm_attention = nn.LayerNorm(d_embed)
        self.layer_norm_feed_forward = nn.LayerNorm(d_embed)

    def forward(self, x):
        z = self.attention(x)
        z = z + x
        z = self.layer_norm_attention(z)
        y = self.feed_forward(z)
        y = y + z
        y = self.layer_norm_feed_forward(y)
        return y

In [90]:
class Attention(nn.Module):
    def __init__(self, d_embed, dropout):
        super().__init__()
        self.q_proj = nn.Linear(d_embed, d_embed, bias=False)
        self.k_proj = nn.Linear(d_embed, d_embed, bias=False)
        self.v_proj = nn.Linear(d_embed, d_embed, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        s_ij = torch.bmm(q, k.mT)
        alpha_ij = nn.functional.softmax(s_ij, dim=-1)
        z = torch.bmm(alpha_ij, v)
        z = self.dropout(z)
        return z

In [91]:
class BasicTransformerLayer(nn.Module):
    def __init__(self, d_embed, dropout):
        super().__init__()
        self.attention = Attention(d_embed, dropout)
        self.feed_forward = FeedForward(d_embed)
        self.layer_norm_attention = nn.LayerNorm(d_embed)
        self.layer_norm_feed_forward = nn.LayerNorm(d_embed)

    def forward(self, x):
        z = self.attention(x)
        z = z + x
        z = self.layer_norm_attention(z)
        y = self.feed_forward(z)
        y = y + z
        y = self.layer_norm_feed_forward(y)
        return y

Testing of Memory consumption & Speed difference between Linformer & Transformer with varying k_value, i.e. with varying projected dimension.

In [92]:
seq_len = 128
batch_size = 32
d_embed = 512
dropout = 0.1

input_data = torch.rand(batch_size, seq_len, d_embed).to('cuda')

# inputs_data = [torch.rand(batch_size,seq_len, d_embed).to('cuda') for _ in range(1000)]

k_values = [i for i in range(1, seq_len + 1)]

time_results_transformer = {}
memory_results_transformer = {}

time_results_linformer = {}
memory_results_linformer = {}

In [93]:
# Here we decided to use, even though it is obviously useless as k has no impact
# on Transformers, the same testing architecture as for the Linformer model,
# to avoid any time/memory bias due to the testing architecture.

for k in tqdm(k_values):
    torch.cuda.reset_max_memory_allocated()  # Reset memory usage counter
    model = BasicTransformerLayer(d_embed, dropout).to('cuda')
    torch.cuda.synchronize()  # Wait for all kernels to finish
    start_time = time.time()
    with torch.no_grad():
        #for input_data in inputs_data:
        output = model(input_data)
    torch.cuda.synchronize()  # Wait for all kernels to finish
    time_taken = time.time() - start_time
    memory_used = torch.cuda.max_memory_allocated()  # Get peak memory usage

    time_results_transformer[k] = time_taken
    memory_results_transformer[k] = memory_used


torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, which resets /all/ peak memory stats.

100%|██████████| 128/128 [00:00<00:00, 139.04it/s]


In [94]:
for k in tqdm(k_values):
    torch.cuda.reset_max_memory_allocated()  # Reset memory usage counter
    model = Linformer(d_embed, seq_len, dropout, k).to('cuda')
    torch.cuda.synchronize()  # Wait for all kernels to finish
    start_time = time.time()
    with torch.no_grad():
        #for input_data in inputs_data:
        output = model(input_data)
    torch.cuda.synchronize()  # Wait for all kernels to finish
    time_taken = time.time() - start_time
    memory_used = torch.cuda.max_memory_allocated()  # Get peak memory usage

    time_results_linformer[k] = time_taken
    memory_results_linformer[k] = memory_used

100%|██████████| 128/128 [00:00<00:00, 154.97it/s]


In [95]:
time_data = pd.DataFrame({'Linformer Time results': time_results_linformer.values(),
                          'Transformer Time results': time_results_transformer.values()},
                         index=time_results_linformer.keys())
print(time_data)

memory_data = pd.DataFrame({'Linformer Memory results': memory_results_linformer.values(),
                            'Transformer Memory results': memory_results_transformer.values()},
                           index=memory_results_linformer.keys())
print(memory_data)

     Linformer Time results  Transformer Time results
1                  0.001000                  0.020005
2                  0.001000                  0.004000
3                  0.001001                  0.007000
4                  0.001000                  0.004000
5                  0.001000                  0.003999
..                      ...                       ...
124                0.001001                  0.000000
125                0.000000                  0.000000
126                0.000999                  0.000000
127                0.000000                  0.000000
128                0.001002                  0.000000

[128 rows x 2 columns]
     Linformer Memory results  Transformer Memory results
1                   142305280                   148782592
2                   141745664                   148782592
3                   141933568                   148595712
4                   141747712                   148221952
5                   142496256         

In [96]:
fig = px.line(time_data, labels={'value': 'Time consumption', 'index': 'projection dimension k'},
              title='Time complexity of Linformer vs Transformer')
fig.show()
fig = px.line(memory_data, labels={'value': 'Memory consumption', 'index': 'projection dimension k'},
              title='Memory complexity of Linformer vs Transformer')
fig.show()

After the first report, we decided it would be more insightful to give as inputs real world data. Additionally, we trained the model on this given data as it would now makes sense to do so. We slightly edited the previous classes to handle the conversion from tokens to embeddings.

In [97]:
class BasicTransformerLayer(nn.Module):
    def __init__(self, d_embed, num_classes, dropout):
        super().__init__()
        self.embedding = nn.Embedding(tokenizer.vocab_size, d_embed)
        self.attention = Attention(d_embed, dropout)
        self.feed_forward = FeedForward(d_embed)
        self.layer_norm_attention = nn.LayerNorm(d_embed)
        self.layer_norm_feed_forward = nn.LayerNorm(d_embed)
        self.fc_out = nn.Linear(d_embed, num_classes)

    def forward(self, x):
        # Embedding layer to convert token indices to embeddings
        x = self.embedding(x)  
        z = self.attention(x)
        z = z + x
        z = self.layer_norm_attention(z)
        y = self.feed_forward(z)
        y = y + z
        y = self.layer_norm_feed_forward(y)
        # Final classification layer
        y = y.mean(dim=1)
        y = self.fc_out(y)
        return y


In [98]:
dataset = load_dataset("scikit-learn/imdb", split="train")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
n_samples = 50000
dataset = dataset.shuffle(seed=2023)

d_embed = 50
num_classes = 2
B = 500
E = 10

seq_len = 256

time_results_transformer = {}
memory_results_transformer = {}

time_results_linformer = {}
memory_results_linformer = {}

k_values = [i for i in range(1, seq_len + 1)]

In [99]:
def preprocessing_fn(examples):
    tokenized_batch = tokenizer(
        examples["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=seq_len,
        padding='max_length',
        return_attention_mask=False
    )
    labels = [0 if sentiment == "negative" else 1 for sentiment in examples["sentiment"]]
    tokenized_batch = {k: tensor(v) for k, v in tokenized_batch.items()}
    tokenized_batch["labels"] = tensor(labels)
    return tokenized_batch


split_dataset = dataset.select(range(n_samples))

tokenized_dataset = split_dataset.map(preprocessing_fn, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels'])

train_valid_split = tokenized_dataset.train_test_split(test_size=0.2)

train_dataset = train_valid_split['train']
valid_dataset = train_valid_split['test']


class ClassificationDataset(Dataset):
    def __init__(self, tokenized_data):
        self.reviews = tokenized_data['input_ids']
        self.labels = tokenized_data['labels']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'review': self.reviews[idx],
            'label': self.labels[idx]
        }


train_set = ClassificationDataset(train_valid_split['train'])
valid_set = ClassificationDataset(train_valid_split['test'])

train_dataloader = DataLoader(dataset=train_set, batch_size=B)
valid_dataloader = DataLoader(dataset=valid_set, batch_size=B)

criterion = nn.CrossEntropyLoss()

# Training of transformer

In [100]:
torch.cuda.reset_max_memory_allocated()  # Reset memory usage counter
model_transformer = BasicTransformerLayer(d_embed, num_classes, dropout).to('cuda')
torch.cuda.synchronize()  # Wait for all kernels to finish
start_time = time.time()

optimizer_transformer = optim.Adam(model_transformer.parameters(), lr=0.001)

train_loss_trace_transformer = []
valid_loss_trace_transformer = []

for epoch in tqdm(range(E)):
    model_transformer.train()
    train_loss = 0
    for batch in train_dataloader:
        reviews, labels = batch['review'].to('cuda'), batch['label'].to('cuda')
        optimizer_transformer.zero_grad()
        outputs = model_transformer(reviews)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_transformer.step()
        train_loss += loss.item()
    torch.cuda.synchronize()  # Wait for all kernels to finish
    time_taken = time.time() - start_time
    memory_used = torch.cuda.max_memory_allocated()  # Get peak memory usage

    time_results_transformer[k] = time_taken
    memory_results_transformer[k] = memory_used
    train_loss = train_loss / len(train_dataloader)
    train_loss_trace_transformer.append(train_loss)
    model_transformer.eval()
    with torch.no_grad():
        valid_loss = 0

        for batch in valid_dataloader:
            reviews, labels = batch['review'].to("cuda"), batch['label'].to("cuda")
            outputs = model_transformer(reviews)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()

    valid_loss = valid_loss / len(valid_dataloader)
    valid_loss_trace_transformer.append(valid_loss)
    print(f"Epoch: {epoch}, Training Loss: {train_loss}, Valid Loss: {valid_loss}")


torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, which resets /all/ peak memory stats.

 10%|█         | 1/10 [00:00<00:08,  1.00it/s]

Epoch: 0, Training Loss: 0.6436757542192936, Valid Loss: 0.569489648938179


 20%|██        | 2/10 [00:01<00:07,  1.12it/s]

Epoch: 1, Training Loss: 0.5046243447810411, Valid Loss: 0.4545330390334129


 30%|███       | 3/10 [00:02<00:05,  1.18it/s]

Epoch: 2, Training Loss: 0.41712255105376245, Valid Loss: 0.4117554783821106


 40%|████      | 4/10 [00:03<00:04,  1.22it/s]

Epoch: 3, Training Loss: 0.365537478774786, Valid Loss: 0.3901593118906021


 50%|█████     | 5/10 [00:04<00:04,  1.20it/s]

Epoch: 4, Training Loss: 0.3255642060190439, Valid Loss: 0.38271668255329133


 60%|██████    | 6/10 [00:05<00:03,  1.22it/s]

Epoch: 5, Training Loss: 0.2892100499942899, Valid Loss: 0.38579548001289365


 70%|███████   | 7/10 [00:05<00:02,  1.24it/s]

Epoch: 6, Training Loss: 0.2568530077114701, Valid Loss: 0.395237971842289


 80%|████████  | 8/10 [00:06<00:01,  1.24it/s]

Epoch: 7, Training Loss: 0.22495425604283809, Valid Loss: 0.41475309282541273


 90%|█████████ | 9/10 [00:07<00:00,  1.26it/s]

Epoch: 8, Training Loss: 0.19780639037489892, Valid Loss: 0.4410803332924843


100%|██████████| 10/10 [00:08<00:00,  1.21it/s]

Epoch: 9, Training Loss: 0.1723117345944047, Valid Loss: 0.47336569130420686





# Training of Linformer

In [101]:
class Linformer(nn.Module):
    def __init__(self, d_embed, seq_len, num_classes, dropout, k):
        super().__init__()
        self.embedding = nn.Embedding(tokenizer.vocab_size, d_embed)
        self.attention = LinformerAttention(d_embed, seq_len, dropout, k)
        self.feed_forward = FeedForward(d_embed)
        self.layer_norm_attention = nn.LayerNorm(d_embed)
        self.layer_norm_feed_forward = nn.LayerNorm(d_embed)
        self.fc_out = nn.Linear(d_embed, num_classes)

    def forward(self, x):
        x = self.embedding(x)  
        z = self.attention(x)
        z = z + x
        z = self.layer_norm_attention(z)
        y = self.feed_forward(z)
        y = y + z
        y = self.layer_norm_feed_forward(y)
        y = y.mean(dim=1)
        y = self.fc_out(y)
        return y

In [None]:
valid_losses_linformer = pd.DataFrame(columns=[f"k = {k}" for k in k_values])
train_losses_linformer = pd.DataFrame(columns=[f"k = {k}" for k in k_values])

for k in tqdm(k_values): 
    torch.cuda.reset_max_memory_allocated()  # Reset memory usage counter
    model_linformer = Linformer(d_embed, seq_len, num_classes, dropout, k).to('cuda')
    torch.cuda.synchronize()  # Wait for all kernels to finish
    start_time = time.time()
    optimizer_linformer = optim.Adam(model_linformer.parameters(), lr=0.001)
    
    train_loss_trace_linformer = []
    valid_loss_trace_linformer = []
    
    for epoch in tqdm(range(E)):
        model_linformer.train()
        train_loss = 0
        for batch in train_dataloader:
            reviews, labels = batch['review'].to('cuda'), batch['label'].to('cuda')
            optimizer_linformer.zero_grad()
            outputs = model_linformer(reviews)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_linformer.step()
            train_loss += loss.item()
    
        train_loss = train_loss / len(train_dataloader)
        train_loss_trace_linformer.append(train_loss)
        model_linformer.eval()
        with torch.no_grad():
            valid_loss = 0
    
            for batch in valid_dataloader:
                reviews, labels = batch['review'].to("cuda"), batch['label'].to("cuda")
                outputs = model_linformer(reviews)
                loss = criterion(outputs, labels)
                valid_loss += loss.item()
        torch.cuda.synchronize()  # Wait for all kernels to finish
        time_taken = time.time() - start_time
        memory_used = torch.cuda.max_memory_allocated()  # Get peak memory usage
        time_results_linformer[k] = time_taken
        memory_results_linformer[k] = memory_used
        valid_loss = valid_loss / len(valid_dataloader)
        valid_loss_trace_linformer.append(valid_loss)
        print(f"Epoch: {epoch}, Training Loss: {train_loss}, Valid Loss: {valid_loss}, k = {k}")
    valid_losses_linformer[f"k = {k}"] = valid_loss_trace_linformer
    train_losses_linformer[f"k = {k}"] = train_loss_trace_linformer

In [116]:
time_data = pd.DataFrame({'Linformer Time results': time_results_linformer.values(),
                          'Transformer Time results': time_results_transformer.values()},
                         index=time_results_linformer.keys())

memory_data = pd.DataFrame({'Linformer Memory results': memory_results_linformer.values(),
                            'Transformer Memory results': memory_results_transformer.values()},
                           index=memory_results_linformer.keys())
fig = px.line(time_data, labels={'value': 'Time consumption', 'index': 'projection dimension k'},
              title='Time complexity of Linformer vs Transformer')

fig.show()
fig = px.line(memory_data, labels={'value': 'Memory consumption', 'index': 'projection dimension k'},
              title='Memory complexity of Linformer vs Transformer')

fig.show()

In [106]:
print(valid_loss_trace_transformer)

[0.569489648938179, 0.4545330390334129, 0.4117554783821106, 0.3901593118906021, 0.38271668255329133, 0.38579548001289365, 0.395237971842289, 0.41475309282541273, 0.4410803332924843, 0.47336569130420686]


In [107]:
print(valid_losses_linformer)

      k = 1     k = 2     k = 3     k = 4     k = 5     k = 6     k = 7  \
0  0.657015  0.668266  0.662703  0.671280  0.669303  0.662919  0.616687   
1  0.508554  0.510423  0.506883  0.546222  0.538771  0.514956  0.481558   
2  0.430645  0.420976  0.426712  0.443291  0.441656  0.439684  0.408603   
3  0.380744  0.370903  0.387805  0.395792  0.397072  0.391743  0.376668   
4  0.358135  0.347112  0.365428  0.364812  0.372924  0.373137  0.362414   
5  0.350628  0.336925  0.356525  0.349479  0.362052  0.368118  0.356906   
6  0.345922  0.334515  0.346147  0.354542  0.361156  0.367796  0.359221   
7  0.347335  0.336084  0.346020  0.346360  0.370300  0.366697  0.375819   
8  0.350775  0.344234  0.353646  0.357524  0.390708  0.394926  0.406774   
9  0.353236  0.351201  0.365391  0.374781  0.412850  0.394019  0.452611   

      k = 8     k = 9    k = 10  ...   k = 247   k = 248   k = 249   k = 250  \
0  0.604718  0.669921  0.675126  ...  0.592758  0.664552  0.591131  0.630056   
1  0.481163  0

In [117]:
fig = px.line(valid_losses_linformer, labels={'value': 'Losses values', 'index': 'Epochs'},
              title='Validation losses of Linformer with varying projection dimension k')

fig.show()

There is of course some overfitting happening here.

In [118]:
fig = px.line(train_losses_linformer, labels={'value': 'Losses values', 'index': 'Epochs'},
              title='Train losses of Linformer with varying projection dimension k')

fig.show()

In [119]:
fig = px.line(train_loss_trace_transformer, labels={'value': 'Losses values', 'index': 'Epochs'},
              title='Train losses of Transformer')

fig.show()

In [120]:
fig = px.line(valid_loss_trace_transformer, labels={'value': 'Losses values', 'index': 'Epochs'},
              title='Validation losses of Transformer')

fig.show()