In [33]:
import torch, sys, glob, os, math
from pathlib import Path
import numpy as np
from multiprocessing import Pool
from copy import deepcopy
import pytorch_lightning as pl
from torch.utils.data import random_split, DataLoader
from typing import Optional
import itertools


# DataSets and DataLoaders

In [34]:
sample_locs = np.load("data/sample_locs.npy")


In [35]:
class CC100DataSet(torch.utils.data.Dataset):
    def __init__(self, file_path, num_workers):
        super().__init__()
        self.sample_locs = sample_locs
        self.file_handles = [open(file_path, "rb") for _ in range(num_workers)]
        self.file_handles.append(open(file_path, "rb"))

    def __len__(self):
        return len(self.sample_locs)

    def __getitem__(self, idx):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            worker_id = 0
        else:
            worker_id = worker_info.id
        self.file_handles[worker_id].seek(self.sample_locs[idx])
        count = int.from_bytes(
            self.file_handles[worker_id].read(2), byteorder=sys.byteorder, signed=False
        )
        arr = np.frombuffer(
            self.file_handles[worker_id].read(count * 4), count=count, dtype=np.int32
        )
        return arr


In [36]:
class CC100DataModule(pl.LightningDataModule):
    def __init__(
        self, dataset: torch.utils.data.Dataset, path: str, num_workers: int = 1
    ):
        super().__init__()
        self.dataset = dataset(path, num_workers)
        self.num_workers = num_workers

    def prepare_data(self):
        pass

    def setup(self, stage: Optional[str] = None):
        if stage == "fit" or stage is None:
            train_len = int(len(self.dataset) * 0.8)
            train_len = 1000
            self.dataset_train, self.dataset_val = random_split(
                self.dataset,
                [train_len, len(self.dataset) - train_len],
                generator=torch.Generator().manual_seed(42),
            )

        if stage == "test" or stage is None:
            self.dataset_test = self.dataset

        if stage == "predict" or stage is None:
            self.dataset_predict = self.dataset

    def train_dataloader(self):
        return DataLoader(
            self.dataset_train,
            batch_size=2,
            collate_fn=self._collate_wrapper,
            num_workers=self.num_workers,
            shuffle=True,
            persistent_workers=True,
            prefetch_factor=4,
        )

    def val_dataloader(self):
        return DataLoader(
            self.dataset_val,
            batch_size=2,
            collate_fn=self._collate_wrapper,
            num_workers=self.num_workers,
            prefetch_factor=4,
        )

    def test_dataloader(self):
        return DataLoader(
            self.dataset_test,
            batch_size=2,
            collate_fn=self._collate_wrapper,
            num_workers=self.num_workers,
            prefetch_factor=4,
        )

    def predict_dataloader(self):
        return DataLoader(
            self.dataset_predict,
            batch_size=2,
            collate_fn=self._collate_wrapper,
            num_workers=self.num_workers,
            prefetch_factor=4,
        )

    # def transfer_batch_to_device(batch, device, dataloader_idx):
    #     return [torch.as_tensor(x, device=device) for x in batch]

    def on_after_batch_transfer(self, batch, dataloader_idx):
        # TODO batch 1
        batch = batch.unfold(1, min(1024, int(batch.shape[1] / 2)), 1)
        batch = batch[(batch != 3).logical_or(batch != 1).any(axis=2)]
        return batch[:, :-1], batch[:, -1]

    def _collate_wrapper(self, batch):
        # TODO filter large batches
        b_max_len = len(max(batch, key=len))
        if b_max_len > 9999:
            print(b_max_len, max(batch, key=len))
        batch = np.array(
            [
                np.pad(
                    x,
                    (2 * min(1024, b_max_len) - len(x), 0),
                    "constant",
                    constant_values=(3),
                )
                for x in batch
            ]
        )
        # faster right_padding: batch = np.column_stack(list(itertools.zip_longest(*l, fillvalue=3)))
        # TODO type
        return torch.as_tensor(batch, dtype=torch.int32)


In [37]:
a = [1, 2, 3, 4]
# b = [3,4,5]
arr = np.array([a])
ten = torch.as_tensor(arr, dtype=torch.int32)
ten


tensor([[1, 2, 3, 4]], dtype=torch.int32)

In [38]:
ten.unfold(1, 2, 1)


tensor([[[1, 2],
         [2, 3],
         [3, 4]]], dtype=torch.int32)

In [39]:
data_module = CC100DataModule(CC100DataSet, "data/ru_small.bin", 8)
# data_module.prepare_data()
# data_module.setup()
# data_loader = data_module.train_dataloader()


In [40]:
# x = next(iter(data_loader))
# x
# c = 0
# for _ in data_loader:
#     list(map(lambda x: torch.Tensor(x), _))
#     c += 1
# c


In [41]:
# x.shape


In [42]:
# x[0].unfold(0,194,1)


In [43]:
# pad+2
# len(x)/2
# y = x.unfold(1, min(1024, int(x.shape[1] / 2)), 1)
# print(y.shape)
# print((y != 3).logical_or(y != 1).any(axis=2).shape)
# y = y[(y != 3).logical_and(y != 1).any(axis=2)]
# y=y.reshape(-1,194)
# y[(y!=3).any(axis=1)].shape
# y
# y.view(y.size(0), -1)


In [44]:
import random
class TestDataSet(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [torch.full((5,), random.choice([0,1,2,3,4,5]), dtype=torch.int32) for x in range(10000)]

    def __len__(self):
        return len(self.data )

    def __getitem__(self, idx):
        return self.data[idx]

    # def _collate_wrapper(self, batch):
    #     return batch[:,:-1], batch[:,-1]

test_ds = TestDataSet()
test_dl = DataLoader(test_ds, batch_size=8)

In [49]:
import os
from torch import optim, nn
import pytorch_lightning as pl

# define any number of nn.Modules (or use your current ones)

# embedding = 
# decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

# define the LightningModule
class LitAutoEncoder(pl.LightningModule):
    def __init__(self, d_emb = 512, vocab_size = 32000):
        super().__init__()
        self.d_emb=d_emb
        self.vocab_size=vocab_size
        self.embedding = nn.Embedding(self.vocab_size, self.d_emb)
        self.encoder = nn.Sequential(nn.Linear(self.d_emb, 2048), nn.Tanh(), nn.Linear(2048, self.d_emb))
        self.w = nn.Parameter(torch.zeros(self.embedding.weight.shape), requires_grad=True) #why
        # self.encoder = nn.Sequential(nn.Linear(4, 128), nn.Tanh(), nn.Linear(128, 1))

    def forward(self, x):
        res = self.encoder(self.embedding(x)).mean(-2) @ self.w.T
        # res = self.encoder(self.embedding(x)).mean(-2) @ self.embedding.weight.T
        # res = self.encoder(x)
        return res
        # return res.softmax(0)

    def training_step(self, batch, batch_idx):
        # x, y = batch[:,:-1], batch[:,-1]
        # y = y.unsqueeze(-1).float()
        x, y = batch
        # self.y_o = y
        y = nn.functional.one_hot(y.clone().detach().long(), num_classes=self.vocab_size).float()
        y_hat = self(x)
        # y_hat = y_hat 
        # y_hat = torch.nn.functional.softmax(y_hat, dim=0)
        # self.x = x
        # self.y = y
        # self.hat = y_hat
        # print(y, y_hat)
        # loss = nn.functional.cross_entropy(y, y_hat,reduction='sum')
        loss = nn.functional.mse_loss(y, y_hat,reduction='sum')
        # print((loss))
        # print((y_hat - y).sum())

        self.log("train_loss", loss, logger=True)
        self.log("train_loss_epoch", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001)
        return optimizer


# init the autoencoder
autoencoder = LitAutoEncoder()
# e1=autoencoder.embedding.weight.clone()
# p1=autoencoder.named_parameters()

In [50]:
# list(p1)

In [51]:
# a= torch.arange(24).reshape(2,3,4)
# print(a)
# a.sum(-2)
# nn.functional.one_hot(torch.tensor([32000]),num_classes=5).float()


In [52]:
trainer = pl.Trainer(limit_train_batches=1000, max_epochs=1, accelerator="gpu", devices=1)
trainer.fit(model=autoencoder, datamodule=data_module)
# trainer.fit(model=autoencoder, train_dataloaders=test_dl)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name      | Type       | Params
-----------------------------------------
0 | embedding | Embedding  | 16.4 M
1 | encoder   | Sequential | 2.1 M 
-----------------------------------------
34.9 M    Trainable params
0         Non-trainable params
34.9 M    Total params
139.471   Total estimated model params size (MB)


Epoch 0:  14%|█▍        | 71/500 [00:25<02:31,  2.84it/s, loss=463, v_num=7]


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7e1295f910>
Traceback (most recent call last):
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1481, in __del__
    self._shutdown_workers()
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1445, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/multiprocessing/connection.py", line 936, in wait
    ready = selector.select(timeout)
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/pytho

Epoch 0:  14%|█▍        | 71/500 [00:45<04:36,  1.55it/s, loss=463, v_num=7]


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7e1295f910>
Traceback (most recent call last):
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1481, in __del__
    self._shutdown_workers()
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1464, in _shutdown_workers
    if w.is_alive():
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process


Epoch 0:  14%|█▍        | 71/500 [00:48<04:55,  1.45it/s, loss=463, v_num=7] 


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7e1295f910>
Traceback (most recent call last):
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1481, in __del__
    self._shutdown_workers()
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1464, in _shutdown_workers
    if w.is_alive():
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process


Epoch 0:  14%|█▍        | 71/500 [00:53<05:24,  1.32it/s, loss=463, v_num=7]]
Epoch 0:  25%|██▌       | 126/500 [00:28<01:25,  4.36it/s, loss=109, v_num=8]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7e1295f910>
Traceback (most recent call last):
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1481, in __del__
    self._shutdown_workers()


Epoch 0:  25%|██▌       | 127/500 [00:28<01:24,  4.39it/s, loss=109, v_num=8]

  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1464, in _shutdown_workers
    if w.is_alive():
  File "/home/is/armin-sa/miniconda3/envs/pytorch/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process


Epoch 0:  36%|███▌      | 181/500 [00:32<00:57,  5.58it/s, loss=95.8, v_num=8]

In [None]:
with torch.no_grad():
    b=autoencoder(torch.full((2,),3))
b.argmax()

tensor(3)

In [None]:
(autoencoder.embedding.weight == e1).all()

NameError: name 'e1' is not defined

In [None]:
autoencoder.y[1]

tensor(0, device='cuda:0')

In [None]:
autoencoder.y_o[1]

tensor(0, device='cuda:0')

In [None]:
autoencoder.hat[1]

tensor([-0.4807], device='cuda:0', grad_fn=<SelectBackward0>)

In [None]:
autoencoder.x[1]

tensor([1., 1., 1., 1., 1.], device='cuda:0')