In [26]:
import os
import sys
import numpy as np

In [2]:
import torch

In [3]:
from datasets import load_dataset

In [4]:
civil_comments = load_dataset("google/civil_comments")

In [5]:
dataset = civil_comments['train']

### Read the embeddings vector and add it as column in dataset

In [7]:
embeddings = torch.load("embeddings.pth")

In [8]:
# embeddings_tensor = torch.cat(embeddings, dim=0)
embeddings_list = []
for l in embeddings:
    embeddings_list.extend(l)

In [9]:
assert len(embeddings_list) == len(dataset)


In [9]:
# embeddings_dataset = dataset.add_column("embeddings", embeddings_list)

In [19]:
import numpy as np
def get_random_tensors(l, hdim=100):
    """returns l x hdim tensor"""
    return torch.rand(l, hdim)
def get_y(datum):
    """returns rtp signal: bsize x 8"""
    print(datum)
    cont = [d['continuation'] for d in datum]
    return torch.tensor(np.nan_to_num(np.array([list(l.values())[1:] for l in cont], dtype=np.float64)), dtype=torch.float64)

In [10]:
# lets batch encode a dataset:
class randomDataset(torch.utils.data.dataset.Dataset):
    def __init__(self, dataset, hdim=100):
        self.X = get_random_tensors(len(dataset), hdim)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx]
    
        
class Embeddings_Dataset(torch.utils.data.dataset.Dataset):
    def __init__(self, embeddings_list):
        self.X = embeddings_list
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx]

In [11]:
embeddings_dataset = Embeddings_Dataset(embeddings_list)

### Putting it all together

In [12]:
from tqdm import tqdm

In [14]:
# import wandb

In [32]:
# lets put it all together:
# wandb.init(project="deepGenTest")
hdim, odim, bsize, epochs, grad_accm_steps = 100, 8, 64, 10, 1
model = torch.nn.Sequential(
    torch.nn.Linear(hdim, hdim),
    torch.nn.BatchNorm1d(hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),

    
    torch.nn.Linear(hdim, 2*hdim),
    torch.nn.BatchNorm1d(2*hdim),
    torch.nn.GELU(),

    torch.nn.Linear(2*hdim, 4*hdim),
    torch.nn.BatchNorm1d(4*hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),

    torch.nn.Linear(4*hdim, 4*hdim),
    torch.nn.BatchNorm1d(4*hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),

    torch.nn.Linear(4*hdim, 4*hdim),
    torch.nn.BatchNorm1d(4*hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),


    torch.nn.Linear(4*hdim, 2*hdim),
    torch.nn.BatchNorm1d(2*hdim),
    torch.nn.GELU(),

    torch.nn.Linear(2*hdim, hdim),
    torch.nn.BatchNorm1d(hdim),
    torch.nn.GELU(),

    torch.nn.Linear(hdim, 100),
    torch.nn.GELU(),
    torch.nn.Linear(100, odim)
)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.1, weight_decay=0)
loss_fn = torch.nn.CrossEntropyLoss()
Xdata = randomDataset(dataset)
ydl = torch.utils.data.dataloader.DataLoader(dataset, batch_size=bsize, shuffle=False, collate_fn=get_y)
Xdl = torch.utils.data.dataloader.DataLoader(embeddings_dataset, bsize, shuffle=False)

for e in range(epochs):
    epoch_loss = 0
    for ybatch, xbatch in tqdm(zip(ydl, Xdl), total = len(ydl), leave=True, desc=f"Epoch: {e}"):
        optimizer.zero_grad()
        fwd = model(xbatch)
        loss = loss_fn(fwd, ybatch)
        loss.backward()
        optimizer.step()
        # wandb.log({"batch_loss" : loss.item()})
        epoch_loss += loss.item()
    # wandb.log({"epoch_loss" : epoch_loss})
    print(f"with loss: {epoch_loss}")
    

Epoch: 0:   0%|          | 0/28202 [00:00<?, ?it/s]

[{'text': "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!", 'toxicity': 0.0, 'severe_toxicity': 0.0, 'obscene': 0.0, 'threat': 0.0, 'insult': 0.0, 'identity_attack': 0.0, 'sexual_explicit': 0.0}, {'text': "Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!", 'toxicity': 0.0, 'severe_toxicity': 0.0, 'obscene': 0.0, 'threat': 0.0, 'insult': 0.0, 'identity_attack': 0.0, 'sexual_explicit': 0.0}, {'text': 'This is such an urgent design problem; kudos to you for taking it on. Very impressive!', 'toxicity': 0.0, 'severe_toxicity': 0.0, 'obscene': 0.0, 'threat': 0.0, 'insult': 0.0, 'identity_attack': 0.0, 'sexual_explicit': 0.0}, {'text': "Is this something I'll be able to install on my site? When will you be releasing it?", 'toxicity': 0.0, 'severe_toxicity': 0.0, 'obscene': 0.0, 'threat': 0.0, 'insult': 0.0, 'identity_attack': 0.0, 'sexual_explicit': 0.0}, {'text': 'haha you




KeyError: 'continuation'

# Lets do this for civil_chat

In [13]:
civil = load_dataset("google/civil_comments")

In [23]:
len(civil)

3

In [24]:
next(iter(civil))

'train'

In [25]:
for k in civil.keys():
    print(len(civil[k]))

1804874
97320
97320


In [26]:
next(iter(civil['train']))

{'text': "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",
 'toxicity': 0.0,
 'severe_toxicity': 0.0,
 'obscene': 0.0,
 'threat': 0.0,
 'insult': 0.0,
 'identity_attack': 0.0,
 'sexual_explicit': 0.0}

In [14]:
def civil_collate(datum):
    return torch.tensor(np.nan_to_num(np.array([list(l.values())[1:] for l in datum], dtype=np.float64)), dtype=torch.float64)

In [28]:
civil_collate([next(iter(civil['train']))])

tensor([[0., 0., 0., 0., 0., 0., 0.]], dtype=torch.float64)

In [17]:
import wandb

In [18]:
wandb.login(key="0a43ac008e0a75d36e8607a63179fe016933177b")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjdannemi[0m ([33mdeepest-learning-team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jdannemi/.netrc


True

In [22]:

run = wandb.init(
    name = "first_DNN_regression_model",
    reinit=True,
    # id= "",
    # resume = "must",
    project = "DeepGenProject"
    # Create your wandb run
)

In [29]:
# lets put it all together:
# wandb.init(project="deepGenTest")
hdim, odim, bsize, epochs, grad_accm_steps = 768, 7, 256, 10, 1
model = torch.nn.Sequential(
    torch.nn.Linear(hdim, hdim),
    torch.nn.BatchNorm1d(hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),

    
    torch.nn.Linear(hdim, 2*hdim),
    torch.nn.BatchNorm1d(2*hdim),
    torch.nn.GELU(),

    torch.nn.Linear(2*hdim, 4*hdim),
    torch.nn.BatchNorm1d(4*hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),

    torch.nn.Linear(4*hdim, 4*hdim),
    torch.nn.BatchNorm1d(4*hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),

    torch.nn.Linear(4*hdim, 4*hdim),
    torch.nn.BatchNorm1d(4*hdim),
    torch.nn.Dropout(0.2),
    torch.nn.GELU(),


    torch.nn.Linear(4*hdim, 2*hdim),
    torch.nn.BatchNorm1d(2*hdim),
    torch.nn.GELU(),

    torch.nn.Linear(2*hdim, hdim),
    torch.nn.BatchNorm1d(hdim),
    torch.nn.GELU(),

    torch.nn.Linear(hdim, 100),
    torch.nn.GELU(),
    torch.nn.Linear(100, odim)
)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.1, weight_decay=0)
loss_fn = torch.nn.CrossEntropyLoss()
# Xdata = randomDataset(civil['train'])
ydl = torch.utils.data.dataloader.DataLoader(civil['train'], batch_size=bsize, shuffle=False, collate_fn=civil_collate)
Xdl = torch.utils.data.dataloader.DataLoader(embeddings_dataset, bsize, shuffle=False)
wandb.watch(model, log="all")
best_epoch_loss = 0.0
for e in range(epochs):
    epoch_loss = 0.0
    for ybatch, xbatch in tqdm(zip(ydl, Xdl), total = len(ydl), leave=True, desc=f"Epoch: {e}"):
        i+=1
        optimizer.zero_grad()
        fwd = model(xbatch)
        loss += loss_fn(fwd, ybatch)
        if i % 4 == 0:
            # every 4 steps:
            loss.backward()
            optimizer.step()
        wandb.log({"batch_loss" : loss.item()})
        epoch_loss += loss.item()
    wandb.log({"epoch_loss" : epoch_loss})
    print(f"with loss: {epoch_loss}")
    if epoch_loss < best_epoch_loss:
        best_epoch_loss = epoch_loss
        torch.save(model.state_dict(), "best_model.pth")
run.finish()
    

Epoch: 0:   0%|          | 0/7051 [00:00<?, ?it/s]

Epoch: 0:  14%|█▍        | 971/7051 [02:47<17:32,  5.78it/s]

In [27]:
y = next(iter(ydl))
model(next(iter(Xdl))).shape

torch.Size([64, 7])

In [28]:
y.shape

torch.Size([64, 7])