In [1]:
from pathlib import Path

def read_imdb_split(split_dir):
    """Read one split of the aclImdb dataset from disk.

    Expects ``split_dir`` to contain a ``pos`` and a ``neg`` sub-directory,
    each holding one review per file.

    Args:
        split_dir: Path (str or ``Path``) of the split, e.g. ``aclImdb/train``.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. ``texts`` holds
        the file contents; ``labels`` holds 1 for files under ``pos`` and 0
        for files under ``neg``. All ``pos`` reviews come first, and within a
        class the files are ordered by path.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        label = 0 if label_dir == "neg" else 1
        # iterdir() yields entries in arbitrary order; sort for reproducibility.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Skip stray directories (e.g. .ipynb_checkpoints) instead of crashing.
            if not text_file.is_file():
                continue
            # Be explicit about the encoding so the result does not depend on
            # the platform's default locale.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)

    return texts, labels

# Load review texts and 0/1 labels for the train and test splits from the
# local `aclImdb` directory (paths are relative to the working directory).
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training reviews for validation. A fixed random_state
# makes the split reproducible across kernel restarts.
# NOTE: this rebinds train_texts/train_labels, so re-running the cell without
# re-running the loading cell splits the already reduced training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=42
)

import os
# Set TOKENIZERS_PARALLELISM=false in this process's environment before the
# tokenizer is created (presumably to avoid the tokenizers fork/parallelism
# warning when DataLoaders are used -- TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# NOTE(review): `!` runs this assignment in a separate shell, so it presumably
# does not change the kernel's environment. The training cells pick the GPU
# explicitly with torch.device('cuda:2') -- confirm this line is still needed.
!CUDA_VISIBLE_DEVICES=2

In [4]:
from transformers import DistilBertTokenizerFast
# Load the pretrained fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [5]:
# Tokenize each split in one call, with truncation and padding enabled.
# The results are indexed per example by IMDbDataset.__getitem__.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [6]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            that can be indexed per example.
        labels: Sequence of per-example labels; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so a DataLoader can batch them.
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
import numpy as np
import pytorch_warmup as warmup

import wandb


from pathlib import Path  # used below to create the checkpoint directory

# ---- Hyperparameters --------------------------------------------------------
batch_size = 16
lr = 5e-5
epoch = 3          # number of training epochs
eval_every = 1000  # run validation every `eval_every` optimizer steps

# Pass the hyperparameters through wandb.init(config=...). Assigning a plain
# dict to `wandb.config` only rebinds the module attribute instead of
# recording the values on the run.
wandb.init(
    project="NLP_test_Emotional+Anaysis",
    entity="zzh110",
    name="base_lr_warmup_val",
    config={
        "learning_rate": lr,
        "epochs": epoch,
        "batch_size": batch_size,
    },
)
device = torch.device('cuda:2') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

# Optional: gradient logging. Must be registered before training to see anything.
wandb.watch(model)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation order does not matter, so no shuffling is needed.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Cosine decay over the whole run, damped by a linear warmup.
# (Per the pytorch_warmup docs the untuned warmup length is derived from
# beta2: 2 / (1 - 0.999) = 2000 steps -- TODO confirm.)
optim = AdamW(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=0.01)
num_steps = len(train_loader) * epoch
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=num_steps)
warmup_scheduler = warmup.UntunedLinearWarmup(optim)

best_val_loss = float("inf")
PATH = "model_save/1_best_loss.pth"
# torch.save does not create missing directories.
Path(PATH).parent.mkdir(parents=True, exist_ok=True)

global_step = 0  # optimizer steps taken so far
# Use a separate loop variable so the `epoch` hyperparameter is not overwritten.
for epoch_idx in range(epoch):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        with warmup_scheduler.dampening():
            lr_scheduler.step()
        global_step += 1

        predictions = torch.argmax(outputs[1].detach(), dim=1)
        accuracy = (predictions == labels).float().mean().item()
        # Log plain Python numbers, and read the learning rate from the
        # optimizer so the value includes the warmup dampening.
        wandb.log({"loss": loss.item(),
                   "Accuracy": accuracy,
                   "lr": optim.param_groups[0]["lr"]})

        if global_step % eval_every == 0:
            model.eval()
            val_loss_sum = 0.0
            val_correct = 0
            val_count = 0
            with torch.no_grad():
                for val_batch in val_loader:
                    # Names are prefixed with `batch_` so the global
                    # `val_labels` list from the split cell is not clobbered.
                    batch_val_input_ids = val_batch['input_ids'].to(device)
                    batch_val_attention_mask = val_batch['attention_mask'].to(device)
                    batch_val_labels = val_batch['labels'].to(device)
                    val_outputs = model(batch_val_input_ids,
                                        attention_mask=batch_val_attention_mask,
                                        labels=batch_val_labels)
                    n_examples = batch_val_labels.size(0)
                    # Weight by batch size so a short last batch is not over-counted.
                    val_loss_sum += val_outputs[0].item() * n_examples
                    val_predictions = torch.argmax(val_outputs[1], dim=1)
                    val_correct += (val_predictions == batch_val_labels).sum().item()
                    val_count += n_examples

            val_loss_avg = val_loss_sum / max(val_count, 1)
            val_accuracy_avg = val_correct / max(val_count, 1)
            wandb.log({"val_loss": val_loss_avg,
                       "val_Accuracy": val_accuracy_avg})

            # Save only when the validation loss improves, and remember the best.
            if val_loss_avg < best_val_loss:
                best_val_loss = val_loss_avg
                torch.save(model.state_dict(), PATH)
            model.train()

model.eval()



In [9]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
import numpy as np
import pytorch_warmup as warmup

import wandb


# ---- Hyperparameters --------------------------------------------------------
batch_size = 16
lr = 5e-5
epoch = 3  # number of training epochs

# Pass the hyperparameters through wandb.init(config=...). Assigning a plain
# dict to `wandb.config` only rebinds the module attribute instead of
# recording the values on the run.
wandb.init(
    project="NLP_test_Emotional+Anaysis",
    entity="zzh110",
    name="base_Xoverfit",
    config={
        "learning_rate": lr,
        "epochs": epoch,
        "batch_size": batch_size,
    },
)
device = torch.device('cuda:2') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

# Optional: gradient logging. Must be registered before training to see anything.
wandb.watch(model)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Baseline run: constant learning rate, no warmup or cosine schedule.
optim = AdamW(model.parameters(), lr=lr)

# Use a separate loop variable so the `epoch` hyperparameter is not overwritten.
for epoch_idx in range(epoch):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

        predictions = torch.argmax(outputs[1].detach(), dim=1)
        accuracy = (predictions == labels).float().mean().item()
        # One log call per step, with plain Python numbers instead of tensors.
        wandb.log({"loss": loss.item(),
                   "Accuracy": accuracy,
                   "lr": lr})

model.eval()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016677044266058753, max=1.0…

wandb: ERROR Error while calling W&B API: run zzh110/NLP_test_Emotional+Anaysis/2s2w8xfg was previously created and deleted; try a new run name (<Response [409]>)


Problem at: /tmp/ipykernel_113284/2919634494.py 9 <module>


KeyboardInterrupt: 

In [50]:
pip install -U pytorch_warmup

Collecting pytorch_warmup
  Downloading pytorch_warmup-0.1.1-py3-none-any.whl (6.6 kB)
Installing collected packages: pytorch_warmup
Successfully installed pytorch_warmup-0.1.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Scratch cell: fraction of rows where the argmax over axis 1 of `out` and `y` agree.
# NOTE(review): `out` and `y` are not defined anywhere in the visible notebook, and
# the result name looks like a typo of "accuracy" -- confirm or delete this cell.
accracy = np.mean((torch.argmax(out,1)==torch.argmax(y,1)).numpy())

In [9]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
import wandb


# ---- Hyperparameters --------------------------------------------------------
batch_size = 16
lr = 1e-5
epoch = 3  # number of training epochs

# Pass the hyperparameters through wandb.init(config=...). Assigning a plain
# dict to `wandb.config` only rebinds the module attribute instead of
# recording the values on the run.
wandb.init(
    project="NLP_test_Emotional+Anaysis",
    entity="zzh110",
    name="base_lr_1",
    config={
        "learning_rate": lr,
        "epochs": epoch,
        "batch_size": batch_size,
    },
)
device = torch.device('cuda:2') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

# Optional: gradient logging. Must be registered before training to see anything.
wandb.watch(model)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

optim = AdamW(model.parameters(), lr=lr)

# Use a separate loop variable so the `epoch` hyperparameter is not overwritten.
for epoch_idx in range(epoch):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        # Log a plain Python float rather than a tensor.
        wandb.log({"loss": loss.item()})
    # The learning rate is constant in this run; it is logged once per epoch.
    wandb.log({"lr": lr})

model.eval()

VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,█▇▃▆▆▃▂▃▂▅▃▂▁▂▃▂▁▁▂▄▂▃▄▁▃▃▃▂▁▂▂▂▁▂▃▁▂▁▁▂

0,1
loss,0.01637


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666826643437768, max=1.0)…

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

wandb: Network error (ConnectTimeout), entering retry loop.


In [22]:
from datasets import load_dataset
# Load the "train" split of the "imdb" dataset via the `datasets` library
# (the cell output below shows it being served from the local cache).
train = load_dataset("imdb", split="train")

Found cached dataset imdb (/homeB/zhuzhihao/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


In [23]:
# Inspect the available columns (output below: 'text' and 'label').
print(train.column_names)


['text', 'label']


In [24]:
from transformers import DistilBertTokenizerFast
# Re-create the 'distilbert-base-uncased' tokenizer for the `datasets`-based
# pipeline (same call as in the earlier tokenizer cell).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [26]:
# Tokenize the "text" column in batches, with truncation and padding enabled.
# NOTE: this rebinds `train` to the mapped dataset.
train = train.map(lambda batch: tokenizer(batch["text"], truncation=True, padding=True), batched=True)
# Disabled: in-place rename of "label" to "labels" (left off; the set_format cell uses "label").
# train.rename_column_("label", "labels")

Loading cached processed dataset at /homeB/zhuzhihao/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-03070988166662d6.arrow


In [16]:
# Return only these three columns, as torch tensors, when indexing `train`.
# NOTE(review): execution counts show this cell ran out of order (In [16] after
# In [26]); it must run after the tokenization cell for the columns to exist.
train.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [17]:
# Show the tensor shape of every field of the first example.
{key: val.shape for key, val in train[0].items()}

{'label': torch.Size([]),
 'input_ids': torch.Size([512]),
 'attention_mask': torch.Size([512])}

In [None]:
# Duplicate of the shape-inspection cell above (never executed).
{key: val.shape for key, val in train[0].items()}

In [27]:
# Display the first example. The output below still includes the raw 'text'
# field, so the torch format was presumably not active when this cell ran.
train[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be