In [1]:
import os
import datetime

def printbar():
    nowtime = datetime.datetime.now().strftime('%Y-%m%d %H:%M%S')
    print(f'\n======================================={nowtime}')

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 

In [2]:
import torch
import torchtext
import torchkeras

  from .autonotebook import tqdm as notebook_tqdm
2023-08-03 08:45:23.577810: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-03 08:45:23.620140: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import numpy as np
import pandas as pd
import torch 
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

MIN_FREQ = 30
MAX_LEN = 200
BATCH_SIZE = 20

df_train = pd.read_csv('/data/snlp/zhangjl/datas/ctr/eat_pytorch_datasets/imdb/train.tsv', 
                       sep='\t', header=None, names=['label', 'text'])

df_test = pd.read_csv('/data/snlp/zhangjl/datas/ctr/eat_pytorch_datasets/imdb/test.tsv', 
                       sep='\t', header=None, names=['label', 'text'])


In [4]:
tokenizer = get_tokenizer('basic_english')

PAD_IDX, UNK_IDX = 0, 1
special_symbols = ['', '']

In [5]:
def yield_tokens(dfdata):
    for text in dfdata['text']:
        yield tokenizer(text)

In [6]:
vocab = build_vocab_from_iterator(
    yield_tokens(df_train),
    min_freq=MIN_FREQ,
    # specials=special_symbols,
    special_first=True
    
)


In [7]:
vocab.set_default_index(UNK_IDX)
vocab_size = len(vocab)

print(f'vocab size is {vocab_size}')

# 查看前20 
print(f'vocab pre top 20 str {vocab.get_itos()[:20]}\n')


vocab size is 8811
vocab pre top 20 str ['the', '.', ',', 'and', 'a', 'of', 'to', "'", 'is', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for', 'with', 'movie']



In [8]:
# pad
def pad(seq, max_length, pad_value = 0):
    n = len(seq)
    result = seq + [pad_value] * max_length
    return result[:max_length]


# code transfer
def text_pipline(text):
    words = tokenizer(text)
    tokens = vocab(words)
    result = pad(tokens, MAX_LEN, PAD_IDX)
    return result

print(text_pipline("this is an example"))

[12, 8, 39, 459, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [9]:
from torch.utils.data import Dataset, DataLoader

class ImdbDataset(Dataset):
    def __init__(self, df):
        self.df = df 
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df['text'].iloc[index]
        label = torch.tensor([self.df['label'].iloc[index]]).float()
        tokens = torch.tensor(text_pipline(text)).int()
        return tokens, label
    
ds_train = ImdbDataset(df_train)
ds_test = ImdbDataset(df_test)

In [10]:
dl_train = DataLoader(ds_train, batch_size=50, shuffle=True)
dl_test = DataLoader(ds_test, batch_size=50, shuffle=False)

In [11]:
import torch
from torch import nn
torch.manual_seed(42)

<torch._C.Generator at 0x7ffb20b69370>

In [12]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size, 
                                      embedding_dim=3,
                                      padding_idx=0)
        
        self.conv = nn.Sequential()
        self.conv.add_module('conv_1', nn.Conv1d(in_channels=3, out_channels=16, kernel_size=5))
        self.conv.add_module('pool_1', nn.MaxPool1d(kernel_size=2))
        self.conv.add_module('relu_1', nn.ReLU())

        self.conv.add_module('conv_2', nn.Conv1d(in_channels=16, out_channels=128, kernel_size=2))
        self.conv.add_module('pool_2', nn.MaxPool1d(kernel_size=2))
        self.conv.add_module('relu_2', nn.ReLU())


        self.dense = nn.Sequential()
        self.dense.add_module('flatten', nn.Flatten())
        self.dense.add_module('linear', nn.Linear(6144, 1)) 

    def forward(self, x):
        x = self.embedding(x).transpose(1, 2) # batch x seqlen x emblen -> batch x emblen x seqlen 
        x = self.conv(x)
        y = self.dense(x)
        return y
net = Net()
print(net)
        


Net(
  (embedding): Embedding(8811, 3, padding_idx=0)
  (conv): Sequential(
    (conv_1): Conv1d(3, 16, kernel_size=(5,), stride=(1,))
    (pool_1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_1): ReLU()
    (conv_2): Conv1d(16, 128, kernel_size=(2,), stride=(1,))
    (pool_2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_2): ReLU()
  )
  (dense): Sequential(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear): Linear(in_features=6144, out_features=1, bias=True)
  )
)


In [13]:
import numpy as np
import pandas as pd
import datetime 
from tqdm import tqdm

import torch
from torch import nn
from copy import deepcopy

def printlog(info):
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n"+"=========="*8 + "%s"%nowtime)
    print(str(info)+"\n")

printlog('test')


test



In [17]:
class StepRunner:
    def __init__(self, net, loss_fn, stage='train', metrics_dict=None,
                 optimizer=None, lr_scheduler=None):
        self.net, self.loss_fn, self.metrics_dict, self.stage = net,loss_fn,metrics_dict,stage
        self.optimizer, self.lr_scheduler = optimizer, lr_scheduler

    def __call__(self, features, labels):
        # loss
        preds = self.net(features)
        loss = self.loss_fn(preds, labels)
    
        # backward
        if self.optimizer is not None and self.stage == 'train':
            loss.backward()
            self.optimizer.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

            self.optimizer.zero_grad()

        # metrics
        step_metrics = {self.stage+"_"+name:metric_fn(preds, labels).item() 
                        for name,metric_fn in self.metrics_dict.items()}
        
        return loss.item(), step_metrics
    

class EpochRunner:
    def __init__(self,steprunner):
        self.steprunner = steprunner
        self.stage = steprunner.stage
        self.steprunner.net.train() if self.stage=="train" else self.steprunner.net.eval()


    def __call__(self,dataloader):
        total_loss,step = 0,0
        loop = tqdm(enumerate(dataloader), total =len(dataloader))
        for i, batch in loop: 
            if self.stage=="train":
                loss, step_metrics = self.steprunner(*batch)
            else:
                with torch.no_grad():
                    loss, step_metrics = self.steprunner(*batch)
            step_log = dict({self.stage+"_loss":loss},**step_metrics)

            total_loss += loss
            step+=1
            if i!=len(dataloader)-1:
                loop.set_postfix(**step_log) # set_postfix 是 bar 后面的信息， set_description 是bar前面的信息
            else:
                epoch_loss = total_loss/step
                epoch_metrics = {self.stage+"_"+name:metric_fn.compute().item() 
                                for name,metric_fn in self.steprunner.metrics_dict.items()}
                epoch_log = dict({self.stage+"_loss":epoch_loss},**epoch_metrics)
                loop.set_postfix(**epoch_log)

                for name,metric_fn in self.steprunner.metrics_dict.items():
                    metric_fn.reset()
        return epoch_log

In [27]:
import sys
class KerasModel(torch.nn.Module):
    def __init__(self,net,loss_fn,metrics_dict=None,optimizer=None,lr_scheduler = None):
        super().__init__()
        self.history = {}
        
        self.net = net
        self.loss_fn = loss_fn
        self.metrics_dict = nn.ModuleDict(metrics_dict) 
        
        self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(
            self.parameters(), lr=1e-2)
        self.lr_scheduler = lr_scheduler

    def forward(self, x):
        if self.net:
            return self.net.forward(x)
        else:
            raise NotImplementedError


    def fit(self, train_data, val_data=None, epochs=10, ckpt_path='checkpoint.pt', 
            patience=5, monitor="val_loss", mode="min"):

        for epoch in range(1, epochs+1):
            printlog("Epoch {0} / {1}".format(epoch, epochs))
            
            # 1，train -------------------------------------------------  
            train_step_runner = StepRunner(net = self.net,stage="train",
                    loss_fn = self.loss_fn,metrics_dict=deepcopy(self.metrics_dict),
                    optimizer = self.optimizer, lr_scheduler = self.lr_scheduler)
            train_epoch_runner = EpochRunner(train_step_runner)
            train_metrics = train_epoch_runner(train_data)
            
            for name, metric in train_metrics.items():
                self.history[name] = self.history.get(name, []) + [metric]

            # 2，validate -------------------------------------------------
            if val_data:
                val_step_runner = StepRunner(net = self.net,stage="val",
                    loss_fn = self.loss_fn,metrics_dict=deepcopy(self.metrics_dict))
                val_epoch_runner = EpochRunner(val_step_runner)
                with torch.no_grad():
                    val_metrics = val_epoch_runner(val_data)
                val_metrics["epoch"] = epoch
                for name, metric in val_metrics.items():
                    self.history[name] = self.history.get(name, []) + [metric]
            
            # 3，early-stopping -------------------------------------------------
            if not val_data:
                continue
            arr_scores = self.history[monitor]
            best_score_idx = np.argmax(arr_scores) if mode=="max" else np.argmin(arr_scores)
            if best_score_idx==len(arr_scores)-1:
                torch.save(self.net.state_dict(),ckpt_path)
                print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
                     arr_scores[best_score_idx]),file=sys.stderr)
            if len(arr_scores)-best_score_idx>patience:
                print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
                    monitor,patience),file=sys.stderr)
                break 
                
        self.net.load_state_dict(torch.load(ckpt_path))  
        return pd.DataFrame(self.history)

    @torch.no_grad()
    def evaluate(self, val_data):
        val_step_runner = StepRunner(net = self.net,stage="val",
                    loss_fn = self.loss_fn,metrics_dict=deepcopy(self.metrics_dict))
        val_epoch_runner = EpochRunner(val_step_runner)
        val_metrics = val_epoch_runner(val_data)
        return val_metrics
        
       
    @torch.no_grad()
    def predict(self, dataloader):
        self.net.eval()
        result = torch.cat([self.forward(t[0]) for t in dataloader])
        return result.data

In [28]:
import torchmetrics 

class Accuracy(torchmetrics.Accuracy):
    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)
        
    def update(self, preds: torch.Tensor, targets: torch.Tensor):
        super().update(torch.sigmoid(preds),targets.long())
            
    def compute(self):
        return super().compute()
    
net = Net() 
model = KerasModel(net,
                  loss_fn = nn.BCEWithLogitsLoss(),
                  optimizer= torch.optim.Adam(net.parameters(),lr = 0.01),  
                  metrics_dict = {"acc":Accuracy(task='binary')}
                )

In [29]:
model.fit(dl_train,
    val_data=dl_test,
    epochs=10,
    ckpt_path='checkpoint.pt',
    patience=3,
    monitor='val_acc',
    mode='max')


Epoch 1 / 10



100%|██████████| 400/400 [00:07<00:00, 56.23it/s, train_acc=0.499, train_loss=0.703]
100%|██████████| 100/100 [00:01<00:00, 74.09it/s, val_acc=0.512, val_loss=0.694]
<<<<<< reach best val_acc : 0.5123999714851379 >>>>>>



Epoch 2 / 10



100%|██████████| 400/400 [00:07<00:00, 56.31it/s, train_acc=0.5, train_loss=0.693] 
100%|██████████| 100/100 [00:01<00:00, 77.35it/s, val_acc=0.51, val_loss=0.694]



Epoch 3 / 10



100%|██████████| 400/400 [00:06<00:00, 58.61it/s, train_acc=0.668, train_loss=0.595]
100%|██████████| 100/100 [00:01<00:00, 79.53it/s, val_acc=0.76, val_loss=0.496]
<<<<<< reach best val_acc : 0.7598000168800354 >>>>>>



Epoch 4 / 10



100%|██████████| 400/400 [00:06<00:00, 59.66it/s, train_acc=0.799, train_loss=0.448]
100%|██████████| 100/100 [00:01<00:00, 79.44it/s, val_acc=0.796, val_loss=0.443]
<<<<<< reach best val_acc : 0.7961999773979187 >>>>>>



Epoch 5 / 10



100%|██████████| 400/400 [00:06<00:00, 58.41it/s, train_acc=0.844, train_loss=0.366]
100%|██████████| 100/100 [00:01<00:00, 72.81it/s, val_acc=0.805, val_loss=0.436]
<<<<<< reach best val_acc : 0.8051999807357788 >>>>>>



Epoch 6 / 10



100%|██████████| 400/400 [00:06<00:00, 57.52it/s, train_acc=0.87, train_loss=0.314]
100%|██████████| 100/100 [00:01<00:00, 77.35it/s, val_acc=0.807, val_loss=0.435]
<<<<<< reach best val_acc : 0.807200014591217 >>>>>>



Epoch 7 / 10



100%|██████████| 400/400 [00:06<00:00, 57.73it/s, train_acc=0.892, train_loss=0.273]
100%|██████████| 100/100 [00:01<00:00, 78.88it/s, val_acc=0.803, val_loss=0.459]



Epoch 8 / 10



100%|██████████| 400/400 [00:07<00:00, 55.53it/s, train_acc=0.908, train_loss=0.237]
100%|██████████| 100/100 [00:01<00:00, 71.46it/s, val_acc=0.8, val_loss=0.51] 



Epoch 9 / 10



100%|██████████| 400/400 [00:07<00:00, 55.83it/s, train_acc=0.921, train_loss=0.209]
100%|██████████| 100/100 [00:01<00:00, 72.36it/s, val_acc=0.799, val_loss=0.507]
<<<<<< val_acc without improvement in 3 epoch, early stopping >>>>>>


Unnamed: 0,train_loss,train_acc,val_loss,val_acc,epoch
0,0.703175,0.4993,0.693996,0.5124,1
1,0.693388,0.49955,0.693694,0.5098,2
2,0.594592,0.66825,0.496322,0.7598,3
3,0.448057,0.7985,0.44323,0.7962,4
4,0.366164,0.84435,0.436347,0.8052,5
5,0.314394,0.86955,0.43486,0.8072,6
6,0.272615,0.8918,0.458876,0.8034,7
7,0.237346,0.90785,0.510217,0.7996,8
8,0.20926,0.92125,0.507311,0.7986,9


In [30]:

import pandas as pd 

history = model.history
dfhistory = pd.DataFrame(history) 
dfhistory 


Unnamed: 0,train_loss,train_acc,val_loss,val_acc,epoch
0,0.703175,0.4993,0.693996,0.5124,1
1,0.693388,0.49955,0.693694,0.5098,2
2,0.594592,0.66825,0.496322,0.7598,3
3,0.448057,0.7985,0.44323,0.7962,4
4,0.366164,0.84435,0.436347,0.8052,5
5,0.314394,0.86955,0.43486,0.8072,6
6,0.272615,0.8918,0.458876,0.8034,7
7,0.237346,0.90785,0.510217,0.7996,8
8,0.20926,0.92125,0.507311,0.7986,9


In [32]:
def predict(net,dl):
    net.eval()
    with torch.no_grad():
        result = nn.Sigmoid()(torch.cat([net.forward(t[0]) for t in dl]))
    return(result.data)

y_pred_probs = predict(net,dl_test)
y_pred_probs

tensor([[0.9512],
        [0.9304],
        [0.9289],
        ...,
        [0.9141],
        [0.4350],
        [0.9366]])

In [33]:
# 模型权重已经被保存在了ckpt_path='checkpoint.pt'
net_clone = Net()
net_clone.load_state_dict(torch.load('checkpoint.pt'))

<All keys matched successfully>