In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
df = pd.read_csv("./ifeng_data/train.csv", header=None, sep="\t", names=["label", "sentence1"])
df

Unnamed: 0,label,sentence1
0,政务_文旅,安徽三祖寺发现佛牙舍利 初步判断为宋代皇家所赐
1,科技_数码,员工曝苹果2022年春季发布会要来了！iPhone SE 3即将发布
2,汽车_行业,俄乌战争使大众向中国和美国转移产能
3,科技_车科技,再也不怕事故扯皮！所有新车全面上线黑匣子：比行车记录仪好用
4,体育_中国足球,终身禁赛！体育总局和公安部联手严查严打赌球假球行为
...,...,...
3915,政务_发展治理,住建部约谈五城背后：东莞房价涨幅超深圳、房企南通激战抢地
3916,体育_NBA,欧文回应与哈登不和传言：别把我名字放到这些傻瓜文章里！
3917,政务_发展治理,统计局：中国3月CPI同比增长0.4%，环比下降0.5%
3918,娱乐_电影,她这组写真照真不错


In [3]:
labels = sorted(df["label"].value_counts().index.tolist())

labels1 = sorted(set([x.split("_")[0] for x in labels]))
labels2 = sorted(set([x.split("_")[1] for x in labels]))
print(labels1, len(labels1))
print(labels2, len(labels2))

['体育', '娱乐', '政务', '时尚', '汽车', '科技'] 6
['5G', 'CBA', 'NBA', '中国足球', '区块链', '反腐', '发展治理', '国际足球', '地方', '导购', '情感', '手机', '政策', '数码', '文旅', '新车', '时装', '明星', '电影', '电视', '美容', '行业', '试驾', '车科技', '音乐'] 25


In [4]:
df["label"].value_counts()

科技_数码      247
科技_手机      234
体育_NBA     172
娱乐_音乐      166
时尚_时装      165
时尚_美容      163
科技_车科技     162
政务_地方      161
体育_国际足球    161
体育_中国足球    160
娱乐_电影      159
汽车_新车      159
政务_发展治理    158
政务_反腐      157
汽车_试驾      157
体育_CBA     157
娱乐_电视      155
娱乐_明星      155
时尚_情感      153
汽车_行业      152
政务_政策      149
科技_区块链     146
政务_文旅      106
汽车_导购       84
科技_5G       82
Name: label, dtype: int64

In [5]:
labels1_dict = dict(zip(labels1, range(len(labels1))))
labels2_dict = dict(zip(labels2, range(len(labels2))))
print(labels1_dict)
print(labels2_dict)

{'体育': 0, '娱乐': 1, '政务': 2, '时尚': 3, '汽车': 4, '科技': 5}
{'5G': 0, 'CBA': 1, 'NBA': 2, '中国足球': 3, '区块链': 4, '反腐': 5, '发展治理': 6, '国际足球': 7, '地方': 8, '导购': 9, '情感': 10, '手机': 11, '政策': 12, '数码': 13, '文旅': 14, '新车': 15, '时装': 16, '明星': 17, '电影': 18, '电视': 19, '美容': 20, '行业': 21, '试驾': 22, '车科技': 23, '音乐': 24}


In [6]:
# 构建一个层级的字典, 每个层级上, 当前 label_id => 父级 label_id
parent_dict = defaultdict(dict)
# 第一层不变
for label, label_id in labels1_dict.items():
    parent_dict[0][label_id] = label_id
for label, label_id in labels2_dict.items():
    for l in labels:
        label0, label1 = l.split("_")
        if label1 == label:
            break
    parent_dict[1][label_id] = labels1_dict[label0]
print(parent_dict)

parent_index_list = []
# 这个实际上直接迭代 parent_dict 也可以得到
parent_index_list.append([parent_dict[0][x] for x in range(len(labels1))])
parent_index_list.append([parent_dict[1][x] for x in range(len(labels2))])
print(parent_index_list)

defaultdict(<class 'dict'>, {0: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}, 1: {0: 5, 1: 0, 2: 0, 3: 0, 4: 5, 5: 2, 6: 2, 7: 0, 8: 2, 9: 4, 10: 3, 11: 5, 12: 2, 13: 5, 14: 2, 15: 4, 16: 3, 17: 1, 18: 1, 19: 1, 20: 3, 21: 4, 22: 4, 23: 5, 24: 1}})
[[0, 1, 2, 3, 4, 5], [5, 0, 0, 0, 5, 2, 2, 0, 2, 4, 3, 5, 2, 5, 2, 4, 3, 1, 1, 1, 3, 4, 4, 5, 1]]


In [7]:
labels1_weight_dict = dict((x, 0) for x in labels1_dict.keys())
labels2_weight_dict = dict((x, 0) for x in labels2_dict.keys())
for label in df["label"]:
    label1, label2 = label.strip().split("_")
    labels1_weight_dict[label1] += 1
    labels2_weight_dict[label2] += 1

assert len(labels1_weight_dict) == len(labels1_dict)
assert len(labels2_weight_dict) == len(labels2_dict)

print(labels1_weight_dict)
print(labels2_weight_dict)

# 用当前最大的类别数, 除以当前类别数
labels1_weight_list = []
max_labels = max(labels1_weight_dict.values())
for label, label_id in labels1_dict.items():
    labels1_weight_list.append(max_labels / labels1_weight_dict[label])

labels2_weight_list = []
max_labels = max(labels2_weight_dict.values())
for label, label_id in labels2_dict.items():
    labels2_weight_list.append(max_labels / labels2_weight_dict[label])

print(labels1_weight_list)
print(labels2_weight_list)

{'体育': 650, '娱乐': 635, '政务': 731, '时尚': 481, '汽车': 552, '科技': 871}
{'5G': 82, 'CBA': 157, 'NBA': 172, '中国足球': 160, '区块链': 146, '反腐': 157, '发展治理': 158, '国际足球': 161, '地方': 161, '导购': 84, '情感': 153, '手机': 234, '政策': 149, '数码': 247, '文旅': 106, '新车': 159, '时装': 165, '明星': 155, '电影': 159, '电视': 155, '美容': 163, '行业': 152, '试驾': 157, '车科技': 162, '音乐': 166}
[1.34, 1.3716535433070867, 1.1915184678522572, 1.8108108108108107, 1.5778985507246377, 1.0]
[3.0121951219512195, 1.5732484076433122, 1.436046511627907, 1.54375, 1.6917808219178083, 1.5732484076433122, 1.5632911392405062, 1.5341614906832297, 1.5341614906832297, 2.9404761904761907, 1.6143790849673203, 1.0555555555555556, 1.657718120805369, 1.0, 2.330188679245283, 1.5534591194968554, 1.496969696969697, 1.5935483870967742, 1.5534591194968554, 1.5935483870967742, 1.5153374233128833, 1.625, 1.5732484076433122, 1.5246913580246915, 1.4879518072289157]


In [8]:
import os

import numpy as np
from transformers import AutoTokenizer, BertModel
import torch
import torch.nn  as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [10]:
df_train = pd.read_csv("./ifeng_data/train.csv", header=None, sep="\t", names=["label", "sentence1"])
df_test = pd.read_csv("./ifeng_data/test.csv", header=None, sep="\t", names=["label", "sentence1"])
print(len(df_train))
print(len(df_test))

3920
980


In [11]:
class MyDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        label = self.df.iloc[idx, 0]
        label1, label2 = label.strip().split("_")
        label1 = labels1_dict[label1]
        label2 = labels2_dict[label2]

        sentence1 = self.df.iloc[idx, 1]
        sentence1 = tokenizer(sentence1, padding="max_length", truncation=True, max_length=32)
        input_ids = torch.tensor(sentence1["input_ids"])
        attention_mask = torch.tensor(sentence1["attention_mask"])
        token_type_ids = torch.tensor(sentence1["token_type_ids"])
        return input_ids, attention_mask, token_type_ids, label1, label2

dataset_train = MyDataset(df_train)
dataset_test = MyDataset(df_test)

In [12]:
next(iter(dataset_train))

(tensor([ 101, 2128, 2551,  676, 4862, 2191, 1355, 4385,  867, 4280, 5650, 1164,
         1159, 3635, 1161, 3171,  711, 2129,  807, 4640, 2157, 2792, 6606,  102,
            0,    0,    0,    0,    0,    0,    0,    0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 2,
 14)

In [13]:
dataloader_train = DataLoader(dataset_train, batch_size=16, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)

In [14]:
input_ids, attention_mask, token_type_ids, label1, label2 = next(iter(dataloader_train))
print(input_ids.shape)
print(attention_mask.shape)
print(token_type_ids.shape)
print(label1.shape)
print(label2.shape)

torch.Size([16, 32])
torch.Size([16, 32])
torch.Size([16, 32])
torch.Size([16])
torch.Size([16])


In [15]:
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained("bert-base-chinese")
        self.fc1 = nn.Linear(self.bert_model.config.hidden_size, len(labels1))
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(self.bert_model.config.hidden_size, len(labels2))
        self.dropout2 = nn.Dropout(0.5)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert_model(input_ids, attention_mask, token_type_ids)
        pooled_output = output[1]

        pooled_output1 = self.dropout1(pooled_output)
        pooled_output2 = self.dropout2(pooled_output)

        logits1 = self.fc1(pooled_output1)
        logits2 = self.fc2(pooled_output2)
        return logits1, logits2

model = MyModel()
model

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MyModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [17]:

from typing import Optional, Sequence

import torch
from torch import Tensor
from torch import nn
from torch.nn import functional as F


class FocalLoss(nn.Module):
    """ Focal Loss, as described in https://arxiv.org/abs/1708.02002.
    It is essentially an enhancement to cross entropy loss and is
    useful for classification tasks when there is a large class imbalance.
    x is expected to contain raw, unnormalized scores for each class.
    y is expected to contain class labels.
    Shape:
        - x: (batch_size, C) or (batch_size, C, d1, d2, ..., dK), K > 0.
        - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0.
    """

    def __init__(self,
                 alpha: Optional[Tensor] = None,
                 gamma: float = 0.,
                 reduction: str = 'mean',
                 ignore_index: int = -100):
        """Constructor.
        Args:
            alpha (Tensor, optional): Weights for each class. Defaults to None.
            gamma (float, optional): A constant, as described in the paper.
                Defaults to 0.
            reduction (str, optional): 'mean', 'sum' or 'none'.
                Defaults to 'mean'.
            ignore_index (int, optional): class label to ignore.
                Defaults to -100.
        """
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(
                'Reduction must be one of: "mean", "sum", "none".')

        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
        self.reduction = reduction

        self.nll_loss = nn.NLLLoss(
            weight=alpha, reduction='none', ignore_index=ignore_index)

    def __repr__(self):
        arg_keys = ['alpha', 'gamma', 'ignore_index', 'reduction']
        arg_vals = [self.__dict__[k] for k in arg_keys]
        arg_strs = [f'{k}={v}' for k, v in zip(arg_keys, arg_vals)]
        arg_str = ', '.join(arg_strs)
        return f'{type(self).__name__}({arg_str})'

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        if x.ndim > 2:
            # (N, C, d1, d2, ..., dK) --> (N * d1 * ... * dK, C)
            c = x.shape[1]
            x = x.permute(0, *range(2, x.ndim), 1).reshape(-1, c)
            # (N, d1, d2, ..., dK) --> (N * d1 * ... * dK,)
            y = y.view(-1)

        unignored_mask = y != self.ignore_index
        y = y[unignored_mask]
        if len(y) == 0:
            return 0.
        x = x[unignored_mask]

        # compute weighted cross entropy term: -alpha * log(pt)
        # (alpha is already part of self.nll_loss)
        log_p = F.log_softmax(x, dim=-1)
        ce = self.nll_loss(log_p, y)

        # get true class column from each row
        all_rows = torch.arange(len(x))
        log_pt = log_p[all_rows, y]

        # compute focal term: (1 - pt)^gamma
        pt = log_pt.exp()
        focal_term = (1 - pt)**self.gamma

        # the full loss: -alpha * ((1 - pt)^gamma) * log(pt)
        loss = focal_term * ce

        if self.reduction == 'mean':
            loss = loss.mean()
        elif self.reduction == 'sum':
            loss = loss.sum()

        return loss

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_func1 = nn.CrossEntropyLoss(weight=torch.tensor(labels1_weight_list).to(device))
loss_func2 = nn.CrossEntropyLoss(weight=torch.tensor(labels2_weight_list).to(device))
# loss_func1 = FocalLoss(alpha=torch.tensor(labels1_weight_list), gamma=2, reduction='mean').to(device)
# loss_func2 = FocalLoss(alpha=torch.tensor(labels2_weight_list), gamma=2, reduction='mean').to(device)

def train(dataloader: DataLoader, model: MyModel, loss_func1: nn.CrossEntropyLoss, loss_func2: nn.CrossEntropyLoss, optimizer: torch.optim.Optimizer):
    size = len(dataloader.dataset)
    model.train()

    for i, (input_ids, attention_mask, token_type_ids, label1, label2) in enumerate(dataloader):
        input_ids, attention_mask, token_type_ids, label1, label2 = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), label1.to(device), label2.to(device)
        logits1, logits2 = model(input_ids, attention_mask, token_type_ids)
        loss1 = loss_func1(logits1, label1)
        loss2 = loss_func2(logits2, label2)
        loss = loss1 + loss2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            loss, current = loss.item(), i * len(input_ids)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [19]:
def test(dataloader: DataLoader, model: MyModel, loss_func1: nn.CrossEntropyLoss, loss_func2: nn.CrossEntropyLoss):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct1, correct2 = 0, 0, 0

    with torch.no_grad():
        for i, (input_ids, attention_mask, token_type_ids, label1, label2) in enumerate(dataloader):
            input_ids, attention_mask, token_type_ids, label1, label2 = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), label1.to(device), label2.to(device)
            logits1, logits2 = model(input_ids, attention_mask, token_type_ids)
            loss1 = loss_func1(logits1, label1)
            loss2 = loss_func2(logits2, label2)
            loss = loss1 + loss2

            test_loss += loss.item()
            correct1 += (logits1.argmax(1) == label1).type(torch.float).sum().item()
            correct2 += (logits2.argmax(1) == label2).type(torch.float).sum().item()

    test_loss /= num_batches
    correct1 /= size
    correct2 /= size
    print(f"Test Error: \n Accuracy1: {(100*correct1):>0.1f}%, Accuracy2: {(100*correct2):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [20]:
torch.cuda.is_available()

True

In [21]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dataloader_train, model, loss_func1, loss_func2, optimizer)
    test(dataloader_test, model, loss_func1, loss_func2)
print("Done!")

Epoch 1
-------------------------------
loss: 5.744324  [    0/ 3920]
loss: 3.136272  [ 1600/ 3920]
loss: 2.001301  [ 3200/ 3920]
Test Error: 
 Accuracy1: 87.2%, Accuracy2: 65.9%, Avg loss: 1.516254 

Epoch 2
-------------------------------
loss: 1.637948  [    0/ 3920]
loss: 1.225458  [ 1600/ 3920]
loss: 1.828401  [ 3200/ 3920]
Test Error: 
 Accuracy1: 87.1%, Accuracy2: 68.9%, Avg loss: 1.367925 

Epoch 3
-------------------------------
loss: 1.220660  [    0/ 3920]
loss: 0.655014  [ 1600/ 3920]
loss: 1.196165  [ 3200/ 3920]
Test Error: 
 Accuracy1: 87.4%, Accuracy2: 70.9%, Avg loss: 1.400931 

Epoch 4
-------------------------------
loss: 0.667601  [    0/ 3920]
loss: 1.176276  [ 1600/ 3920]
loss: 0.840475  [ 3200/ 3920]
Test Error: 
 Accuracy1: 89.6%, Accuracy2: 73.4%, Avg loss: 1.272415 

Epoch 5
-------------------------------
loss: 0.348201  [    0/ 3920]
loss: 0.395658  [ 1600/ 3920]
loss: 0.095936  [ 3200/ 3920]
Test Error: 
 Accuracy1: 89.2%, Accuracy2: 74.2%, Avg loss: 1.4196

In [22]:
model = model.to("cpu")
model.eval()

MyModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [23]:
text = "乌克兰传奇舍甫琴科在伦敦发声"
logits0, logits1 = model(**tokenizer(text, return_tensors="pt"))
print(logits0.shape)
print(logits1.shape)
print(logits0)
print(logits1)

torch.Size([1, 6])
torch.Size([1, 25])
tensor([[ 6.0957, -0.6527, -1.9994, -1.2568, -1.0093, -1.6730]],
       grad_fn=<AddmmBackward0>)
tensor([[-0.8132,  1.5211,  2.2828,  2.9053, -0.0458, -0.4422, -0.6023,  6.7243,
         -0.3133, -0.9152, -0.7480, -1.3934, -0.9407, -1.0997, -0.4075,  0.5272,
         -0.1822, -0.8293,  0.6314, -0.7575, -0.3568, -0.3850, -0.2988, -1.0247,
         -0.3149]], grad_fn=<AddmmBackward0>)


In [24]:
with torch.no_grad():
    probs0 = F.softmax(logits0, dim=-1)
    probs1 = F.softmax(logits1, dim=-1)
    indexs0 = torch.argmax(probs0, dim=-1)
    indexs1 = torch.argmax(probs1, dim=-1)

    print(probs0, indexs0, labels[indexs0], probs0[:, indexs0])
    print(probs1, indexs1, labels2[indexs1], probs1[:, indexs1])

tensor([[9.9665e-01, 1.1688e-03, 3.0400e-04, 6.3884e-04, 8.1823e-04, 4.2134e-04]]) tensor([0]) 体育_CBA tensor([[0.9966]])
tensor([[5.0426e-04, 5.2049e-03, 1.1149e-02, 2.0778e-02, 1.0862e-03, 7.3075e-04,
         6.2268e-04, 9.4654e-01, 8.3132e-04, 4.5537e-04, 5.3822e-04, 2.8229e-04,
         4.4388e-04, 3.7866e-04, 7.5654e-04, 1.9266e-03, 9.4772e-04, 4.9621e-04,
         2.1382e-03, 5.3316e-04, 7.9595e-04, 7.7381e-04, 8.4345e-04, 4.0815e-04,
         8.3002e-04]]) tensor([7]) 国际足球 tensor([[0.9465]])


In [28]:
with torch.no_grad():
    probs0 = F.softmax(logits0, dim=-1)
    probs1 = F.softmax(logits1, dim=-1)

    print("probs0", torch.sort(probs0, dim=-1, descending=True)[0][:, :5])
    print("probs1", torch.sort(probs1, dim=-1, descending=True)[0][:, :5])
    print(torch.sort(probs1, dim=-1, descending=True)[0][0].detach().numpy().tolist())
    # 第二层需要乘以使用第一层的概率
    probs1 = probs1 * probs0[:, parent_index_list[1]]
    print("probs1", torch.sort(probs1, dim=-1, descending=True)[0][:, :5])
    print(torch.sort(probs1, dim=-1, descending=True)[0][0].detach().numpy().tolist())
    
    indexs0 = torch.argmax(probs0, dim=-1)
    indexs1 = torch.argmax(probs1, dim=-1)

    print(probs0, indexs0, labels[indexs0], probs0[:, indexs0])
    print(probs1, indexs1, labels2[indexs1], probs1[:, indexs1])



probs0 tensor([[9.9665e-01, 1.1688e-03, 8.1823e-04, 6.3884e-04, 4.2134e-04]])
probs1 tensor([[0.9465, 0.0208, 0.0111, 0.0052, 0.0021]])
[0.9465446472167969, 0.02077772095799446, 0.011149394325911999, 0.005204895976930857, 0.002138159703463316, 0.0019265830051153898, 0.0010862412163987756, 0.0009477174608036876, 0.000843447691295296, 0.000831316108815372, 0.000830021221190691, 0.0007959500071592629, 0.0007738128188066185, 0.0007565444684587419, 0.0007307531195692718, 0.0006226751138456166, 0.000538216030690819, 0.000533160986378789, 0.0005042566917836666, 0.0004962147795595229, 0.0004553726757876575, 0.00044388219248503447, 0.00040815354441292584, 0.00037866077036596835, 0.00028228809242136776]
probs1 tensor([[9.4337e-01, 2.0708e-02, 1.1112e-02, 5.1875e-03, 2.4992e-06]])
[0.9433725476264954, 0.02070808969438076, 0.011112029664218426, 0.005187452770769596, 2.4991861664602766e-06, 1.5763912415422965e-06, 9.701695944386302e-07, 6.901356073285569e-07, 6.331581516860751e-07, 6.23184803316689

In [None]:
import matplotlib.pyplot as plt
plt.plot(list(range(len(probs1))), probs1, ".-")
plt.show()