In [1]:
import numpy as np
import matplotlib.pyplot as plt

# numpy基础
NumPy 是 Python 的数据处理库，核心是多维数组 `ndarray`。
底层使用 C 实现，效率高。

## ndarray
`ndarray` 是 NumPy 中最重要的类，是多维数据的容器。

多维数组的创建、修改以及多维数组的属性

### 创建
从 Python 列表创建（`np.array`），或使用内置函数创建（如 `np.zeros`、`np.ones`）。

In [2]:
# Three ways to build an ndarray: from a Python list, filled with zeros,
# and filled with ones.
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.zeros(shape=(2, 2, 3), dtype=np.float32)
arr3 = np.ones(shape=(2, 2))

# Show the first array together with its Python type, then the other two.
print(arr1, type(arr1))
print(arr2, arr3, sep="\n")

[1 2 3 4 5] <class 'numpy.ndarray'>
[[[0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]]]
[[1. 1.]
 [1. 1.]]


### 修改
主要包括维度变换和拼接

In [3]:
# Two int32 arrays that agree on every axis except the first.
arr1 = np.zeros((1, 1, 4), dtype=np.int32)
arr2 = np.ones((5, 1, 4), dtype=np.int32)

# Join them along axis 0: (1, 1, 4) + (5, 1, 4) -> (6, 1, 4).
arr = np.concatenate((arr1, arr2), axis=0)
print("concatenate:", arr)

# Two equivalent ways of inserting a length-1 axis:
# expand_dims appends it at position 3, indexing with None inserts it at position 2.
print("expand1:", np.expand_dims(arr1, axis=3))
print("expand2", arr2[:, :, None, :])

concatenate: [[[0 0 0 0]]

 [[1 1 1 1]]

 [[1 1 1 1]]

 [[1 1 1 1]]

 [[1 1 1 1]]

 [[1 1 1 1]]]
expand1: [[[[0]
   [0]
   [0]
   [0]]]]
expand2 [[[[1 1 1 1]]]


 [[[1 1 1 1]]]


 [[[1 1 1 1]]]


 [[[1 1 1 1]]]


 [[[1 1 1 1]]]]


### 获取数组属性

In [4]:
# A 2 x 6 integer array used to demonstrate the basic shape attributes.
arr = np.array(
    [
        [1, 1, 4, 5, 1, 4],
        [1, 9, 1, 9, 8, 10],
    ]
)

# `size` is the total number of elements, `shape` the length of each axis.
print(f"size: {arr.size}\nshape: {arr.shape}")

size: 12
shape: (2, 6)


### 切片和筛选
基本切片语法与 Python list 一致，但可以同时对多个维度切片（如 `arr[:2, :3]`），并且切片返回的是原数组的视图而不是拷贝。



In [5]:
# 5 x 5 sample matrix shared by the indexing examples below.
arr = np.array(
    [
        [1, 1, 4, 5, 1],
        [4, 1, 9, 1, 9],
        [8, 1, 0, 1, 1],
        [4, 5, 1, 4, 1],
        [9, 1, 9, 8, 10],
    ]
)

# Integer indexing: a whole row, a single element, and "fancy" indexing that
# pairs the row list with the column list -> elements (1, 2) and (0, 3).
print("single choose")
print(arr[1, :])
print(arr[1][0])
print(arr[[1, 0], [2, 3]])

# Slicing: first two rows, first three columns.
print("slice")
print(arr[:2, :3])

# Boolean masks: the comparison yields a bool array of the same shape;
# indexing with a mask returns the selected elements as a flat 1-D array.
print("filter")
print(arr > 5)
print(arr[arr > 7])



single choose
[4 1 9 1 9]
4
[9 5]
slice
[[1 1 4]
 [4 1 9]]
filter
[[False False False False False]
 [False False  True False  True]
 [ True False False False False]
 [False False False False False]
 [ True False  True  True  True]]
[ 9  9  8  9  9  8 10]


### np.where
按条件选择、替换数据：`np.where(condition, x, y)` 在条件为 True 的位置取 `x`，否则取 `y`（`x`、`y` 可以是标量或数组）。

In [6]:
# Mask of the elements larger than 5, and the element-wise negation of `arr`.
condition = np.greater(arr, 5)
rra = np.negative(arr)

# Replace the selected elements with -1 and keep the rest.
print(np.where(condition, -1, arr))
# Both branches may be scalars: -1 where the mask holds, 2 elsewhere.
print(np.where(condition, -1, 2))
# Choose element-wise between two arrays: original value where the mask holds,
# negated value elsewhere.
print(np.where(condition, arr, rra))

[[ 1  1  4  5  1]
 [ 4  1 -1  1 -1]
 [-1  1  0  1  1]
 [ 4  5  1  4  1]
 [-1  1 -1 -1 -1]]
[[ 2  2  2  2  2]
 [ 2  2 -1  2 -1]
 [-1  2  2  2  2]
 [ 2  2  2  2  2]
 [-1  2 -1 -1 -1]]
[[-1 -1 -4 -5 -1]
 [-4 -1  9 -1  9]
 [ 8 -1  0 -1 -1]
 [-4 -5 -1 -4 -1]
 [ 9 -1  9  8 10]]


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
import json

# Load the tokenizer definition from a JSON file on disk.
tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
# Pad encodings to a fixed length of 256 ids. This call does not configure
# truncation; MyDataSet slices the ids itself.
tokenizer.enable_padding(length=256)
# Quick sanity check (kept disabled): print(tokenizer.encode("Have a nice day!").ids)

class MyDataSet(Dataset):
    """In-memory text classification dataset read from a JSON-lines file.

    Each non-blank line of the file must be a JSON object with a ``"content"``
    field (the text) and a ``"label"`` field (presumably 0 or 1 -- verify
    against the data files). The text is encoded with the module-level
    ``tokenizer`` and cut to at most ``max_len`` ids; the label is stored as
    the two-element float32 tensor ``[1 - label, label]``.

    Args:
        file: Path of the ``.jsonl`` file to load (read as UTF-8).
        max_len: Maximum number of token ids kept per sample. Defaults to 256.

    Raises:
        OSError: If ``file`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"content"`` or ``"label"``.
    """

    def __init__(self, file: str, max_len: int = 256):
        self.data = []   # token-id tensor per sample
        self.label = []  # float32 tensor of shape (2,) per sample
        with open(file, "r", encoding='utf-8') as fin:
            for line in fin:
                # Blank lines (e.g. stray newlines in the file) are not
                # records; skip them instead of crashing in json.loads.
                if not line.strip():
                    continue
                record = json.loads(line)
                token_ids = tokenizer.encode(record["content"]).ids[:max_len]
                self.data.append(torch.tensor(token_ids))
                target = record["label"]
                self.label.append(
                    torch.tensor([1 - target, target], dtype=torch.float32)
                )

    def __getitem__(self, index):
        """Return the ``(token_ids, label)`` pair stored at ``index``."""
        return self.data[index], self.label[index]

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

# Load both splits fully into memory; each call tokenizes its whole file.
train_set = MyDataSet("dataset/train.jsonl")
test_set = MyDataSet("dataset/test.jsonl")




In [16]:
# Wrap both splits in shuffling loaders that yield mini-batches of 32 samples.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32, shuffle=True)

# Peek at the first training batch only. `data` and `label` stay defined
# after the loop and are reused by later cells.
for data, label in train_loader:
    print(data, label, sep="\n")
    break

tensor([[ 258,  931,  141,  ...,    0,    0,    0],
        [ 209, 3923,  218,  ...,    0,    0,    0],
        [  44,  883, 3670,  ...,    0,    0,    0],
        ...,
        [  44, 1265,  142,  ...,    0,    0,    0],
        [ 949,   15, 2650,  ...,    0,    0,    0],
        [ 258,  141,  379,  ...,    0,    0,    0]])
tensor([[0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.]])


In [23]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    """Embedding followed by a three-layer MLP classifier.

    Token ids are embedded, the per-token embeddings of one sequence are
    flattened into a single vector, and that vector goes through
    ``Linear -> ReLU -> Linear -> ReLU -> Linear`` to produce one logit per
    class. The default sizes reproduce the original hard-coded network.

    Args:
        vocab_size: Number of rows in the embedding table.
        seq_len: Number of token ids per input sequence.
        embedding_dim: Width of each token embedding.
        hidden_dim1: Output width of the first linear layer.
        hidden_dim2: Output width of the second linear layer.
        num_classes: Number of output logits.
    """

    def __init__(
        self,
        vocab_size: int = 50000,
        seq_len: int = 256,
        embedding_dim: int = 64,
        hidden_dim1: int = 64 * 128,
        hidden_dim2: int = 16 * 16,
        num_classes: int = 2,
    ):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.layer1 = nn.Linear(seq_len * embedding_dim, hidden_dim1)
        self.ac1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.ac2 = nn.ReLU()
        self.out = nn.Linear(hidden_dim2, num_classes)

    def forward(self, data):
        """Return the class logits for a batch of token-id sequences.

        Args:
            data: Integer tensor of token ids; after embedding it is reshaped
                to rows of ``seq_len * embedding_dim`` values.

        Returns:
            Tensor with ``num_classes`` logits per row (no softmax applied).
        """
        # Read the flattened width from layer1 so it can never drift from the
        # sizes the network was built with.
        hidden = self.emb(data).view(-1, self.layer1.in_features)
        hidden = self.ac1(self.layer1(hidden))
        hidden = self.ac2(self.layer2(hidden))
        return self.out(hidden)
    
    
# Instantiate the network and run one forward pass as a smoke test.
# NOTE: `data` is not defined in this cell; it is the batch left behind by the
# DataLoader preview loop in an earlier cell, so that cell must run first.
model = MyModel()
print(model(data))

tensor([[ 0.0640, -0.0771],
        [-0.0466, -0.0354],
        [-0.0145, -0.0097],
        [ 0.0218, -0.0655],
        [-0.0348, -0.0343],
        [-0.0475, -0.1092],
        [ 0.0170, -0.0862],
        [ 0.0439,  0.0006],
        [ 0.0257, -0.0995],
        [ 0.0675, -0.0755],
        [-0.0783, -0.0653],
        [-0.0444, -0.0672],
        [ 0.0150, -0.0273],
        [-0.0671,  0.0360],
        [ 0.0340, -0.0917],
        [ 0.0485, -0.0746],
        [-0.0308, -0.0914],
        [ 0.0712,  0.0007],
        [ 0.0393,  0.1223],
        [ 0.0370, -0.1239],
        [ 0.1803, -0.1560],
        [-0.0515,  0.0744],
        [ 0.0764, -0.0816],
        [ 0.1007, -0.0728],
        [ 0.0376, -0.0593],
        [-0.0300, -0.1615],
        [ 0.0355,  0.0056],
        [-0.0703, -0.1516],
        [-0.0520, -0.1018],
        [ 0.0698, -0.0019],
        [ 0.0889, -0.0116],
        [-0.0045, -0.0628]], grad_fn=<AddmmBackward0>)


In [24]:
from torch.optim import SGD
import torch.nn as nn

# Cross-entropy over the two output logits, optimised with plain SGD
# (learning rate 1e-3) over every parameter of `model`.
loss_fn = nn.CrossEntropyLoss()
optimizer = SGD(params=model.parameters(), lr=1e-3)

In [26]:
import wandb
import numpy as np
# Close any run left open by a previous execution of this cell before
# starting a new one.
wandb.finish()

# Single source of truth for the epoch count, so the value recorded in the
# W&B config always matches what the loop below actually runs.
NUM_EPOCHS = 20
# Metrics are logged once every LOG_EVERY batches.
LOG_EVERY = 20

wandb.init(
    # set the wandb project where this run will be logged
    project="summer_guide",
    # track hyperparameters and run metadata
    config={
        "learning_rate": 1e-3,
        "architecture": "MLP",
        # NOTE(review): "plarity" looks like a typo for "polarity"; left
        # unchanged so runs stay comparable with earlier ones -- confirm.
        "dataset": "amazon-plarity",
        "epochs": NUM_EPOCHS,
    },
)

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(NUM_EPOCHS):
    for batch, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % LOG_EVERY == 0:
            # Fraction of samples whose arg-max logit matches the arg-max of
            # the two-element target.
            accuracy = (pred.argmax(dim=1) == y.argmax(dim=1)).float().mean().item()
            wandb.log(
                {
                    # .item() logs a plain float rather than a tensor that
                    # still references the autograd graph.
                    "loss": loss.item(),
                    "acc": accuracy,
                }
            )
    # Checkpoint after every epoch; each save overwrites the previous file.
    torch.save(model, "result/model.pt")

VBox(children=(Label(value='0.001 MB of 0.026 MB uploaded\r'), FloatProgress(value=0.05086432494194833, max=1.…

0,1
acc,▃▆▄▅▄▁▇▃▃▅▄▅▅▃▃▃█▅▃▄▅▄▅▅▅▅▅▁▄▃▇▅▅▅▅▅▅▅▅▃
loss,▅▄▅▄▄▆▃▅▄▄▃▅▄▅▅▆▂▃▅▅▃▄▄▄▄▃▃▇▄█▃▄▄▄▄▁▅▃▄▆

0,1
acc,0.65625
loss,0.63184


In [None]:
# Serialize the whole `model` object (not just its state_dict) to disk.
# NOTE(review): presumably the "result" directory must already exist -- confirm.
torch.save(model, "result/model.pt")

In [None]:
# Mark the current W&B run as finished.
wandb.finish()

0,1
loss,▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
loss,0.6948


In [None]:
# Report whether PyTorch can see a CUDA device in this environment.
print(torch.cuda.is_available())

False
