In [None]:
import numpy as np
import matplotlib.pyplot as plt

# numpy基础
NumPy 是一个数据处理库，核心是多维数组 `ndarray`。
底层使用 C 实现，效率高。

## NDArray
`ndarray` 是 numpy 中最重要的类，是多维数据的包装器。

多维数组的创建、修改以及多维数组的属性

### 创建
从 Python 列表创建，或使用内置函数（如 `np.zeros`、`np.ones`）创建

In [None]:
# Build one array from a Python list and two from NumPy's constructors.
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.zeros(shape=(2, 2, 3), dtype=np.float32)
arr3 = np.ones(shape=(2, 2))

# Display the results; the first line also shows the container type.
print(arr1, type(arr1))
print(arr2, arr3, sep='\n')

### 修改
主要包括维度变换和拼接

In [None]:
# Two int32 arrays that agree on every axis except the first.
arr1 = np.zeros(shape=(1, 1, 4), dtype=np.int32)
arr2 = np.ones(shape=(5, 1, 4), dtype=np.int32)

# Joining along axis 0 stacks them into a single (6, 1, 4) array.
arr = np.concatenate((arr1, arr2), axis=0)
print('concatenate:', arr)

# Two ways of inserting a length-1 axis:
# expand_dims puts it at position 3, giving shape (1, 1, 4, 1) ...
print('expand1:', np.expand_dims(arr1, axis=3))
# ... while indexing with np.newaxis puts it at position 2, giving (5, 1, 1, 4).
print('expand2', arr2[:, :, np.newaxis, :])

### 获取数组属性

In [None]:
arr = np.array([
    [1, 1, 4, 5, 1, 4],
    [1, 9, 1, 9, 8, 10],
])
# size is the total element count (2 * 6); shape is the length along each axis.
print('size:', arr.size)
print('shape:', arr.shape)

### 切片和筛选
切片语法与 Python list 一致；此外还可以用整数数组或布尔掩码进行筛选。



In [None]:
arr = np.array([
    [1, 1, 4, 5, 1],
    [4, 1, 9, 1, 9],
    [8, 1, 0, 1, 1],
    [4, 5, 1, 4, 1],
    [9, 1, 9, 8, 10],
])

print('single choose')
print(arr[1, :])            # the whole of row 1
print(arr[1, 0])            # the scalar at row 1, column 0
print(arr[[1, 0], [2, 3]])  # integer-array indexing: elements (1, 2) and (0, 3)

print('slice')
print(arr[0:2, 0:3])        # first two rows, first three columns

print('filter')
print(arr > 5)              # element-wise comparison yields a boolean mask
print(arr[arr > 7])         # a mask used as an index keeps the True entries, flattened



### np.where
按条件选择、替换数据

In [None]:
# Boolean mask marking the entries of `arr` that are larger than 5.
condition = np.greater(arr, 5)
# Where the mask is True emit -1, elsewhere keep the original entry.
print(np.where(condition, -1, arr))
# Both branches may be scalars; the result still has the mask's shape.
print(np.where(condition, -1, 2))
# Element-wise choice between the array and its negation.
rra = np.negative(arr)
print(np.where(condition, arr, rra))

# 数据保存和加载

In [None]:
# 加载txt格式的数据

In [None]:
import torch


class TestModel(torch.nn.Module):
    """Minimal module wrapping a single linear layer with 5 inputs and 2 outputs."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(in_features=5, out_features=2)

    def forward(self, data):
        """Apply the linear layer to ``data`` and return its output."""
        output = self.layer(data)
        return output


def _show_weight(header, with_grad_shape=False):
    """Print ``header``, then the linear layer's weight and its current gradient.

    The gradient's shape is printed as well when ``with_grad_shape`` is true;
    only request it at points where a gradient tensor is known to exist.
    """
    # `.weight` is the public accessor for the Parameter that the original code
    # reached through the private `_parameters["weight"]` dict.
    weight = test_model.layer.weight
    print(header)
    print(weight)
    print(weight.grad)
    if with_grad_shape:
        print(weight.grad.shape)


# One forward pass through a fresh model on a random input.
test_model = TestModel()
data = torch.rand((1, 5))
y = test_model(data)

optimizer = torch.optim.SGD(test_model.parameters(), lr = 1)
loss_fn = torch.nn.MSELoss()

_show_weight("-------------------\nbefore loss backward:\n")

# MSELoss is documented as loss(input, target): prediction first, target second.
# The value is the same either way for MSE, but the conventional order keeps the
# example correct if the loss is swapped for an asymmetric one.
loss = loss_fn(y, torch.randn((1, 2)))
loss.backward()

_show_weight("-------------------\nafter loss backward:\n", with_grad_shape=True)

# Inspect the autograd nodes feeding the loss.
print(loss.grad_fn.next_functions)

optimizer.step()

_show_weight("-------------------\nafter optimize:\n", with_grad_shape=True)

optimizer.zero_grad()
_show_weight("-------------------\nafter zerograd:\n")

# Recompute a loss with gradient tracking disabled and show its grad_fn.
with torch.no_grad():
    loss = loss_fn(y, torch.randn((1, 2)))
    print(loss.grad_fn)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
import json

# Load the tokenizer definition from a JSON file on disk.
tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
# Enable padding with a fixed target length of 256.
# NOTE(review): padding presumably does not shorten longer inputs; MyDataSet
# slices the ids to 256 itself -- confirm against the tokenizers docs.
tokenizer.enable_padding(length=256)
# Quick manual check (uncomment to inspect the token ids of a sample sentence):
# print(tokenizer.encode("Have a nice Day!").ids)

class MyDataSet(Dataset):
    """In-memory dataset built from a JSON-lines file.

    Every non-blank line must be a JSON object with a ``"content"`` entry (the
    text to tokenize) and a ``"label"`` entry. The text is encoded with the
    module-level ``tokenizer`` and the resulting ids are cut to at most
    ``max_len`` entries. A label ``v`` is stored as the float32 pair
    ``[1 - v, v]`` (a one-hot vector if ``v`` is 0 or 1 -- presumably the case
    here; confirm against the data files).

    Args:
        file: Path of the UTF-8 encoded ``.jsonl`` file to read.
        max_len: Maximum number of token ids kept per sample. Defaults to 256,
            the value that was previously hard-coded.
    """

    def __init__(self, file: str, max_len: int = 256):
        self.data = []
        self.label = []
        with open(file, "r", encoding='utf-8') as fin:
            for line in fin:
                # Skip blank lines instead of letting json.loads raise on them.
                if not line.strip():
                    continue
                record = json.loads(line)
                token_ids = tokenizer.encode(record["content"]).ids[:max_len]
                label = record["label"]
                self.data.append(torch.tensor(token_ids))
                self.label.append(torch.tensor([1 - label, label], dtype=torch.float32))

    def __getitem__(self, index):
        """Return the ``(token_ids, label)`` tensor pair stored at ``index``."""
        return self.data[index], self.label[index]

    def __len__(self):
        """Return the number of samples loaded from the file."""
        return len(self.data)

# Read and tokenize both splits up front; each file is processed once, here.
train_set = MyDataSet("dataset/train.jsonl")
test_set = MyDataSet("dataset/test.jsonl")




In [None]:
# Serve both splits in shuffled mini-batches of 32 samples.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32, shuffle=True)

# Peek at the first training batch only (nothing is printed if the loader is empty).
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    data, label = first_batch
    print(data)
    print(label)

In [None]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    """Embedding layer followed by a three-layer MLP over fixed-length token ids.

    Each sample's token embeddings are flattened into one vector of
    ``seq_len * embed_dim`` values, passed through two ReLU-activated linear
    layers and projected to ``num_classes`` outputs.

    All arguments default to the values that were previously hard-coded, so
    ``MyModel()`` builds exactly the same network as before.

    Args:
        vocab_size: Number of rows in the embedding table.
        seq_len: Number of token ids per sample.
        embed_dim: Size of each token embedding.
        hidden_dim1: Output width of the first linear layer.
        hidden_dim2: Output width of the second linear layer.
        num_classes: Number of output values per sample.
    """

    def __init__(self, vocab_size: int = 50000, seq_len: int = 256, embed_dim: int = 64,
                 hidden_dim1: int = 64 * 128, hidden_dim2: int = 16 * 16, num_classes: int = 2):
        super().__init__()
        # Attribute names are unchanged so state_dict keys and pickles still match.
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.layer1 = nn.Linear(seq_len * embed_dim, hidden_dim1)
        self.ac1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.ac2 = nn.ReLU()
        self.out = nn.Linear(hidden_dim2, num_classes)

    def forward(self, data):
        """Map a batch of token ids to ``num_classes`` raw scores per sample."""
        # Flatten every sample's embeddings to the width the first layer expects.
        hidden = self.emb(data).view(-1, self.layer1.in_features)
        hidden = self.ac1(self.layer1(hidden))
        hidden = self.ac2(self.layer2(hidden))
        return self.out(hidden)
    
    
# Instantiate the network; later cells train and save this object.
model = MyModel()
# Optional smoke test on a batch named `data` (uncomment to run):
# print(model(data))

In [None]:
from torch.optim import SGD
import torch.nn as nn

# Classification loss, and a plain SGD optimizer over all of the model's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = SGD(params=model.parameters(), lr=1e-3)

In [None]:
import os

import wandb
import numpy as np

# Run settings shared by the logged config and the loop, so the two cannot drift
# apart (the config used to report 1 epoch while the loop ran 20).
NUM_EPOCHS = 20
LOG_EVERY = 20  # log metrics once every LOG_EVERY batches
CHECKPOINT_PATH = "result/model.pt"

wandb.finish()  # close any run left open by an earlier execution of this cell
wandb.init(
    # set the wandb project where this run will be logged
    project="summer_guide",
    # track hyperparameters and run metadata
    config={
        # Read the step size from the optimizer that is actually used.
        "learning_rate": optimizer.param_groups[0]["lr"],
        "architecture": "MLP",
        # NOTE(review): "amazon-plarity" looks like a typo for "amazon-polarity";
        # left unchanged so existing runs stay comparable.
        "dataset": "amazon-plarity",
        "epochs": NUM_EPOCHS,
    },
)

# Use the GPU when one is available, otherwise fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create the checkpoint directory now, so the first save cannot fail after a
# whole epoch of training.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

for epoch in range(NUM_EPOCHS):
    for batch, (X, y) in enumerate(train_loader):
        inputs, targets = X.to(device), y.to(device)
        pred = model(inputs)
        loss = loss_fn(pred, targets)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % LOG_EVERY == 0:
            # Targets are two-column vectors, so argmax recovers the class index.
            correct = pred.argmax(dim=1) == targets.argmax(dim=1)
            wandb.log(
                {
                    # Log plain Python numbers rather than live tensors.
                    "loss": loss.item(),
                    "acc": correct.double().mean().item(),
                }
            )
    # Overwrite the checkpoint at the end of every epoch.
    torch.save(model, CHECKPOINT_PATH)

In [None]:
# Write the whole model object to result/model.pt (the same path the training loop saves to).
torch.save(model, "result/model.pt")

In [None]:
# Close the active wandb run once training and saving are done.
wandb.finish()

In [None]:
# Report whether PyTorch can see a usable CUDA device.
print(torch.cuda.is_available())