Так как очень много работ связано непосредственно с изображениями, зачастую бывает, что изображений либо мало, либо мы должны их как-то преобразовывать, либо мы хотим как-то расширить наш датасет. Тогда мы можем воспользоваться методами transforms, реализованными в библиотеке torchvision.

Какие у него особенности?

Он работает не со всеми форматами изображений: нам нужно преобразовывать наши изображения в формат изображений библиотеки PIL.
На прошлом занятии мы затрагивали тему transforms: брали оттуда метод ToTensor, благодаря которому могли перевести изображения в тензорный формат и, таким образом, обучать на этих тензорах нашу модель.

https://pytorch.org/docs/stable/torchvision/transforms.html

<h1><center>Часть 2. Практика</center></h1>

In [1]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import matplotlib.pyplot as plt
import torchvision
from torchvision import datasets
from torch.utils.data import DataLoader, Dataset
from torch import optim
import time
import math

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
# (To force CPU execution, assign device = torch.device("cpu") instead.)
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(device)

cuda


Для реализации LSTM далеко ходить не надо. Все обернуто в объект [LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html):

[лстм от пайторча](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html)

```torch.nn.LSTM(input_dim, hidden_dim,layer_num,batch_first=True)```

In [2]:
class LSTM_net(nn.Module):
    """Sequence classifier: a stacked LSTM followed by a linear read-out.

    Args:
        input_dim: size of the feature vector at every time step.
        hidden_dim: size of the LSTM hidden state ``h``.
        output_dim: number of values produced for each sequence.
        layer_num: number of stacked LSTM layers.

    The LSTM is built with ``batch_first=True``, so inputs are laid out as
    ``(batch, seq_len, input_dim)`` instead of the default
    ``(seq_len, batch, input_dim)``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, layer_num):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layer_num,
            batch_first=True,
        )
        # NOTE: this dropout layer is registered but forward() never applies it.
        self.dr = nn.Dropout2d(p=0.1)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, inputs):
        """Return the linear read-out of the LSTM output at the last time step."""
        sequence_out, _final_state = self.lstm(inputs)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [3]:
# Load the raw table for inspection; the same CSV is read again by Skeleton_Dataset.
skeletons = pd.read_csv("skeletons_classes_1_30.csv")

In [4]:
# Maps the integer class index to a human-readable action name.
LABELS = dict(enumerate((
    "brushing teeth",
    "throw",
    "kicking something",
    "salute",
    "jump up",
    "taking a selfie",
    "flick hair",
    "hugging other person",
)))

In [5]:
skeletons.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3366,3367,3368,3369,3370,3371,3372,3373,3374,labels
0,0.166643,0.169332,3.764293,0.182184,0.430023,3.690497,0.196409,0.685314,3.606696,0.157366,...,-0.021367,0.162484,3.878167,0.128432,0.729048,3.451543,0.130322,0.707719,3.4028,0
1,0.156408,0.167995,3.780114,0.17407,0.428376,3.704427,0.189149,0.683577,3.618258,0.143667,...,0.020917,0.141005,3.853444,0.171469,0.732059,3.466049,0.222879,0.696383,3.413667,0
2,0.237631,0.235039,3.746806,0.246957,0.468998,3.670609,0.255474,0.698446,3.586321,0.227848,...,-0.285577,0.347618,3.335,0.394457,0.714967,3.476677,0.360391,0.677145,3.4515,1
3,0.286811,0.168139,3.734849,0.288774,0.415689,3.650531,0.290175,0.658193,3.55711,0.260253,...,-0.051341,0.1836,3.872833,0.485147,0.069447,3.672266,0.498374,0.109445,3.626555,2
4,0.284738,0.17868,3.764987,0.299845,0.428857,3.687624,0.314031,0.673921,3.600093,0.296217,...,-0.023884,1.05979,3.3765,0.302396,1.042522,3.275737,0.326857,0.969929,3.2595,3


In [6]:
skeletons.shape

(949, 3376)

In [7]:
# Number of rows every sample is reshaped into by Skeleton_Dataset.__getitem__
# (each row then holds 25 * 3 = 75 features).
chonk_len = 45

## Dataloader

In [8]:
class Skeleton_Dataset(Dataset):
    """Dataset of skeleton sequences stored one sample per CSV row.

    Every row holds the flattened features of one sample followed by its
    class label in the last column.

    Args:
        file_path: path of the CSV file, loaded with ``pandas.read_csv``.
        transform: optional callable applied to the reshaped sample array.
    """

    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path)
        self.transform = transform
        self.labels = self.data.iloc[:, -1]

    def __len__(self):
        """Return the number of rows (samples) in the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(item, label)`` for the row at position ``idx``.

        ``item`` is the row without its last column, reshaped to
        ``(chonk_len, 25 * 3)`` where ``chonk_len`` is the module-level
        constant; the reshape raises ``ValueError`` if the row length does
        not match.
        """
        item = np.asarray(self.data.iloc[idx, :-1]).reshape(chonk_len, 25 * 3)
        # Positional lookup, consistent with the .iloc access used for the features.
        label = self.labels.iloc[idx]
        if self.transform is not None:
            # Must go through self: a bare `transform` is not defined in this scope.
            item = self.transform(item)
        return (item, label)

In [9]:
# Build the dataset straight from the CSV; with no transform, items are returned as NumPy arrays.
dataset = Skeleton_Dataset(file_path = "skeletons_classes_1_30.csv", transform=None)

# Посмотрим, что получилось

In [10]:
# Fetch sample 41 in two equivalent ways: indexing with dataset[41] is what
# invokes __getitem__, so both lines produce the same (item, label) pair.
skel, lab = dataset.__getitem__(41)
skel, lab = dataset[41]

In [11]:
LABELS[lab]

'throw'

In [12]:
skel.shape

(45, 75)

In [13]:
# Split the 75 features of every row into 25 points x 3 coordinates for plotting.
skels_for_draw = skel.reshape(-1,25,3)

#### Немного визуализации данных:

In [14]:
from mpl_toolkits.mplot3d import Axes3D  
import mpl_toolkits.mplot3d as plt3d
import matplotlib.animation as animation

import matplotlib.pyplot as plt
import numpy as np
from time import sleep

%matplotlib notebook


# Pairs of 1-based joint numbers that are joined by a line when drawing a skeleton.
# NOTE(review): (21, 21) joins a joint with itself, i.e. a zero-length line — confirm intent.
bone_pairs = (
    (1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),
    (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), (12, 11),
    (13, 1), (14, 13), (15, 14), (16, 15), (17, 1), (18, 17),
    (19, 18), (20, 19), (22, 23), (21, 21), (23, 8), (24, 25),(25, 12)
)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Draw a single slice (index 10 along the first axis) of the selected sample.
for_draw = skels_for_draw[10]

# Every joint row is (x, y, z). The plot places z on the x axis, x on the y axis
# and y on the z axis; the axis labels below name the plot axes, not the data columns.
for first, second in bone_pairs:
    start = for_draw[first - 1]
    end = for_draw[second - 1]
    xs = start[0], end[0]
    ys = start[1], end[1]
    zs = start[2], end[2]

    line = plt3d.art3d.Line3D(zs, xs, ys)
    ax.add_line(line)

for x, y, z in for_draw:
    ax.scatter(z, x, y, color='black', marker='s')

# Axis setup and the final show() belong after the loops: they only need to run
# once, not once per joint.
ax.set_ylim((-0.3, 0.3))
ax.set_zlim((-1.0, 0.25))
ax.set_xlim((3.0, 4.0))

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')

plt.show()

<IPython.core.display.Javascript object>

#### Оборачиваем в Даталоадер:

In [15]:
dataset

<__main__.Skeleton_Dataset at 0x178f4cf5d30>

In [16]:
# Sanity check: both shares are truncated by int(), so their sum can fall short of len(dataset).
sum((int(0.75*len(dataset)),int(0.25*len(dataset))))

948

In [17]:
# Derive the train size from the test size so the two parts always add up to
# len(dataset); a hard-coded "+1" only compensates the int() truncation for
# some dataset lengths and makes random_split fail for the others.
n_test = int(0.25 * len(dataset))
n_train = len(dataset) - n_test
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [n_train, n_test])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

Инициализируем наши переменные и модель:

In [18]:
# Model hyper-parameters.
n_hidden = 128                # size of the LSTM hidden state
n_joints = 25*3               # features per row of a sample
n_categories = len(LABELS)    # one output per known label
n_layer = 2                   # stacked LSTM layers
rnn = LSTM_net(
    input_dim=n_joints,
    hidden_dim=n_hidden,
    output_dim=n_categories,
    layer_num=n_layer,
)
# Move the parameters to the selected device; as the last expression of the
# cell this also displays the module summary.
rnn.to(device)

LSTM_net(
  (lstm): LSTM(75, 128, num_layers=2, batch_first=True)
  (dr): Dropout2d(p=0.1, inplace=False)
  (fc): Linear(in_features=128, out_features=8, bias=True)
)

In [19]:
def categoryFromOutput(output):
    """Return ``(label_name, class_index)`` for the first row of ``output``.

    The class index is the position of the largest score in that row; the
    name is looked up in the module-level ``LABELS`` mapping.
    """
    _best_scores, best_indices = output.topk(1)
    category_i = best_indices[0].item()
    return LABELS[category_i], category_i

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: start timestamp in seconds, as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [20]:
from  tqdm.notebook import tqdm

In [21]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.0007
epochs = 600
optimizer = optim.SGD(rnn.parameters(), lr=learning_rate, momentum=0.9)

all_losses = []  # mean training loss, one entry per epoch
start = time.time()
counter = 0      # iterations performed so far, across all epochs

# Make sure the model is in training mode even if this cell is re-run after
# an evaluation cell has switched it to eval mode.
rnn.train()
for epoch in tqdm(range(epochs)):
    current_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()

        output = rnn(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        current_loss += loss.item()

        # Every 500 iterations report the loss and how the first sample of
        # the current batch was classified.
        if counter % 500 == 0:
            category = LABELS[int(labels[0])]
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss.item(), guess, correct))

        counter += 1

    # Record the mean loss of every epoch, normalised by the real number of
    # batches (max() guards against an empty loader).
    all_losses.append(current_loss / max(len(train_loader), 1))

  0%|          | 0/600 [00:00<?, ?it/s]

epoch : 0 iter : 0 (0m 0s) 2.0589  / brushing teeth ✗ (salute)
epoch : 11 iter : 5 (0m 13s) 1.9815  / brushing teeth ✗ (taking a selfie)
epoch : 22 iter : 10 (0m 29s) 1.9451  / brushing teeth ✗ (jump up)
epoch : 33 iter : 15 (0m 43s) 1.9284  / brushing teeth ✗ (kicking something)
epoch : 44 iter : 20 (0m 57s) 1.8755  / brushing teeth ✗ (throw)
epoch : 55 iter : 25 (1m 11s) 1.6481  / kicking something ✗ (salute)
epoch : 66 iter : 30 (1m 25s) 2.3928  / salute ✗ (jump up)
epoch : 77 iter : 35 (1m 39s) 1.5646  / flick hair ✓
epoch : 88 iter : 40 (1m 54s) 1.4962  / brushing teeth ✗ (jump up)
epoch : 100 iter : 0 (2m 12s) 1.2451  / throw ✗ (jump up)
epoch : 111 iter : 5 (2m 26s) 1.1803  / kicking something ✓
epoch : 122 iter : 10 (2m 39s) 1.4107  / brushing teeth ✗ (flick hair)
epoch : 133 iter : 15 (2m 52s) 1.0984  / kicking something ✓
epoch : 144 iter : 20 (3m 6s) 1.4630  / kicking something ✗ (salute)
epoch : 155 iter : 25 (3m 19s) 0.9810  / jump up ✗ (brushing teeth)
epoch : 166 iter : 

In [22]:
%matplotlib notebook
# Loss curve: with no explicit x values, each point is plotted against its index.
plt.plot(all_losses)
plt.show()

<IPython.core.display.Javascript object>

In [23]:
right = 0    # correctly classified samples
counter = 0  # evaluated samples

rnn.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output = rnn(inputs.float())
        # Score every sample of the batch, so the result is also correct for
        # a test loader with batch_size > 1.
        predicted = output.argmax(dim=1)
        right += int((predicted == labels).sum().item())
        counter += labels.size(0)


print('Accuracy of the network:  ',  (100 * right / counter))

Accuracy of the network:   62.869198312236286


In [24]:
# Append this run (chonk_len, layer count, accuracy in percent) as one CSV row.
with open('summury.csv', 'a') as summary_file:
    summary_file.write(f'{chonk_len},{n_layer},{100 * right / counter} \n')

In [25]:
# The rows appended to summury.csv carry no header line, so the column names are supplied here.
names = ['chonk_len', 'leyers', 'accuracy']
df = pd.read_csv('summury.csv',names=names)

In [31]:
df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,chonk_len,leyers,accuracy
2,45,1,83.544304
0,45,2,80.168776
5,60,2,78.021978
4,30,2,70.833333
3,45,2,70.042194
1,45,3,67.932489
6,45,2,62.869198


В зависимости от попытки мы получаем кардинально разную точность: при chonk_len = 45 и layers = 2 мы получаем 3 совершенно разных результата. Поэтому в данной ситуации мы не можем судить, какие параметры дают лучшие результаты. Нужно проводить слишком много экспериментов, так как дисперсия точности слишком большая.