/
classifier.py
146 lines (133 loc) · 6.39 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from tqdm import *
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim
class Sprites(data.Dataset):
    """Map-style dataset of serialized sprite samples, stored one per file.

    Sample ``idx`` (0-based) is read from ``<path>/<idx + 1>.sprite`` with
    ``torch.load``, i.e. the file names are 1-based. Every file must hold a
    mapping with the keys ``body``, ``shirt``, ``pant``, ``hair``, ``action``
    and ``sprite``.

    Args:
        path: Directory that contains the ``N.sprite`` files.
        size: Number of samples to expose; this is what ``len()`` reports.
    """

    def __init__(self, path, size):
        self.path = path
        self.length = size

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Load sample ``idx`` and return its fields as a tuple.

        Negative indices count from the end, as for built-in sequences.

        Returns:
            ``(body, shirt, pant, hair, action, sprite)`` in that order.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        # Raising IndexError (rather than failing on a missing file) lets
        # plain ``for sample in dataset`` iteration terminate cleanly.
        if not -self.length <= idx < self.length:
            raise IndexError(
                'sample index {} out of range for dataset of size {}'.format(idx, self.length))
        if idx < 0:
            idx += self.length
        # NOTE(review): torch.load unpickles its input; only point this
        # dataset at trusted files.
        item = torch.load(self.path + '/%d.sprite' % (idx + 1))
        return (item['body'], item['shirt'], item['pant'],
                item['hair'], item['action'], item['sprite'])
class SpriteClassifier(nn.Module):
    """Classify five attributes of a sprite clip from its frames.

    Every frame is encoded independently by a strided convolution stack
    followed by a fully connected layer. The per-frame codes are fed, in
    order, through a single-layer unidirectional ``nn.LSTM`` and the final
    hidden state is passed to five independent classification heads (body,
    shirt, pants, hairstyle, action).

    Args:
        n_bodies, n_shirts, n_pants, n_hairstyles, n_actions: Number of
            output classes of the respective head.
        num_frames: Nominal clip length, stored as ``self.num_frames``. The
            forward pass reads the real clip length from its input, so clips
            of other lengths are accepted as well.
        in_size: Side length assumed for the (square) input frames; it is
            only used to size the first fully connected layer.
        channels: Output channels of the first convolution; every further
            convolution doubles the channel count.
        code_dim: Size of the per-frame code fed to the LSTM.
        hidden_dim: LSTM hidden size.
        nonlinearity: Activation module reused after every hidden layer;
            defaults to ``nn.LeakyReLU(0.2)``.
    """

    def __init__(self, n_bodies=7, n_shirts=4, n_pants=5, n_hairstyles=6, n_actions=3,
                 num_frames=8, in_size=64, channels=64, code_dim=1024, hidden_dim=512, nonlinearity=None):
        super(SpriteClassifier, self).__init__()
        nl = nn.LeakyReLU(0.2) if nonlinearity is None else nonlinearity
        self.num_frames = num_frames

        # Each convolution (kernel 5, stride 4, padding 1) is tracked as
        # dividing the spatial size by 4; keep stacking until it is <= 4.
        encoding_conv = [nn.Sequential(nn.Conv2d(3, channels, 5, 4, 1, bias=False), nl)]
        size = in_size // 4
        while size > 4:
            encoding_conv.append(nn.Sequential(
                nn.Conv2d(channels, channels * 2, 5, 4, 1, bias=False),
                nn.BatchNorm2d(channels * 2), nl))
            size = size // 4
            channels *= 2
        self.encoding_conv = nn.Sequential(*encoding_conv)
        self.final_size = size
        self.final_channels = channels
        self.code_dim = code_dim
        self.hidden_dim = hidden_dim
        self.encoding_fc = nn.Sequential(
            nn.Linear(size * size * channels, code_dim),
            nn.BatchNorm1d(code_dim), nl)
        # The last hidden state of an LSTM run over the per-frame codes is
        # used for classification.
        self.classifier_lstm = nn.LSTM(code_dim, hidden_dim, batch_first=True, bidirectional=False)

        # The heads are built in this fixed order so that parameter names
        # and random initialisation order stay stable.
        self.body = self._make_head(hidden_dim, n_bodies, nl)
        self.shirt = self._make_head(hidden_dim, n_shirts, nl)
        self.pants = self._make_head(hidden_dim, n_pants, nl)
        self.hairstyles = self._make_head(hidden_dim, n_hairstyles, nl)
        self.action = self._make_head(hidden_dim, n_actions, nl)

    @staticmethod
    def _make_head(hidden_dim, n_classes, nl):
        """Build one two-layer classification head ending in ``n_classes`` logits."""
        return nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2), nl,
            nn.Linear(hidden_dim // 2, n_classes))

    def forward(self, x):
        """Return the logits of all five heads for a batch of clips.

        Args:
            x: 5-D tensor indexed as (batch, frame, channel, height, width);
                the first convolution expects 3 channels.

        Returns:
            Tuple ``(body, shirt, pants, hairstyles, action)`` of logit
            tensors, each with one row per clip.
        """
        batch, frames = x.size(0), x.size(1)
        # Fold the frames into the batch axis so the 2-D encoder sees single
        # images. reshape (unlike view) also accepts non-contiguous input.
        x = x.reshape(batch * frames, x.size(2), x.size(3), x.size(4))
        x = self.encoding_conv(x)
        x = x.reshape(batch * frames, self.final_channels * (self.final_size ** 2))
        x = self.encoding_fc(x)
        # Regroup using the clip length of this input rather than the
        # constructor's num_frames, so frames of different clips never mix.
        x = x.reshape(batch, frames, self.code_dim)
        # Classifier output depends on the final hidden state of the LSTM;
        # this could be changed to a bi-LSTM if required.
        _, (hidden, _) = self.classifier_lstm(x)
        hidden = hidden[-1]
        return (self.body(hidden), self.shirt(hidden), self.pants(hidden),
                self.hairstyles(hidden), self.action(hidden))
def save_model(model, optim, epoch, path):
    """Write a training checkpoint to ``path`` using ``torch.save``.

    The checkpoint is a dict with three entries: ``'epoch'`` (stored as
    ``epoch + 1``), ``'state_dict'`` (the model's state dict) and
    ``'optimizer'`` (the optimizer's state dict).

    Args:
        model: Module whose ``state_dict()`` is saved.
        optim: Optimizer whose ``state_dict()`` is saved.
        epoch: Zero-based index of the epoch that has just finished.
        path: Destination handed to ``torch.save``.
    """
    checkpoint = dict(
        epoch=epoch + 1,
        state_dict=model.state_dict(),
        optimizer=optim.state_dict(),
    )
    torch.save(checkpoint, path)
def check_accuracy(model, test, device):
    """Print and return the per-attribute accuracy of ``model`` on ``test``.

    The model is switched to evaluation mode for the duration of the pass
    (so layers such as batch normalisation neither use batch statistics nor
    update their running statistics from the evaluation data) and is put
    back into whatever mode it was in before returning.

    Args:
        model: Callable module mapping an image batch to five logit tensors
            ordered (body, shirt, pant, hair, action).
        test: Iterable of batches, each unpacking to
            ``(body, shirt, pant, hair, action, image)``.
        device: Device the tensors of every batch are moved to.

    Returns:
        Dict with the keys ``'body'``, ``'shirt'``, ``'pant'``, ``'hair'``
        and ``'action'`` mapping to the fraction of correct predictions.
        All values are NaN when ``test`` yields no samples.
    """
    names = ('body', 'shirt', 'pant', 'hair', 'action')
    correct = [0] * len(names)
    total = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for item in test:
                body, shirt, pant, hair, action, image = item
                image = image.to(device)
                labels = [t.to(device) for t in (body, shirt, pant, hair, action)]
                logits = model(image)
                total += labels[0].size(0)
                for k, (scores, target) in enumerate(zip(logits, labels)):
                    # The predicted class is the index of the largest logit.
                    correct[k] += (scores.argmax(dim=1) == target).sum().item()
    finally:
        # Restore the caller's mode even if evaluation fails half-way.
        model.train(was_training)

    # Accuracy is undefined without samples; report NaN instead of dividing by zero.
    accuracy = [c / total if total else float('nan') for c in correct]
    print('Accuracy, Body : {} Shirt : {} Pant : {} Hair : {} Action {}'.format(*accuracy))
    return dict(zip(names, accuracy))
def train_classifier(model, optim, dataset, device, epochs, path, test, start=0):
    """Train ``model`` and, after every epoch, checkpoint and evaluate it.

    The loss of a batch is the sum of the cross-entropy losses of the five
    heads. After each epoch the average batch loss is printed, a checkpoint
    is written with ``save_model`` and ``check_accuracy`` is run on ``test``.

    Args:
        model: Module mapping an image batch to five logit tensors ordered
            (body, shirt, pant, hair, action).
        optim: Optimizer over the parameters of ``model``.
        dataset: Iterable of training batches, each unpacking to
            ``(body, shirt, pant, hair, action, image)``.
        device: Device the tensors of every batch are moved to.
        epochs: Exclusive upper bound of the epoch index.
        path: Checkpoint destination, overwritten after every epoch.
        test: Iterable of evaluation batches passed to ``check_accuracy``.
        start: Index of the first epoch to run (for resuming).
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    for epoch in range(start, epochs):
        # Re-enter training mode every epoch; the evaluation pass at the end
        # of the previous epoch may have left the model in another mode.
        model.train()
        running_loss = 0.0
        num_batches = 0
        # Wrapping the loader itself (not the enumerate object) lets tqdm
        # pick up its length and show a total.
        for num_batches, item in enumerate(tqdm(dataset), 1):
            body, shirt, pant, hair, action, image = item
            image = image.to(device)
            body = body.to(device)
            shirt = shirt.to(device)
            pant = pant.to(device)
            hair = hair.to(device)
            action = action.to(device)

            # Gradients accumulate across backward() calls unless they are
            # cleared, so reset them before every batch.
            optim.zero_grad()
            pred_body, pred_shirt, pred_pant, pred_hair, pred_action = model(image)
            loss = (criterion(pred_body, body)
                    + criterion(pred_shirt, shirt)
                    + criterion(pred_pant, pant)
                    + criterion(pred_hair, hair)
                    + criterion(pred_action, action))
            loss.backward()
            optim.step()
            running_loss += loss.item()

        # An empty loader has no defined average; report NaN instead of failing.
        avg_loss = running_loss / num_batches if num_batches else float('nan')
        print('Epoch {} Avg Loss {}'.format(epoch + 1, avg_loss))
        save_model(model, optim, epoch, path)
        check_accuracy(model, test, device)
# Script entry point. The guard is required because the DataLoaders below use
# worker processes: with the "spawn" start method every worker re-imports this
# module, and unguarded top-level code would run again inside each worker. It
# also keeps a plain import of this file from starting a training run.
if __name__ == '__main__':
    # Prefer the first GPU, but fall back to the CPU when CUDA is unavailable.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = SpriteClassifier()
    model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=0.0003)

    # The sizes are the number of N.sprite files exposed from each directory.
    sprites_train = Sprites('./dataset/lpc-dataset/train', 6759)
    sprites_test = Sprites('./dataset/lpc-dataset/test', 801)
    loader = data.DataLoader(sprites_train, batch_size=32, shuffle=True, num_workers=4)
    loader_test = data.DataLoader(sprites_test, batch_size=64, shuffle=True, num_workers=4)

    train_classifier(model, optim, loader, device, 20, './checkpoint_classifier.pth', loader_test)