In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

In [2]:
batch_size = 128
epoch = 20
# tensorboard_writer = SummaryWriter()
T_0 =3
T_mult=2
eta_min = 1e-6
epsilon = 0.01
base_matrix = [[[ 2, 2 ],[2,   3 ]]]
inv_base_matrix = [[[ 1.5, -1. ],[-1.,   1. ]]]

In [3]:


class BaseModel(nn.Module):
    def __init__(self):
        super(BaseModel, self).__init__()
        self.fc1 = nn.Linear(4,8)
        self.fc3 = nn.Linear(8, 4)
        
    def forward(self, x):
        x = x.view(-1, 4)   # reshape Variable
        x = F.relu(self.fc1(x))
#         x = F.dropout(x, 0.1)
        x = self.fc3(x)
        return x
    
model = BaseModel()
model = model.to(torch.double)
model = model.to('cuda') 
model.train()
model

BaseModel(
  (fc1): Linear(in_features=4, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=4, bias=True)
)

In [4]:
class CustomDataset(Dataset):

    def __init__(self, root_dir):
        self.dataset = np.load(root_dir)
        print('number of data points', self.dataset.shape[0])

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        x = self.dataset[idx, :,:,0]
        y = self.dataset[idx, :,:,1]
        return x,y

In [5]:
train_set = CustomDataset('train_set.npy')
val_set = CustomDataset('val_set.npy')
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=batch_size)

number of data points 1000000
number of data points 10000


In [6]:
optimizer = optim.Adam(model.parameters(), lr=5e-5, weight_decay=1e-7)
scheduler = CosineAnnealingWarmRestarts(optimizer,T_0,T_mult,eta_min)

In [7]:

train_accu = []
i = 1
for epoch in range(epoch):
    for data, target in train_loader:
        target = target - torch.tensor(inv_base_matrix)
#         print(target)
        target = target/epsilon
        data, target = Variable(data), Variable(target)
        data = data.to('cuda')
        target = target.to('cuda')
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, target.view(-1,4))
        loss.backward()
        
        mse_loss = loss.item()
        optimizer.step()
        
#         if i % 10 == 0:
#             tensorboard_writer.add_scalar("Loss/step", loss, i)
        if i % 100 == 0:
            print('\rTrain Step: %d, Loss: %.4f, lr: %.8f'%(i, mse_loss, scheduler.get_lr()[0]), end="")
        i += 1
    scheduler.step()
    print('\n')
torch.save(model.state_dict(), 'small_3.pth')



Train Step: 7800, Loss: 0.5427, lr: 0.00005000

Train Step: 15600, Loss: 0.0551, lr: 0.00003775

Train Step: 23400, Loss: 0.0237, lr: 0.00001325

Train Step: 31200, Loss: 0.0101, lr: 0.00005000

Train Step: 39000, Loss: 0.0021, lr: 0.00004672

Train Step: 46800, Loss: 0.0002, lr: 0.00003775

Train Step: 54600, Loss: 0.0002, lr: 0.00002550

Train Step: 62500, Loss: 0.0001, lr: 0.00001325

Train Step: 70300, Loss: 0.0001, lr: 0.00000428

Train Step: 78100, Loss: 0.0002, lr: 0.00005000

Train Step: 85900, Loss: 0.0001, lr: 0.00004917

Train Step: 93700, Loss: 0.0001, lr: 0.00004672

Train Step: 101500, Loss: 0.0001, lr: 0.00004282

Train Step: 109300, Loss: 0.0001, lr: 0.00003775

Train Step: 117100, Loss: 0.0001, lr: 0.00003184

Train Step: 125000, Loss: 0.0001, lr: 0.00002550

Train Step: 132800, Loss: 0.0001, lr: 0.00001916

Train Step: 140600, Loss: 0.0001, lr: 0.00001325

Train Step: 148400, Loss: 0.0001, lr: 0.00000818

Train Step: 156200, Loss: 0.0001, lr: 0.00000428



In [8]:
# analyse a11
# target: 1.5- 2.25a+ 1.5b+ 1.5c- d
from tqdm import tqdm
datapoint=1
total_error = 0
total_number = 0 
for data, target in tqdm(val_loader):
    target = target - torch.tensor(inv_base_matrix)
    target = target/epsilon
    
    data, target = Variable(data), Variable(target)
#     data = data*epsilon
    data = data.to('cuda')
#     target = target.to('cuda')
    output = model(data)
    output = output.detach().to('cpu')
#     output = output/epsilon
    total_error += torch.sum(torch.abs(output[:,:] - target.view(-1,4)[:,:]))*epsilon
    total_number += output.shape[0]*output.shape[1]

print(total_error.numpy()/total_number)

100%|██████████| 79/79 [00:00<00:00, 421.37it/s]

6.788916144246224e-05





In [8]:
# analyse a11
# target: 1.5- 2.25a+ 1.5b+ 1.5c- d
from tqdm import tqdm
datapoint=1
total_error = 0
total_number = 0 
for data, target in tqdm(val_loader):
    target = target - torch.tensor(inv_base_matrix)
    target = target/epsilon
    
    data, target = Variable(data), Variable(target)
#     data = data*epsilon
    data = data.to('cuda')
#     target = target.to('cuda')
    output = model(data)
    output = output.detach().to('cpu')
#     output = output/epsilon
    total_error += torch.sum(torch.abs(output[:,:] - target.view(-1,4)[:,:]))*epsilon
    total_number += output.shape[0]*output.shape[1]

print(total_error.numpy()/total_number)

100%|██████████| 79/79 [00:01<00:00, 63.17it/s]

7.093920595790846e-05





In [7]:
model.load_state_dict(torch.load('small_1.pth'))
list(model.parameters())

[Parameter containing:
 tensor([[ 2.9422e-17, -1.0965e-16, -1.0821e-16, -2.9642e-16],
         [ 9.5628e-02,  2.9369e-01,  5.3154e-02, -4.4647e-01],
         [ 2.5575e-18, -5.1115e-18,  3.6952e-16,  5.6182e-16],
         [ 1.5360e+00, -1.2078e+00, -7.5969e-01,  6.3958e-01],
         [-3.6153e-01, -4.7281e-02,  4.6443e-01, -3.2432e-01],
         [ 3.5525e-01, -1.8960e-01, -6.1029e-01,  1.3732e-01],
         [-1.2802e+00,  1.0202e+00,  6.6826e-01, -5.6571e-01],
         [-6.5787e-01,  4.8745e-01,  2.9334e-01, -2.2843e-01]], device='cuda:0',
        dtype=torch.float64, requires_grad=True),
 Parameter containing:
 tensor([-6.6458e-15,  7.9882e-01, -7.6495e-15, -2.4201e-01,  1.0407e+00,
          1.2244e+00,  1.9913e-01,  9.9282e-02], device='cuda:0',
        dtype=torch.float64, requires_grad=True),
 Parameter containing:
 tensor([[ 1.6707e-17, -7.5081e-02,  6.4202e-17, -1.1650e+00,  5.8385e-01,
          -5.0147e-01,  1.1485e+00,  6.6387e-01],
         [ 5.9550e-19, -8.2578e-01, -8.3030e

In [8]:
W1 = model.fc1.weight.data.to('cpu').numpy()
b1 = model.fc1.bias.data.to('cpu').numpy()
W2 = model.fc3.weight.data.to('cpu').numpy()
b2 = model.fc3.bias.data.to('cpu').numpy()

In [107]:
# analyse a11
# target: 1.5- 2.25a+ 1.5b+ 1.5c- d
datapoint=1

for data, target in tqdm(val_loader):
    target = target - torch.tensor(inv_base_matrix)
    target = target/epsilon
    break
    
input_temp = data.view(-1,4)[datapoint,:].to('cpu')
print(input_temp)
target_temp = target[datapoint,:].to('cpu')
print(target_temp)
# a1_matrix = W1*W2[[0],:].T
# a1_bias = W2[0,:]*b1
a1_matrix = W1
a1_bias = b1
# print(a1_matrix)
# print(a1_bias)
sums = 0
linear_temp=np.zeros((8,4),dtype=np.double)
bias_temp=np.zeros((8,),dtype=np.double)
sgn_w2_temp = np.zeros((8,),dtype=np.double)

linear_current = [0,0,0,0]
bias_current = 0
for i in range(8):
#     print('W2', W2[0,i])
    sgn_w2 = int(W2[0,i]>0)*2-1
    abs_w2 = np.abs(W2[0,i])
    print('###')
    print(sgn_w2,abs_w2)
    print(a1_matrix[i,0]*abs_w2,'*a+', \
          a1_matrix[i,1]*abs_w2,'*b+', \
          a1_matrix[i,2]*abs_w2,'*c+', \
          a1_matrix[i,3]*abs_w2,'*d+', \
          b1[i]*abs_w2
         )
    temp = a1_bias[i]*abs_w2
    for j in range(4):
        temp = temp+ a1_matrix[i,j]*abs_w2*input_temp[j]
        linear_temp[i,j] += (sgn_w2*a1_matrix[i,j]*abs_w2)
    bias_temp[i] += sgn_w2*a1_bias[i]*abs_w2
    sgn_w2_temp[i]=sgn_w2
    if temp>0:
        print('Used', sgn_w2)
        for j in range(4):
            linear_current[j] += W2[0,i]*a1_matrix[i,j]
        bias_current += W2[0,i]*a1_bias[i]
        sums =sums + sgn_w2*temp
#     print('temp',temp)
    
print(linear_temp, bias_temp, sgn_w2_temp)
print(linear_current, bias_current, b2[0])
print(sums+b2[0])
print(-2.25*input_temp[0]+1.5*input_temp[1]+1.5*input_temp[2]-1*input_temp[3])

  0%|          | 0/79 [00:00<?, ?it/s]

tensor([-0.3969, -0.5167, -0.0804, -0.5720], dtype=torch.float64)
tensor([[ 0.5712, -0.3128],
        [-0.5321,  0.3730]], dtype=torch.float64)
###
1 1.670690992152575e-17
4.915572246777594e-34 *a+ -1.831944911715188e-33 *b+ -1.8078282680332953e-33 *c+ -4.952219084900313e-33 *d+ -1.1103024606218899e-31
###
-1 0.07508124925236073
0.007179844582235177 *a+ 0.022050303803059144 *b+ 0.003990901072310622 *c+ -0.033521817800796105 *d+ 0.059976562767422956
Used -1
###
1 6.420168606299503e-17
1.6419448795017974e-34 *a+ -3.2816858823977e-34 *b+ 2.3723589388224325e-32 *c+ 3.606988647259131e-32 *d+ -4.911083563715734e-31
###
-1 1.1650442957350466
1.7895587297714999 *a+ -1.4071821670301847 *b+ -0.8850722040309135 *c+ 0.7451388139614773 *d+ -0.2819515753510457
###
1 0.5838537319640935
-0.21107804778365882 *a+ -0.027605063555745754 *b+ 0.2711591315699631 *c+ -0.18935280719842532 *d+ 0.6076357358274985
Used 1
###
-1 0.5014662180337854
0.17814358519528556 *a+ -0.09507804949622449 *b+ -0.306039538100790




In [65]:
# analyse a11
# target: 1.5- 2.25a+ 1.5b+ 1.5c- d
from tqdm import tqdm
statistics = {}
for data, target in tqdm(train_loader):
    for datapoint in range(data.shape[0]):
        input_temp = data.view(-1,4)[datapoint,:].to('cpu')
        target_temp = target.view(-1,4)[datapoint,:].to('cpu')
    
        a1_matrix = W1
        a1_bias = b1
        sums = 0
        linear_temp=[0,0,0,0]
        bias_temp=0
        temp_arr = ''
        for i in range(8):
            sgn_w2 = int(W2[0,i]>0)*2-1
            abs_w2 = np.abs(W2[0,i])
            temp = a1_bias[i]*abs_w2
            for j in range(4):
                temp = temp+ a1_matrix[i,j]*abs_w2*input_temp[j]
            if temp>0:
                temp_arr+=chr(ord('0')+i)
        if temp_arr not in statistics:
            statistics[temp_arr]=0
        statistics[temp_arr]+=1
print(statistics)
#     print('temp',temp)
# print(sums+b2[0])
# print(linear_temp, bias_temp+b2[0])
# print(-2.25*input_temp[0]+1.5*input_temp[1]+1.5*input_temp[2]-1*input_temp[3])

100%|██████████| 7813/7813 [11:13<00:00, 11.59it/s]

{'1345': 417215, '14567': 556714, '1456': 9173, '13457': 8792, '13456': 2602, '135': 622, '1457': 4401, '145': 189, '345': 246, '134567': 21, '1467': 13, '4567': 3, '3457': 7, '457': 2}





In [66]:
{'1345': 417215, '14567': 556714}

[-0.23846846  0.12372661  0.05511967 -0.56574118]


In [85]:
#-2.25 +1.5 +1.5 -1
#linear_temp, bias_temp, sgn_w2_temp

np.sum(np.expand_dims(sgn_w2_temp[[1,4,5,6,7]], axis=1)*linear_temp[[1,4,5,6,7],:],axis=0)

array([-1.93277397,  1.3947055 ,  0.9313421 , -0.95536045])

In [106]:
0.215623079807177 -0.2384684574680495

-0.022845377660872523

In [21]:
# analyse a11
# target: 1.5- 2.25a+ 1.5b+ 1.5c- d
from tqdm import tqdm
datapoint=0

for data, target in tqdm(val_loader):
    target = target - torch.tensor(inv_base_matrix)
    target = target/epsilon
    break
    
input_temp = data.view(-1,4)[datapoint,:].to('cpu')
print(input_temp)
target_temp = target[datapoint,:].to('cpu')
print(target_temp)
# a1_matrix = W1*W2[[0],:].T
# a1_bias = W2[0,:]*b1
a1_matrix = W1
a1_bias = b1
# print(a1_matrix)
# print(a1_bias)
sums = 0
linear_temp=np.zeros((8,4),dtype=np.double)
bias_temp=np.zeros((8,),dtype=np.double)
sgn_w2_temp = np.zeros((8,),dtype=np.double)

linear_current = [0,0,0,0]
bias_current = 0
matrix_ij_idx = 3

for i in range(8):
#     print('W2', W2[0,i])
    sgn_w2 = int(W2[matrix_ij_idx,i]>0)*2-1
    abs_w2 = np.abs(W2[matrix_ij_idx,i])
    print('###')
    print(sgn_w2,abs_w2)
    print(a1_matrix[i,0]*abs_w2,'*a+', \
          a1_matrix[i,1]*abs_w2,'*b+', \
          a1_matrix[i,2]*abs_w2,'*c+', \
          a1_matrix[i,3]*abs_w2,'*d+', \
          b1[i]*abs_w2
         )
    temp = a1_bias[i]*abs_w2
    for j in range(4):
        temp = temp+ a1_matrix[i,j]*abs_w2*input_temp[j]
        linear_temp[i,j] += (sgn_w2*a1_matrix[i,j]*abs_w2)
    bias_temp[i] += sgn_w2*a1_bias[i]*abs_w2
    sgn_w2_temp[i]=sgn_w2
    if temp>0:
        print('Used', sgn_w2)
        for j in range(4):
            linear_current[j] += W2[matrix_ij_idx,i]*a1_matrix[i,j]
        bias_current += W2[matrix_ij_idx,i]*a1_bias[i]
        sums =sums + sgn_w2*temp
#     print('temp',temp)
    
print(linear_temp, bias_temp, sgn_w2_temp)
print(linear_current, bias_current, b2[matrix_ij_idx])
print(sums+b2[matrix_ij_idx])
#print(-2.25*input_temp[0]+1.5*input_temp[1]+1.5*input_temp[2]-1*input_temp[3])#0
#print(1.5*input_temp[0]-1.5*input_temp[1]-input_temp[2]+input_temp[3])#1
#print(1.5*input_temp[0]-input_temp[1]-1.5*input_temp[2]+input_temp[3])#2
print(-input_temp[0]+input_temp[1]+input_temp[2]-input_temp[3])#3

  0%|          | 0/79 [00:00<?, ?it/s]

tensor([-0.2866, -0.5810,  0.1632,  0.4102], dtype=torch.float64)
tensor([[-0.3902,  0.6857],
        [ 0.3151, -0.5391]], dtype=torch.float64)
###
1 3.425689942326605e-17
1.007919865831656e-33 *a+ -3.7563351262661246e-33 *b+ -3.7068848424663126e-33 *c+ -1.0154341641288724e-32 *d+ -2.276634033557734e-31
###
1 1.066248788082498
0.10196288235292301 *a+ 0.3131422284655641 *b+ 0.05667584748630891 *c+ -0.4760522495341581 *d+ 0.8517431183007103
Used 1
###
1 7.416932018537047e-18
1.896865066362981e-35 *a+ -3.791184093211186e-35 *b+ 2.740679575852533e-33 *c+ 4.1669917456849816e-33 *d+ -5.673553945872026e-32
###
-1 0.4563711064433042
0.7010058763781349 *a+ -0.5512213440173779 *b+ -0.34670044951463486 *c+ 0.29188574737143463 *d+ -0.11044606018624072
Used -1
###
1 0.37281568452403346
-0.13478239936522488 *a+ -0.017627018724781784 *b+ 0.1731467518604734 *c+ -0.12090990014699386 *d+ 0.38800151543387823
Used 1
###
-1 0.6513827684519813
0.2314007554914667 *a+ -0.12350224376568601 *b+ -0.3975320259169




In [24]:
0.5526474637133071 -0.5657411764809002

-0.01309371276759308