In [1]:
import os

# Pin GPU selection before any CUDA-using library is imported:
# enumerate devices in PCI bus order and expose only device 13 to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": '13',
})

In [2]:
# Single seed shared by the RNG seeding calls and the train_test_split calls below.
random_seed = 42

In [3]:
import pickle

In [4]:
import random

import numpy as np
import pandas as pd

# Seed the Python and NumPy global RNGs from the shared notebook seed.
random.seed(random_seed)
np.random.seed(random_seed)

# Show up to 256 characters per column when displaying DataFrames.
pd.set_option('max_colwidth', 256)

In [5]:
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats

# Render figures inline in the notebook output area.
%matplotlib inline
# Request SVG as the inline figure format.
# NOTE(review): newer IPython releases deprecate IPython.display.set_matplotlib_formats — confirm against the installed version.
set_matplotlib_formats('svg')

In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [7]:
# Seed PyTorch's RNG from the shared notebook seed.
torch.manual_seed(random_seed)
# Ask cuDNN for deterministic kernels and turn off its benchmark-based
# algorithm selection, so repeated runs take the same code path.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [8]:
import torchvision
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF

from torchvision import models

In [9]:
# Side length (in pixels) of the square image handed to the network.
input_size = 224

# Deterministic preprocessing used for BOTH the train and test splits in this
# notebook: resize, centre-crop to input_size x input_size, convert to a tensor,
# then normalise each channel with the statistics conventionally paired with
# torchvision's pretrained models.
# A training-time augmentation pipeline (RandomResizedCrop + RandomHorizontalFlip
# followed by the same ToTensor/Normalize steps) was sketched here previously but
# is intentionally not used.
transform_test = transforms.Compose([
    transforms.Resize(size=input_size),
    transforms.CenterCrop(size=input_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

In [10]:
def _make_cifar10_loader(train):
    """Return (dataset, loader) for one CIFAR-10 split stored under ./data.

    The loader yields one image per batch in dataset order (no shuffling), so
    the batch index produced by enumerate() equals the image's dataset index.
    The data must already be on disk because download is disabled.
    """
    dataset = torchvision.datasets.CIFAR10(
        root='./data',
        train=train,
        download=False,
        transform=transform_test,
    )
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        num_workers=32,
    )
    return dataset, loader


trainset, trainloader = _make_cifar10_loader(train=True)
testset, testloader = _make_cifar10_loader(train=False)

In [11]:
# Pretrained ResNet-50 used as a frozen feature extractor: the last two child
# modules are dropped so the network emits a spatial feature map instead of
# class scores (a saved test feature is checked later to be (1, 2048, 7, 7)).
net = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-2])

# Move to the GPU and switch to inference mode; both calls return the module itself.
net = net.cuda().eval()

print('done')

done


In [12]:
from tqdm import tqdm

In [13]:
# Extract one ResNet-50 feature map per CIFAR-10 training image and save each to
# its own .npy file; collect the labels into `train_labels` in the same order.

# Re-seed every RNG so this cell behaves the same when re-run on its own.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Files are named after the batch index. Because the loader uses batch_size=1 and
# shuffle=False, that index is also the image's position in the dataset.
train_feature_path = './data/cifar_10/train/{}.npy'
# Create the output directory up front; otherwise a fresh run fails on the first write.
os.makedirs(os.path.dirname(train_feature_path), exist_ok=True)

train_labels_list = []

# A single no_grad context is enough: nothing in this loop needs gradients.
with torch.no_grad():
    for idx, (inputs, labels) in tqdm(enumerate(trainloader), total=len(trainloader)):
        features = net(inputs.cuda())  # (bs, 2048, 7, 7) with bs == 1
        np.save(train_feature_path.format(str(idx)), features.cpu().numpy())

        # Labels are only collected, never fed to the network, so they stay on the CPU.
        train_labels_list.append(labels)

# One label per image, in loader order: shape (50000,).
train_labels = torch.cat(train_labels_list, dim=0).numpy()

100%|██████████| 50000/50000 [10:17<00:00, 80.93it/s]


In [23]:
# Pair every training label with its position in the (unshuffled) loader; that
# position is also the stem of the feature file saved for the image.
data = [[position, label] for position, label in enumerate(train_labels)]

df_0 = pd.DataFrame(data, columns=['cifar_index','label'])
df_0.head()

Unnamed: 0,cifar_index,label
0,0,6
1,1,9
2,2,9
3,3,4
4,4,1


In [24]:
# Sanity check: one row per training image (50000) and two non-null int64 columns.
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
cifar_index    50000 non-null int64
label          50000 non-null int64
dtypes: int64(2)
memory usage: 781.3 KB


In [25]:
# Persist the index/label table (without the DataFrame index) so later cells can
# reload it instead of re-running feature extraction.
df_0.to_csv('data/train.csv', index=False)

In [11]:
# Reload the saved index/label table; this lets the notebook resume from here
# without repeating the feature-extraction cell above.
df_0 = pd.read_csv('data/train.csv')
# Confirm the reload: 50000 rows, two int64 columns.
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
cifar_index    50000 non-null int64
label          50000 non-null int64
dtypes: int64(2)
memory usage: 781.3 KB


In [12]:
from sklearn.model_selection import train_test_split
# Keep a class-stratified 30% subsample of the training table (15000 rows,
# 1500 per class). The complementary 70% is not used.
# The unused part is bound to a named variable rather than `_`: assigning to `_`
# in IPython shadows its "last output" variable, which then stops updating for
# the rest of the session.
_df_0_unused, df_0_sampled = train_test_split(
    df_0,
    test_size=0.3,
    random_state=random_seed,
    stratify=df_0['label'],
)

In [13]:
# Peek at the subsample; the DataFrame index is carried over from df_0, so it
# matches cifar_index.
df_0_sampled.head()

Unnamed: 0,cifar_index,label
39264,39264,9
18869,18869,7
11313,11313,2
26085,26085,6
22976,22976,8


In [14]:
# Check the stratification: every class should appear equally often (1500 each).
df_0_sampled['label'].value_counts()

7    1500
6    1500
5    1500
4    1500
3    1500
2    1500
9    1500
1    1500
8    1500
0    1500
Name: label, dtype: int64

In [15]:
# Carve a class-stratified dev set of exactly 5000 rows out of the subsample;
# the remaining rows form the reduced training set.
df_0_sampled_train, df_0_sampled_dev = train_test_split(
    df_0_sampled,
    test_size=5000,
    stratify=df_0_sampled['label'],
    random_state=random_seed,
)

In [16]:
# Save the reduced training split (the subsample minus the 5000 dev rows).
df_0_sampled_train.to_csv('data/train_10000.csv', index=False)

In [17]:
# Save the 5000-row stratified dev split.
df_0_sampled_dev.to_csv('data/dev_5000.csv', index=False)

In [33]:
# Extract one ResNet-50 feature map per CIFAR-10 test image and save each to its
# own .npy file; collect the labels into `test_labels` in the same order.

# Re-seed every RNG so this cell behaves the same when re-run on its own.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Files are named after the batch index. Because the loader uses batch_size=1 and
# shuffle=False, that index is also the image's position in the dataset.
test_feature_path = './data/cifar_10/test/{}.npy'
# Create the output directory up front; otherwise a fresh run fails on the first write.
os.makedirs(os.path.dirname(test_feature_path), exist_ok=True)

test_labels_list = []

# A single no_grad context is enough: nothing in this loop needs gradients.
with torch.no_grad():
    for idx, (inputs, labels) in tqdm(enumerate(testloader), total=len(testloader)):
        features = net(inputs.cuda())  # (bs, 2048, 7, 7) with bs == 1
        np.save(test_feature_path.format(str(idx)), features.cpu().numpy())

        # Labels are only collected, never fed to the network, so they stay on the CPU.
        test_labels_list.append(labels)

# One label per image, in loader order: shape (10000,).
test_labels = torch.cat(test_labels_list, dim=0).numpy()

100%|██████████| 10000/10000 [02:08<00:00, 77.78it/s]


In [34]:
# Pair every test label with its position in the (unshuffled) loader; that
# position is also the stem of the feature file saved for the image.
data = [[position, label] for position, label in enumerate(test_labels)]

df_1 = pd.DataFrame(data, columns=['cifar_index','label'])
df_1.head()

Unnamed: 0,cifar_index,label
0,0,3
1,1,8
2,2,8
3,3,0
4,4,6


In [35]:
# Sanity check: one row per test image (10000) and two non-null int64 columns.
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
cifar_index    10000 non-null int64
label          10000 non-null int64
dtypes: int64(2)
memory usage: 156.3 KB


In [36]:
# Persist the test index/label table without the DataFrame index.
df_1.to_csv('data/test.csv', index=False)

In [37]:
# Spot-check one saved feature file: reload it and display its shape
# (a single image gives a (1, 2048, 7, 7) array).
tmp = np.load('./data/cifar_10/test/0.npy')
tmp.shape

(1, 2048, 7, 7)

In [18]:
# Write "random removal" variants of the reduced training split (sampling seed 0).
# For each percentage 0, 10, ..., 90, that share of rows is removed uniformly at
# random (not stratified, so class counts drift) and the remaining rows are
# saved to data/random_0/<percentage>.csv.
total = len(df_0_sampled_train)

filename_template = "data/random_0/{}.csv"
# The target directory is the same for every percentage, so create it once.
os.makedirs(os.path.dirname(filename_template), exist_ok=True)

for percentage in range(0, 100, 10):
    # Exact integer arithmetic; int(total * (percentage / 100)) depends on float
    # rounding and can come out one short for some totals.
    k = total * percentage // 100
    print(percentage, k)

    # Rows to remove, then the rows that stay.
    dropped = df_0_sampled_train.sample(k, random_state=0)
    kept = df_0_sampled_train.drop(dropped.index)
    assert len(kept) == total - k, "index labels must be unique for drop() to remove exactly k rows"
    print(kept['label'].value_counts())

    filename = filename_template.format(percentage)
    kept[['cifar_index', 'label']].to_csv(filename, index=False)

0 0
7    1000
6    1000
5    1000
4    1000
3    1000
2    1000
9    1000
1    1000
8    1000
0    1000
Name: label, dtype: int64
10 1000
8    919
7    904
6    904
2    903
0    902
5    898
3    898
9    893
1    890
4    889
Name: label, dtype: int64
20 2000
8    826
6    809
5    803
7    802
2    802
3    797
1    796
0    796
9    792
4    777
Name: label, dtype: int64
30 3000
6    719
5    713
8    711
9    703
3    700
7    699
2    694
1    692
0    691
4    678
Name: label, dtype: int64
40 4000
3    614
6    612
7    611
5    604
1    604
8    600
0    596
2    592
9    590
4    577
Name: label, dtype: int64
50 5000
3    526
5    510
6    508
7    505
8    505
0    499
1    495
9    492
2    485
4    475
Name: label, dtype: int64
60 6000
3    424
5    416
8    409
6    407
0    404
1    397
7    395
2    393
9    381
4    374
Name: label, dtype: int64
70 7000
3    325
5    323
1    312
8    308
6    306
0    297
2    291
9    288
7    281
4    269
Name: label, dtype: int64
80

In [19]:
# Write "random removal" variants of the reduced training split (sampling seed 2).
# For each percentage 0, 10, ..., 90, that share of rows is removed uniformly at
# random (not stratified, so class counts drift) and the remaining rows are
# saved to data/random_2/<percentage>.csv.
total = len(df_0_sampled_train)

filename_template = "data/random_2/{}.csv"
# The target directory is the same for every percentage, so create it once.
os.makedirs(os.path.dirname(filename_template), exist_ok=True)

for percentage in range(0, 100, 10):
    # Exact integer arithmetic; int(total * (percentage / 100)) depends on float
    # rounding and can come out one short for some totals.
    k = total * percentage // 100
    print(percentage, k)

    # Rows to remove, then the rows that stay.
    dropped = df_0_sampled_train.sample(k, random_state=2)
    kept = df_0_sampled_train.drop(dropped.index)
    assert len(kept) == total - k, "index labels must be unique for drop() to remove exactly k rows"
    print(kept['label'].value_counts())

    filename = filename_template.format(percentage)
    kept[['cifar_index', 'label']].to_csv(filename, index=False)

0 0
7    1000
6    1000
5    1000
4    1000
3    1000
2    1000
9    1000
1    1000
8    1000
0    1000
Name: label, dtype: int64
10 1000
7    908
9    906
0    906
5    905
1    902
8    901
2    896
4    895
3    895
6    886
Name: label, dtype: int64
20 2000
5    821
0    809
4    807
2    804
7    803
9    803
3    794
8    790
6    786
1    783
Name: label, dtype: int64
30 3000
2    714
4    714
7    710
5    710
3    700
0    697
8    694
9    693
6    689
1    679
Name: label, dtype: int64
40 4000
2    618
4    618
7    615
5    611
9    603
0    592
8    589
6    588
3    585
1    581
Name: label, dtype: int64
50 5000
7    537
4    522
5    520
2    507
9    507
3    496
6    480
1    480
0    478
8    473
Name: label, dtype: int64
60 6000
7    441
5    417
4    416
2    416
9    407
3    404
0    382
6    378
1    371
8    368
Name: label, dtype: int64
70 7000
5    327
7    318
4    315
3    312
2    311
9    303
8    282
0    279
6    278
1    275
Name: label, dtype: int64
80

In [20]:
# Write "random removal" variants of the reduced training split (sampling seed 42).
# For each percentage 0, 10, ..., 90, that share of rows is removed uniformly at
# random (not stratified, so class counts drift) and the remaining rows are
# saved to data/random/<percentage>.csv.
total = len(df_0_sampled_train)

filename_template = "data/random/{}.csv"
# The target directory is the same for every percentage, so create it once.
os.makedirs(os.path.dirname(filename_template), exist_ok=True)

for percentage in range(0, 100, 10):
    # Exact integer arithmetic; int(total * (percentage / 100)) depends on float
    # rounding and can come out one short for some totals.
    k = total * percentage // 100
    print(percentage, k)

    # Rows to remove, then the rows that stay.
    dropped = df_0_sampled_train.sample(k, random_state=42)
    kept = df_0_sampled_train.drop(dropped.index)
    assert len(kept) == total - k, "index labels must be unique for drop() to remove exactly k rows"
    print(kept['label'].value_counts())

    filename = filename_template.format(percentage)
    kept[['cifar_index', 'label']].to_csv(filename, index=False)

0 0
7    1000
6    1000
5    1000
4    1000
3    1000
2    1000
9    1000
1    1000
8    1000
0    1000
Name: label, dtype: int64
10 1000
5    911
1    911
3    909
7    906
2    903
6    897
9    894
4    893
0    889
8    887
Name: label, dtype: int64
20 2000
7    815
3    814
6    808
5    804
0    799
1    798
9    794
8    792
4    792
2    784
Name: label, dtype: int64
30 3000
6    728
7    715
3    713
5    703
1    703
4    694
8    691
2    690
0    684
9    679
Name: label, dtype: int64
40 4000
7    627
6    615
3    606
8    602
2    601
4    595
5    593
1    591
9    585
0    585
Name: label, dtype: int64
50 5000
2    521
7    518
6    507
3    503
8    499
1    497
4    495
5    492
0    490
9    478
Name: label, dtype: int64
60 6000
7    425
8    410
6    407
2    405
0    404
4    399
5    397
1    394
3    389
9    370
Name: label, dtype: int64
70 7000
7    318
2    314
0    312
8    311
4    307
6    300
1    296
5    285
3    284
9    273
Name: label, dtype: int64
80