In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import random

#from PIL import Image
from skimage import io, transform
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets
from torch.autograd import Variable
from cnn_finetune import make_model
import tqdm

In [None]:
# Base directory for the experiment: the working directory at the moment this
# cell runs. prepare_dataset() joins the CSV file names and image folder names
# onto this path.
data_dir = os.getcwd()

In [None]:
class PainterDataset(Dataset):
    """Dataset of image pairs described by a CSV with file1/file2/label columns.

    Args:
        csv_file: Path to a CSV containing at least the columns ``file1``,
            ``file2`` and ``label``.
        img_dir: Directory that the ``file1``/``file2`` entries are relative to.
        transform: Optional callable applied to each loaded image. ``None``
            returns the images exactly as ``io.imread`` produced them.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.df = pd.read_csv(csv_file, usecols=['file1', 'file2', 'label'])

    def __len__(self):
        """Return the number of pairs, i.e. the number of CSV rows."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Load the pair at position ``idx``.

        Returns:
            tuple: ``(img_1, img_2, label)`` where ``label`` is a one-element
            tensor (1 if same, 0 if not).
        """
        row = self.df.iloc[[idx]]
        filename_1 = row['file1'].item()
        filename_2 = row['file2'].item()
        label = row['label'].item()

        # Resolve files against img_dir explicitly. The previous os.chdir()
        # changed the working directory of the whole process on every item,
        # which silently breaks any other relative path in use.
        img_1 = io.imread(os.path.join(self.img_dir, filename_1))
        img_2 = io.imread(os.path.join(self.img_dir, filename_2))

        if self.transform is not None:
            img_1 = self.transform(img_1)
            img_2 = self.transform(img_2)

        label = torch.tensor([label])  # 1 if same, 0 if not

        return img_1, img_2, label

In [None]:
class SiameseNetwork(nn.Module):
    """Siamese classifier: one shared backbone, then an MLP on the joined features.

    Both images go through the same ``cnn``; the flattened outputs are
    concatenated and fed to a fully connected head that ends in a single
    sigmoid unit.

    Args:
        cnn: Module applied to each image to extract features.
        in_features: Width of the concatenated feature vector, i.e. twice the
            flattened per-image output size of ``cnn``. The default of 65536
            is the previously hard-coded value (2 x 32768; presumably VGG16
            ``features`` on 256x256 inputs -- TODO confirm).
    """

    def __init__(self, cnn, in_features=65536):
        super(SiameseNetwork, self).__init__()
        self.cnn = cnn
        self.fc = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=4096, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=4096, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=128, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=128, out_features=1, bias=True),
        )

    def forward_once(self, x):
        """Run one image batch through the backbone and flatten per sample."""
        output = self.cnn(x)
        # flatten() also copes with non-contiguous backbone output, where
        # .view() would raise.
        return torch.flatten(output, 1)

    def forward(self, img1, img2):
        """Return a ``(batch, 1)`` tensor of sigmoid outputs for the pair."""
        output1 = self.forward_once(img1)
        output2 = self.forward_once(img2)
        # Joined width must equal the first Linear layer's in_features.
        features = torch.cat((output1, output2), 1)
        features = self.fc(features)
        return torch.sigmoid(features)

In [None]:
class Args(object):
    """Plain container for the hyper-parameters of one training run."""

    def __init__(self, batch_size=32, test_batch_size=16,
                 epochs=5, lr=0.01, momentum=0.5,
                 log_interval=100, seed=1):
        # Mini-batch sizes for the training loader and the testing loader.
        self.batch_size, self.test_batch_size = batch_size, test_batch_size
        # Number of epochs to train.
        self.epochs = epochs
        # Optimizer settings: learning rate and momentum.
        self.lr, self.momentum = lr, momentum
        # Batches to wait before logging detailed status; 0 = never.
        self.log_interval = log_interval
        # Random seed.
        self.seed = seed

In [None]:
def prepare_dataset(object):
    """Build the training and test DataLoaders for the painter-pair CSVs.

    Args:
        object: Settings object exposing ``batch_size`` and
            ``test_batch_size`` (for example an ``Args`` instance). The
            parameter keeps its historical name so existing callers are
            unaffected; it is aliased locally to avoid working with a name
            that shadows the builtin.

    Returns:
        tuple: ``(train_loader, test_loader)``.
    """
    # Use the settings that were passed in; the previous version ignored its
    # argument and read a global ``args`` instead.
    settings = object

    train_csv = os.path.join(data_dir, 'train_120k_pair.csv')
    train_dir = os.path.join(data_dir, 'train_reg')
    test_csv = os.path.join(data_dir, 'test_pair.csv')
    test_dir = os.path.join(data_dir, 'test_reg')

    loader_kwargs = {'num_workers': 1, 'pin_memory': True}

    # Convert to a tensor, then normalise each of the three channels.
    transform1 = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_dataset = PainterDataset(csv_file=train_csv, img_dir=train_dir, transform=transform1)
    test_dataset = PainterDataset(csv_file=test_csv, img_dir=test_dir, transform=transform1)

    train_loader = DataLoader(train_dataset, batch_size=settings.batch_size,
                              shuffle=True, **loader_kwargs)
    # Evaluation does not depend on sample order, so skip shuffling and keep
    # test runs deterministic.
    test_loader = DataLoader(test_dataset, batch_size=settings.test_batch_size,
                             shuffle=False, **loader_kwargs)

    return train_loader, test_loader

In [None]:
def train(model, optimizer, train_loader, epoch, total_minibatch_count, train_losses, train_accs,
          log_interval=None):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module called as ``model(input1, input2)`` whose output is fed
            to ``nn.BCELoss``.
        optimizer: Optimizer stepping the model's parameters.
        train_loader: Iterable yielding ``(input1, input2, target)`` batches.
        epoch: Epoch number, used only in the progress-bar text.
        total_minibatch_count: Number of mini-batches processed before this
            call.
        train_losses: List that receives the batch loss (float) whenever a
            batch is logged.
        train_accs: List that receives the batch accuracy (float) whenever a
            batch is logged.
        log_interval: Log every this-many mini-batches; 0 disables logging.
            ``None`` (default) reads ``args.log_interval`` from the
            module-level ``args``, as the function always did.

    Returns:
        int: ``total_minibatch_count`` advanced by the number of batches seen.
    """
    if log_interval is None:
        log_interval = args.log_interval

    model.train()
    criterion = nn.BCELoss()

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    running_loss, running_acc = 0., 0.
    progress_bar = tqdm.tqdm(train_loader, desc='Training')

    for batch_idx, (input1, input2, target) in enumerate(progress_bar):
        input1, input2 = input1.to(device), input2.to(device)
        target = target.to(device).float()

        optimizer.zero_grad()

        # Forward prediction step
        output = model(input1, input2)
        train_loss = criterion(output, target)

        # Backpropagation step
        train_loss.backward()
        optimizer.step()

        pred = (output > 0.5).float()
        # .item() yields plain floats; indexing a 0-dim tensor with [0]
        # raises on current PyTorch.
        batch_acc = (target == pred).float().mean().item()
        batch_loss = train_loss.item()

        if log_interval != 0 and total_minibatch_count % log_interval == 0:
            train_losses.append(batch_loss)
            train_accs.append(batch_acc)

        running_loss += batch_loss
        running_acc += batch_acc

        progress_bar.set_description(
            'Epoch: {} loss: {:.4f}, acc: {:.2f}'.format(
                epoch, running_loss / (batch_idx + 1), running_acc / (batch_idx + 1)))

        total_minibatch_count += 1

    return total_minibatch_count

In [None]:
def test(model, test_loader):
         #, epoch, total_minibatch_count,val_losses, val_accs):
    
    # testing
    model.eval()
    loss = nn.BCELoss()
    test_loss, correct = 0., 0.
    progress_bar = tqdm.tqdm(test_loader, desc='Validation')
    with torch.no_grad():
        for input1, input2, target in progress_bar:
            input1, input2, target = input1.cuda(), input2.cuda(), target.cuda()
            input1, input2, target = Variable(input1), Variable(input2), Variable(target)
            
            output = model(input1, input2)
            test_loss += loss(output,target.float())
            pred =  (output > 0.5)
            correct += torch.sum(target.float() == pred.float())

    test_loss /= len(test_loader.dataset)
    
    acc = correct / len(test_loader.dataset)

    #val_losses.append(test_loss)
    #val_accs.append(acc)
    
    progress_bar.clear()
    progress_bar.write(
        'validation test results - Average val_loss: {:.4f}, val_acc: {}/{} ({:.2f}%)'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    return acc

In [None]:
class Identity(nn.Module):
    """Pass-through module: ``forward`` hands back its argument unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # No computation; the very same object is returned.
        return x
      
def run_experiment(args):
    """Train and evaluate a Siamese network on a frozen VGG16 feature extractor.

    Args:
        args: Settings object providing ``seed``, ``epochs``, ``lr`` and
            ``momentum``; it is also passed on to ``prepare_dataset``.

    Returns:
        The trained ``SiameseNetwork`` (moved to CUDA).
    """
    # Seed every RNG in play. Seeding only CUDA left the CPU generator
    # unseeded, and that generator drives both the weight initialisation
    # (layers are built on CPU before .cuda()) and DataLoader shuffling.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    train_loader, test_loader = prepare_dataset(args)

    # Pretrained VGG16 convolutional stack, frozen so only the head trains.
    # (Alternatives previously tried here: resnet18 and inceptionresnetv2
    # with their classifier replaced by Identity.)
    cnn = models.vgg16(pretrained=True).features.eval()
    for param in cnn.parameters():
        param.requires_grad = False

    model = SiameseNetwork(cnn).cuda()

    # Hand the optimizer only the parameters that can actually receive
    # gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(trainable_params, lr=args.lr, momentum=args.momentum)

    # Filled by train() at the logging interval; not returned by this function.
    train_losses, train_accs = [], []
    total_minibatch_count = 0

    for epoch in range(1, args.epochs + 1):
        total_minibatch_count = train(model, optimizer, train_loader, epoch,
                                      total_minibatch_count, train_losses, train_accs)
        test(model, test_loader)

    return model

In [None]:
# Default hyper-parameters. `args` has to remain a module-level name: train()
# looks up `args.log_interval` from the notebook's global scope.
args = Args()
# Runs the full train/evaluate loop and keeps the model returned by
# run_experiment() for the save cell below.
model = run_experiment(args)

In [None]:
# Serialise the whole model object to 'vgg.ph', relative to the current
# working directory.
torch.save(model, 'vgg.ph')