In [None]:
% run 1-datasource.ipynb

In [None]:
import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from tensorboardX import SummaryWriter

In [None]:
def tensorify(data):
    """Turn a NumPy array of image data into a float tensor.

    The array is reshaped to ``(-1, 1, 75, 75)`` (single channel, 75x75
    pixels) before being wrapped as a tensor and cast to float.
    """
    images = data.reshape(-1, 1, 75, 75)
    return torch.from_numpy(images).float()

In [None]:
# Peek at the first rows of `icebergs` (not defined in this notebook;
# presumably created by the %run data-source notebook -- TODO confirm).
icebergs.head()

In [None]:
# Hold out 10% of the labelled data, keeping the iceberg/ship ratio the same in
# both parts. A fixed random_state keeps the hold-out set identical every time
# this cell is executed, so scores from different runs are comparable and
# re-running the cell cannot move training rows into the test split.
SPLIT_SEED = 42
train, test = train_test_split(
    icebergs,
    test_size=0.1,
    stratify=icebergs.is_iceberg,
    random_state=SPLIT_SEED,
)
test.shape

In [None]:
# One independent scaler per radar band (band_1 / band_2).
scaler_1, scaler_2 = StandardScaler(), StandardScaler()

In [None]:
# Fit each band's scaler on the whole training matrix in one call, using the
# same np.stack(...) layout that is later passed to transform(). Unlike
# accumulating with partial_fit row by row, fit() starts from scratch, so
# re-running this cell does not feed the same rows into the statistics twice.
scaler_1.fit(np.stack(train.band_1))
scaler_2.fit(np.stack(train.band_2))

In [None]:
def _band_tensor(frame):
    """Standardise both bands of `frame` and return a float tensor shaped (-1, 2, 75, 75)."""
    band_1 = scaler_1.transform(np.stack(frame.band_1))
    band_2 = scaler_2.transform(np.stack(frame.band_2))
    # Put the two bands side by side on axis 1 so they become the channels.
    channels = np.stack([band_1, band_2], axis=1)
    return torch.from_numpy(channels.reshape(-1, 2, 75, 75)).float()


def _label_tensor(frame):
    """Return the `is_iceberg` column of `frame` as a float column tensor."""
    return torch.from_numpy(frame.is_iceberg.values.reshape(-1, 1)).float()


train_x, test_x = _band_tensor(train), _band_tensor(test)
train_y, test_y = _label_tensor(train), _label_tensor(test)

In [None]:
# Run-wide settings.
cuda = torch.cuda.is_available()  # True when a CUDA device can be used

# NOTE(review): num_epochs is not referenced by the training cell further
# down, which uses its own hard-coded epoch count.
num_epochs, batch_size = 1, 25
learning_rate = 1e-5

# Echo GPU availability as the cell output.
cuda

In [None]:
# Wrap the (inputs, labels) tensor pairs as datasets, then batch them.
train_dataset = data_utils.TensorDataset(train_x, train_y)
test_dataset = data_utils.TensorDataset(test_x, test_y)

train_loader = data_utils.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = data_utils.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
class Net(nn.Module):
    """Small CNN: two conv/batch-norm/ReLU/max-pool stages and a 3-layer head.

    Takes 2-channel images and produces one raw logit per image (no sigmoid
    is applied inside the network).
    """

    def __init__(self):
        super(Net, self).__init__()

        # groups=2 gives each of the two input channels its own 8 filters.
        self.layer1 = self._conv_stage(2, 16, groups=2)
        self.layer2 = self._conv_stage(16, 32)

        # For 75x75 inputs the two 2x2 poolings leave 18x18 maps (75 -> 37 -> 18),
        # hence 32 * 18 * 18 flattened features.
        self.fc = nn.Sequential(
            nn.Linear(32 * 18 * 18, 600),
            nn.ReLU(),
            nn.Linear(600, 200),
            nn.ReLU(),
            nn.Linear(200, 1),
        )

    @staticmethod
    def _conv_stage(in_channels, out_channels, groups=1):
        """Build one 5x5 conv -> batch norm -> ReLU -> 2x2 max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=5, stride=1, padding=2, groups=groups),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

    def forward(self, x):
        """Return the logits for the image batch `x`."""
        features = self.layer2(self.layer1(x))
        # Flatten everything except the batch dimension for the linear head.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

In [None]:
net = Net()
criterion = nn.BCEWithLogitsLoss()

# Record the graph in TensorBoard from a single training sample. This runs
# before any move to the GPU because train_x is a CPU tensor.
writer = SummaryWriter()
writer.add_graph(net, net(Variable(train_x.narrow(0, 0, 1))))

if cuda:
    net.cuda()
    criterion.cuda()

# Create the optimizer only once the model sits on its final device; the
# torch.optim documentation asks for .cuda() to happen before optimizers are
# constructed.
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Per-epoch loss histories, appended to by the training cell.
epoch_train_loss = []
epoch_test_loss = []

In [None]:
def loop(loader, training=False):
    """Run the network once over every batch produced by `loader`.

    Uses the notebook-level `net`, `criterion`, `optimizer` and `cuda`.

    Args:
        loader: iterable yielding `(x, y)` tensor batches.
        training: when True the network is put in training mode and the
            optimizer takes one step per batch; otherwise the network is put
            in evaluation mode and no weights are updated.

    Returns:
        A tuple `(targets, predictions, running_loss)`:
        targets is a flat array of the labels seen, predictions is an array
        with one row of sigmoid outputs per sample, and running_loss is the
        batch-mean loss multiplied by the batch size, summed over all batches
        (divide by the number of samples to get the mean loss).
    """
    # The network contains BatchNorm layers, which use batch statistics in
    # training mode and running statistics in evaluation mode.
    net.train(training)

    running_loss = 0
    targets = []
    predictions = []

    for x, y in loader:
        x = Variable(x)
        # Keep labels as a column so they line up with the (batch, 1) outputs.
        y = Variable(y).view(-1, 1)

        if cuda:
            x = x.cuda()
            y = y.cuda()

        outputs = net(x)
        loss = criterion(outputs, y)

        if training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # ndarray.item() extracts the scalar whether the loss is held as a
        # one-element or a zero-dimensional tensor.
        batch_loss = loss.data.cpu().numpy().item()
        # The criterion (default settings) reports the mean over the batch;
        # weight it by the batch size so that dividing the total by the
        # dataset size yields the true per-sample mean, even when the last
        # batch is smaller.
        running_loss += batch_loss * x.size(0)

        # ravel() keeps targets one-dimensional even for a batch of one.
        targets.extend(y.data.cpu().numpy().ravel())
        predictions.extend(outputs.sigmoid().data.cpu().numpy())

    return np.array(targets), np.array(predictions), running_loss

In [None]:
# NOTE(review): the epoch count is hard-coded here; `num_epochs` from the
# configuration cell is not used.
for e in range(30):

    train_targets, train_preds, train_loss = loop(train_loader, training=True)
    test_targets, test_preds, test_loss = loop(test_loader, training=False)

    # Normalise by dataset size before logging so that TensorBoard, the
    # printed values and the plotted history all show the same quantity.
    train_loss /= train.shape[0]
    test_loss /= test.shape[0]

    # Predictions hold one sigmoid probability per sample, so the class is
    # obtained by rounding at 0.5. (argmax over a single column is always 0
    # and would score every sample as class 0.)
    train_accuracy = accuracy_score(train_targets, train_preds.round().ravel())
    test_accuracy = accuracy_score(test_targets, test_preds.round().ravel())

    writer.add_scalar('data/train_loss', train_loss, e)
    writer.add_scalar('data/train_accuracy', train_accuracy, e)
    writer.add_scalar('data/test_loss', test_loss, e)
    writer.add_scalar('data/test_accuracy', test_accuracy, e)

    epoch_train_loss.append(train_loss)
    epoch_test_loss.append(test_loss)

    print('Training loss: {:.4f}'.format(train_loss))
    print('Testing  loss: {:.4f}'.format(test_loss))

In [None]:
# Draw the per-epoch loss histories on one set of axes.
for curve_label, history in (('train', epoch_train_loss), ('test', epoch_test_loss)):
    plt.plot(history, label=curve_label)
plt.legend();

In [None]:
# Evaluation pass over the training data (no weight updates), then accuracy
# of the probabilities rounded to 0/1.
train_targets, train_preds, train_loss = loop(train_loader, training=False)
train_pred_labels = train_preds.round()
accuracy_score(train_targets, train_pred_labels)

In [None]:
# Log loss of the predicted probabilities against the training labels,
# using the arrays produced by the preceding evaluation pass.
log_loss(train_targets, train_preds)

In [None]:
# Evaluation pass over the held-out split (no weight updates), then accuracy
# of the probabilities rounded to 0/1.
test_targets, test_preds, test_loss = loop(test_loader, training=False)
test_pred_labels = test_preds.round()
accuracy_score(test_targets, test_pred_labels)

In [None]:
# Log loss of the predicted probabilities against the held-out labels,
# using the arrays produced by the preceding evaluation pass.
log_loss(test_targets, test_preds)

In [None]:
# Load the data to predict on. load_test_set is not defined in this notebook;
# presumably it comes from the %run data-source notebook -- TODO confirm.
icebergs_test = load_test_set()

In [None]:
# Peek at the first rows of the prediction data before scoring it.
icebergs_test.head()

In [None]:
def predict_testset(icebergs_test, scalers=None):
    """Score `icebergs_test` with the trained network and write a submission CSV.

    Args:
        icebergs_test: DataFrame with `band_1` and `band_2` columns. It is
            modified in place: an `is_iceberg` column holding the predicted
            probabilities is added.
        scalers: optional `(band_1_scaler, band_2_scaler)` pair of fitted
            scalers. Defaults to the notebook-level `scaler_1` / `scaler_2`,
            i.e. the ones fitted on the training split, so the network sees
            inputs standardised exactly as they were during training.

    Returns:
        `(targets, preds, loss)` as returned by `loop`. The targets are
        placeholder zeros, so `targets` and `loss` carry no information.

    Side effects:
        Writes the `is_iceberg` column to 'data/submission.csv'.
    """
    if scalers is None:
        # Reuse the training-time statistics instead of re-fitting on the
        # data being predicted.
        scalers = (scaler_1, scaler_2)
    band_1_scaler, band_2_scaler = scalers

    testset_x = torch.from_numpy(
        np.stack(
            [
                band_1_scaler.transform(np.stack(icebergs_test.band_1)),
                band_2_scaler.transform(np.stack(icebergs_test.band_2)),
            ],
            axis=1
        ).reshape(-1, 2, 75, 75)
    ).float()

    # loop() iterates over (x, y) batches; there are no labels here, so feed
    # zeros as placeholders.
    testset_y = torch.zeros(len(testset_x), 1)

    # shuffle=False keeps predictions in the same order as the DataFrame rows.
    testset_loader = data_utils.DataLoader(
        data_utils.TensorDataset(testset_x, testset_y),
        batch_size=batch_size,
        shuffle=False,
    )

    targets, preds, loss = loop(testset_loader, training=False)

    # preds has one row per sample; flatten it so a plain 1-D column is stored.
    icebergs_test['is_iceberg'] = preds.ravel()

    icebergs_test[['is_iceberg']].to_csv('data/submission.csv')

    return targets, preds, loss

In [None]:
# Score the prediction data; this also writes data/submission.csv.
t, p, l = predict_testset(icebergs_test)

In [None]:
# Inspect the frame again: predict_testset assigned an `is_iceberg` column
# to it in place.
icebergs_test.head()