In [1]:
import numpy as np
import pandas as pd
import torch
import torchvision
from matplotlib import pyplot as plt

In [None]:
# Load the training CSV as a float32 array of shape (n_rows, 2).
# Later cells treat column 0 as the acoustic signal and column 1 as
# time_to_failure -- this assumes the CSV lists its columns in that order.
train_data = pd.read_csv(
    "../input/train.csv",
    dtype={"acoustic_data": np.float32, "time_to_failure": np.float32},
).values

In [None]:
# First row index of every earthquake cycle: a positive step in column 1
# (time_to_failure) is taken as the first sample of a new cycle, and row 0
# starts the first cycle.
start_indexs = np.nonzero(np.diff(train_data[:, 1]) > 0)[0] + 1
start_indexs = np.insert(start_indexs, 0, 0)

# Standardise the signal (column 0) of each cycle in place: zero mean, unit std.
# Every cycle ends where the next one starts; the final cycle ends at
# len(train_data). Pairing consecutive entries of start_indexs alone would
# leave that final cycle un-normalised.
segment_ends = np.append(start_indexs[1:], len(train_data))
for segment_start, segment_end in zip(start_indexs, segment_ends):
    segment = train_data[segment_start:segment_end, 0]
    train_data[segment_start:segment_end, 0] = (segment - segment.mean()) / segment.std()

In [None]:
def features_12(data, n_steps=150, step_length=1000):
    """Summarise every row of ``data`` with 12 statistics.

    Mean, std, min and max are computed per row over three windows: the whole
    row, the trailing ``step_length // 10`` samples and the trailing
    ``step_length // 100`` samples.

    Args:
        data: 2-D array with one row per step.
        n_steps: not used by the computation; kept so existing calls still work.
        step_length: nominal row length, used only to size the two tail windows.

    Returns:
        Array of shape ``(data.shape[0], 12)``. Columns are grouped as
        whole row, last 10 %, last 1 %, each group ordered mean, std, min, max.
    """
    # A tail width of 0 would produce the slice ``data[:, -0:]``, which is the
    # WHOLE row rather than a short tail, so clamp both widths to >= 1 sample.
    tail_10 = max(1, step_length // 10)
    tail_100 = max(1, step_length // 100)
    windows = (data, data[:, -tail_10:], data[:, -tail_100:])

    columns = []
    for window in windows:
        columns.extend((
            window.mean(axis=1),
            window.std(axis=1),
            window.min(axis=1),
            window.max(axis=1),
        ))
    return np.c_[tuple(columns)]

In [None]:
def to_features(raw_data, last_index=None, n_steps=150, step_length=1000):
    """Convert the ``n_steps * step_length`` samples ending at ``last_index`` into features.

    Args:
        raw_data: signal array; the callers in this notebook pass a 1-D column.
        last_index: exclusive end of the window. ``None`` means ``len(raw_data)``.
        n_steps: number of rows the window is split into.
        step_length: number of samples per row.

    Returns:
        The result of ``features_12`` on the window reshaped to
        ``(n_steps, step_length)``.

    Raises:
        ValueError: if the requested window does not lie inside ``raw_data``.
    """
    if last_index is None:
        last_index = len(raw_data)

    window_size = n_steps * step_length
    first_index = last_index - window_size
    # Fail early with a readable message: a negative start would wrap around to
    # the end of the array and an overshooting end would be silently truncated,
    # both of which otherwise surface only as an opaque reshape error.
    if first_index < 0 or last_index > len(raw_data):
        raise ValueError(
            "window [{}, {}) does not fit inside raw_data of length {}".format(
                first_index, last_index, len(raw_data)
            )
        )

    window = raw_data[first_index:last_index]
    return features_12(window.reshape(n_steps, step_length), n_steps, step_length)

In [None]:
def random_generator(raw_data, min_index=0, max_index=None, batch_size=32, n_steps=150, step_length=1000):
    """Endlessly yield random ``(samples, targets)`` batches drawn from ``raw_data``.

    Args:
        raw_data: 2-D array; column 0 is the signal fed to ``to_features`` and
            column 1 supplies the targets.
        min_index: first row a window may start at.
        max_index: exclusive upper bound for a window's end; ``None`` means
            ``len(raw_data)``.
        batch_size: number of windows per yielded batch.
        n_steps: rows per window, forwarded to ``to_features``.
        step_length: samples per row, forwarded to ``to_features``.

    Yields:
        ``(samples, targets)`` as float64 arrays. ``samples`` stacks one
        ``to_features`` result per window along axis 0; ``targets`` has shape
        ``(batch_size,)`` and holds column 1 of each window's last row.
    """
    if max_index is None:
        max_index = len(raw_data)

    window_size = n_steps * step_length
    while True:
        last_indexs = np.random.randint(min_index + window_size, max_index, size=batch_size)

        # Stack the feature matrices themselves. A pre-allocated
        # (batch_size, n_steps, step_length) buffer has the shape of the raw
        # window, not of the features, so assigning into it cannot broadcast.
        samples = np.array(
            [to_features(raw_data[:, 0], last_index, n_steps, step_length) for last_index in last_indexs],
            dtype=np.float64,
        )
        targets = raw_data[last_indexs - 1, 1].astype(np.float64)
        yield samples, targets

In [None]:
class RandomDataset(torch.utils.data.Dataset):
    """Map-style dataset of randomly placed feature windows over ``raw_data``.

    The window end positions are drawn once, in ``__init__``, so the dataset
    content stays fixed for the lifetime of the instance.

    Args:
        raw_data: 2-D array; column 0 is the signal, column 1 the target.
        min_index: first row a window may start at.
        max_index: exclusive upper bound for a window's end; ``None`` means
            ``len(raw_data)``.
        n_steps: rows per window, forwarded to ``to_features``.
        step_length: samples per row, forwarded to ``to_features``.
        transform: callable applied to each feature matrix; ``None`` selects
            ``torchvision.transforms.ToTensor``.
            NOTE(review): ToTensor presumably prepends a singleton channel axis
            to a 2-D array -- confirm against the installed torchvision.
        dataset_size: number of windows (the dataset length).
    """

    def __init__(self, raw_data, min_index=0, max_index=None, n_steps=150, step_length=1000, transform=None, dataset_size=65536):
        # Identity checks on purpose: ``== None`` would dispatch to a custom
        # ``__eq__`` and could discard a caller-supplied value.
        if max_index is None:
            max_index = len(raw_data)
        if transform is None:
            transform = torchvision.transforms.Compose([
                torchvision.transforms.ToTensor()
            ])

        self.data = raw_data
        self.n_steps = n_steps
        self.step_length = step_length
        self.transform = transform
        # Exclusive end index of every window; the lower bound leaves room for
        # a full window starting at min_index.
        self.last_indexs = np.random.randint(min_index + n_steps * step_length, max_index, size=dataset_size)

    def __getitem__(self, index):
        """Return ``(transform(features), target)`` for the ``index``-th window."""
        last_index = self.last_indexs[index]
        sample = to_features(self.data[:, 0], last_index, self.n_steps, self.step_length)
        # The target is column 1 of the window's final row.
        target = self.data[last_index - 1, 1]
        return self.transform(sample), target

    def __len__(self):
        """Number of windows drawn at construction time."""
        return len(self.last_indexs)

In [None]:
# Training hyperparameters.
# Mini-batch size handed to both DataLoader instances.
batch_size = 32
# Number of epochs passed to fit().
n_epochs = 30
# Learning rate of the SGD optimizer.
learning_rate = 5e-4

In [None]:
# Clean split: rows [0, start_indexs[1]) -- the first earthquake cycle -- are
# reserved for evaluation, and training windows are drawn only from the rows
# after it. Sampling the training set from all rows would include the evaluation
# rows and make the test loss look better than it is.
# (To train on everything for a final submission, drop `min_index` here.)
train_dataset = RandomDataset(train_data, min_index=start_indexs[1], dataset_size=65536)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=False
)

test_dataset = RandomDataset(train_data, min_index=0, max_index=start_indexs[1], dataset_size=4096)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False
)

In [None]:
class RNN(torch.nn.Module):
    """Elman-RNN regressor: a sequence of feature vectors -> one value per sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the recurrent hidden state.
    """

    def __init__(self, input_size, hidden_size=48):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # batch_first=True: inputs are laid out (batch, seq_len, feature),
        # which is how a DataLoader stacks per-sample (seq_len, feature) tensors.
        self.rnn = torch.nn.RNN(input_size, hidden_size, batch_first=True)
        self.final_layers = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 1),
        )

    def forward(self, input):
        """Predict one value per sequence.

        Args:
            input: tensor of shape ``(batch, seq_len, input_size)``. A
                ``(batch, 1, seq_len, input_size)`` tensor (singleton channel
                axis added by a transform) is accepted as well.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        if input.dim() == 4 and input.size(1) == 1:
            input = input.squeeze(1)
        output, _ = self.rnn(input)
        # Feed only the final time step to the head. Flattening all steps gives
        # seq_len * hidden_size features, which Linear(hidden_size, 10) rejects.
        output = self.final_layers(output[:, -1, :])
        return output
        

In [None]:
class LSTM(torch.nn.Module):
    """LSTM regressor: a sequence of feature vectors -> one value per sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the recurrent hidden state.
    """

    def __init__(self, input_size, hidden_size=48):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # batch_first=True: inputs are laid out (batch, seq_len, feature),
        # which is how a DataLoader stacks per-sample (seq_len, feature) tensors.
        self.rnn = torch.nn.LSTM(input_size, hidden_size, batch_first=True)
        self.final_layers = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 1),
        )

    def forward(self, input):
        """Predict one value per sequence.

        Args:
            input: tensor of shape ``(batch, seq_len, input_size)``. A
                ``(batch, 1, seq_len, input_size)`` tensor (singleton channel
                axis added by a transform) is accepted as well.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        if input.dim() == 4 and input.size(1) == 1:
            input = input.squeeze(1)
        output, _ = self.rnn(input)
        # Feed only the final time step to the head. Flattening all steps gives
        # seq_len * hidden_size features, which Linear(hidden_size, 10) rejects.
        output = self.final_layers(output[:, -1, :])
        return output
        

In [None]:
class GRU(torch.nn.Module):
    """GRU regressor: a sequence of feature vectors -> one value per sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the recurrent hidden state.
    """

    def __init__(self, input_size, hidden_size=48):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # batch_first=True: inputs are laid out (batch, seq_len, feature),
        # which is how a DataLoader stacks per-sample (seq_len, feature) tensors.
        self.rnn = torch.nn.GRU(input_size, hidden_size, batch_first=True)
        self.final_layers = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 1),
        )

    def forward(self, input):
        """Predict one value per sequence.

        Args:
            input: tensor of shape ``(batch, seq_len, input_size)``. A
                ``(batch, 1, seq_len, input_size)`` tensor (singleton channel
                axis added by a transform) is accepted as well.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        if input.dim() == 4 and input.size(1) == 1:
            input = input.squeeze(1)
        output, _ = self.rnn(input)
        # Feed only the final time step to the head. Flattening all steps gives
        # seq_len * hidden_size features, which Linear(hidden_size, 10) rejects.
        output = self.final_layers(output[:, -1, :])
        return output
        

In [None]:
# Prefer the first GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = GRU(12)  # 12 = number of feature columns produced by features_12
model.to(device)

# Must be named `loss_fn`: that is the name the fit(...) call below passes.
loss_fn = torch.nn.L1Loss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

In [None]:
def train(data_loader, model, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: iterable of ``(sample, target)`` tensor pairs with a length.
        model: module being trained; switched to training mode.
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
        optimizer: optimizer that owns ``model``'s parameters.
        device: device the batches are moved to.

    Returns:
        Mean of the per-batch losses as a Python float.
    """
    model.train()

    total_loss = 0
    for sample, target in data_loader:
        sample, target = sample.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(sample)
        # A (batch, 1) prediction against a (batch,) target would broadcast to
        # (batch, batch) inside the loss and silently compute the wrong value.
        if output.shape != target.shape and output.numel() == target.numel():
            output = output.reshape(target.shape)
        loss = loss_fn(output, target)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

In [None]:
def evaluate(data_loader, model, loss_fn, device):
    """Compute the mean per-batch loss over ``data_loader`` without updating the model.

    Args:
        data_loader: iterable of ``(sample, target)`` tensor pairs with a length.
        model: module being evaluated; switched to evaluation mode.
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
        device: device the batches are moved to.

    Returns:
        Mean of the per-batch losses as a Python float.
    """
    model.eval()

    total_loss = 0
    with torch.no_grad():
        for sample, target in data_loader:
            sample, target = sample.to(device), target.to(device)
            output = model(sample)
            # A (batch, 1) prediction against a (batch,) target would broadcast
            # to (batch, batch) inside the loss and report the wrong value.
            if output.shape != target.shape and output.numel() == target.numel():
                output = output.reshape(target.shape)
            total_loss += loss_fn(output, target).item()

    return total_loss / len(data_loader)

In [None]:
def fit(train_loader, test_loader, model, loss_fn, optimizer, n_epochs, device):
    """Alternate one training pass and one evaluation pass for ``n_epochs`` epochs.

    After every epoch both loss histories collected so far are plotted with
    ``show_curve``. Nothing is returned.
    """
    train_history, test_history = [], []

    for _epoch in range(n_epochs):
        train_history.append(train(train_loader, model, loss_fn, optimizer, device))
        test_history.append(evaluate(test_loader, model, loss_fn, device))

        # Redraw both curves, training first, with the history so far.
        for label, history in (("train_losses", train_history), ("test_losses", test_history)):
            show_curve(history, label)

In [None]:
def show_curve(ys, title):
    """Plot ``ys`` against their positions as a blue line and show the figure.

    ``title`` is inserted into both the figure title and the y-axis label;
    the x-axis is labelled 'Epoch'.
    """
    values = np.array(ys)
    epochs = np.array(range(len(ys)))

    plt.plot(epochs, values, c='b')
    plt.axis()
    plt.ylabel('{} Value'.format(title))
    plt.xlabel('Epoch')
    plt.title('{} Curve:'.format(title))
    plt.show()

In [None]:
# Train and evaluate for n_epochs epochs; fit() plots the loss curves after every epoch.
fit(
    train_loader=train_loader,
    test_loader=test_loader,
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    n_epochs=n_epochs,
    device=device,
)

In [None]:
def solve(model):
    """Predict ``time_to_failure`` for every test segment and write ``submission.csv``.

    Segment ids are read from ``../input/sample_submission.csv``. Each segment
    file ``../input/test/<seg_id>.csv`` is standardised, converted with
    ``to_features`` and passed to ``model`` as a batch of one sequence.

    Args:
        model: trained module that returns a single value for a
            ``(1, n_steps, n_features)`` input.
    """
    submission = pd.read_csv("../input/sample_submission.csv", index_col="seg_id", dtype={ "time_to_failure": np.float32 })
    # Run on whichever device the model lives on instead of relying on a global.
    model_device = next(model.parameters()).device

    model.eval()
    predictions = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for seg_id in submission.index:
            seg = pd.read_csv("../input/test/" + seg_id + ".csv", dtype={ "acoustic_data": np.float32 }).values
            seg = seg[:, 0]

            # normalize
            seg = (seg - seg.mean()) / seg.std()

            features = to_features(seg)
            # Leading axis of size 1 = a batch holding this single sequence.
            batch = torch.as_tensor(features, dtype=torch.float32, device=model_device).unsqueeze(0)
            # .item() turns the one-element output into a plain Python float.
            predictions.append(model(batch).item())

    # Assign the whole column at once. Chained ``submission[col][i] = tensor``
    # writes positionally into a string-indexed Series and may not reach the frame.
    submission['time_to_failure'] = np.asarray(predictions, dtype=np.float32)
    submission.to_csv('submission.csv')

In [None]:
# Write submission.csv with the trained model's predictions.
solve(model=model)