# Convert NASA data to Orion format

In this notebook we download the data from the Telemanom S3 bucket and reformat it
as Orion pipelines expect.

## Download the data

In [9]:
import io
import os
import urllib
import urllib.request
import zipfile

DATA_URL = 'https://s3-us-west-2.amazonaws.com/telemanom/data.zip'

# Download and unpack only once: an existing `data` path is treated as a cache.
# NOTE(review): presumably the archive unpacks into a top-level `data/` folder,
# which is what makes this guard effective on re-runs -- confirm.
if not os.path.exists('data'):
    # Use the response as a context manager so the HTTP connection is closed
    # deterministically instead of being left to the garbage collector.
    with urllib.request.urlopen(DATA_URL) as response:
        bytes_io = io.BytesIO(response.read())

    # ZipFile needs a seekable file object, hence the in-memory buffer.
    with zipfile.ZipFile(bytes_io) as zf:
        zf.extractall()

In [10]:
# os.listdir returns entries in arbitrary order, so sort both listings to make
# the equality check below and the later processing order deterministic.
train_signals = sorted(os.listdir('data/train'))
test_signals = sorted(os.listdir('data/test'))

In [11]:
# Check that both directories list exactly the same file names, in the same order.
train_signals == test_signals

True

## Convert the NPY matrices to CSVs

We convert the NPY matrices to CSV files with two columns: `timestamp` and `value`.

To do this, we load both the train and test matrices for each signal
and concatenate them to generate a single matrix for each signal.

Afterwards, we add a timestamp column by assigning the value 1222819200 (2008-10-01T00:00:00 UTC)
to the first row and then increasing the timestamp by 21600 seconds (6h) for each subsequent row.

In [12]:
import pandas as pd
import numpy as np

In [13]:
def build_df(data, start=0, interval=21600, first_timestamp=1222819200):
    """Build a ``timestamp``/``value`` DataFrame from a 2-D signal matrix.

    Args:
        data (numpy.ndarray):
            2-D matrix with one row per observation. Only column 0 is kept
            as the signal value.
        start (int):
            Position on the timeline of the first row of ``data``. Use it to
            continue the timestamps of a previous chunk. Defaults to 0.
        interval (int):
            Seconds between consecutive rows. Defaults to 21600 (6h).
        first_timestamp (int):
            Unix timestamp assigned to position 0. Defaults to 1222819200
            (2008-10-01T00:00:00 UTC).

    Returns:
        pandas.DataFrame:
            One row per row of ``data`` with an integer ``timestamp`` column
            and a ``value`` column.
    """
    # Force int64: on some platforms NumPy's default integer is 32-bit, and
    # the timestamp arithmetic below would overflow past 2**31 - 1.
    index = np.arange(start, start + len(data), dtype=np.int64)
    timestamp = index * interval + first_timestamp

    return pd.DataFrame({'timestamp': timestamp, 'value': data[:, 0]})

# Try the conversion on a single training signal before processing every file.
data = build_df(np.load('data/train/S-1.npy'))

In [20]:
# Load the matrix once and reuse it instead of reading the file twice.
s1_train_np = np.load('data/train/S-1.npy')
print(s1_train_np)
# Shape of the first column only, i.e. the part that becomes the `value` column.
print(s1_train_np[:, 0].shape)

[[-0.36635895  0.          0.         ...  0.          0.
   0.        ]
 [-0.39410778  0.          0.         ...  0.          0.
   0.        ]
 [ 0.4036246   0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.34135706  0.          0.         ...  0.          0.
   0.        ]
 [-0.39254644  0.          0.         ...  0.          0.
   0.        ]
 [ 1.          0.          0.         ...  0.          0.
   0.        ]]
(2818,)


In [23]:
# Star-unpacking splits the matrix into its first row and a list of the remaining rows.
head, *tail = np.load('data/train/S-1.npy')
# Show only the first few remaining rows; printing the whole list dumps
# thousands of arrays into the notebook output.
print(tail[:3])

[array([-0.39410778,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ]), array([0.4036246, 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       ]), array([-0.36275906,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0. 

In [14]:
# Preview the first rows: timestamps should start at 1222819200 and step by 21600.
data.head()

Unnamed: 0,timestamp,value
0,1222819200,-0.366359
1,1222840800,-0.394108
2,1222862400,0.403625
3,1222884000,-0.362759
4,1222905600,-0.370746


## Store the results as CSV

In [7]:
# Create the output directory; exist_ok=True keeps re-runs from failing.
os.makedirs('csv', exist_ok=True)

In [8]:
# For every signal write three CSVs: the full series, the train part and the
# test part, all sharing one continuous timeline.
for signal in train_signals:
    # Split off the extension properly instead of blindly dropping 4 characters,
    # and skip stray non-NPY entries (e.g. hidden files) that os.listdir may return.
    name, extension = os.path.splitext(signal)
    if extension != '.npy':
        continue

    train_np = np.load(os.path.join('data', 'train', signal))
    test_np = np.load(os.path.join('data', 'test', signal))

    # Full signal: train rows followed by test rows.
    data = build_df(np.concatenate([train_np, test_np]))
    data.to_csv(os.path.join('csv', name + '.csv'), index=False)

    train = build_df(train_np)
    train.to_csv(os.path.join('csv', name + '-train.csv'), index=False)

    # Offset the test rows so their timestamps continue right after the last train row.
    test = build_df(test_np, start=len(train))
    test.to_csv(os.path.join('csv', name + '-test.csv'), index=False)

In [9]:
# Read the combined S-1 CSV back from disk to verify what was written.
s1 = pd.read_csv('csv/S-1.csv')

In [10]:
# The combined file starts with the train rows, so this should match the train preview.
s1.head()

Unnamed: 0,timestamp,value
0,1222819200,-0.366359
1,1222840800,-0.394108
2,1222862400,0.403625
3,1222884000,-0.362759
4,1222905600,-0.370746


In [11]:
# Row count should equal train rows + test rows (2818 + 7331 = 10149 for S-1).
s1.shape

(10149, 2)

In [12]:
# Read the train-only S-1 CSV back from disk.
s1_train = pd.read_csv('csv/S-1-train.csv')

In [13]:
# First train rows; the timeline starts at 1222819200.
s1_train.head()

Unnamed: 0,timestamp,value
0,1222819200,-0.366359
1,1222840800,-0.394108
2,1222862400,0.403625
3,1222884000,-0.362759
4,1222905600,-0.370746


In [14]:
# Last train rows; the final timestamp should sit one 21600 s step before the first test timestamp.
s1_train.tail()

Unnamed: 0,timestamp,value
2813,1283580000,-0.365308
2814,1283601600,1.0
2815,1283623200,-0.341357
2816,1283644800,-0.392546
2817,1283666400,1.0


In [15]:
# Row count should match the raw train matrix (2818 rows for S-1).
s1_train.shape

(2818, 2)

In [16]:
# Read the test-only S-1 CSV back from disk.
s1_test = pd.read_csv('csv/S-1-test.csv')

In [17]:
# First test rows; timestamps continue the train timeline because of the start=len(train) offset.
s1_test.head()

Unnamed: 0,timestamp,value
0,1283688000,0.028893
1,1283709600,0.40304
2,1283731200,-0.366585
3,1283752800,-0.36971
4,1283774400,-0.386455


In [18]:
# Together with the train row count this should add up to the combined file's row count.
s1_test.shape

(7331, 2)