In [17]:
import numpy as np
import pandas as pd

In [18]:
def generate_next(result, lags, coefficients, noise):
    """Compute the next value of a lagged linear recurrence with additive noise.

    For every position ``k`` the value located ``lags[k]`` steps from the end
    of ``result`` is scaled by ``coefficients[k]``; ``noise * np.random.rand()``
    is added to that term and all terms are summed.

    A fresh random number is drawn for every lag, so the global NumPy RNG
    advances ``len(lags)`` times per call.

    Parameters
    ----------
    result : sequence of float
        Values generated so far; indexed from the end.
    lags : sequence of int
        Positive offsets counted back from the end of ``result``.
    coefficients : sequence of float
        One multiplier per entry of ``lags``.
    noise : float
        Scale applied to each uniform ``[0, 1)`` random draw.

    Returns
    -------
    float
        The sum of all noisy lagged terms (``0.0`` when ``lags`` is empty).
    """
    total = 0.0
    for position, lag in enumerate(lags):
        lagged_term = result[-lag] * coefficients[position]
        jitter = noise * np.random.rand()
        total += lagged_term + jitter
    return total

def generate_sequence(lags, coefficients, initial, length, noise):
    """Extend ``initial`` with recurrence values until ``length`` items exist.

    The returned list starts with a copy of ``initial``; each further element
    is produced by ``generate_next`` from the values accumulated so far.  When
    ``length`` does not exceed ``len(initial)`` nothing is appended and the
    whole of ``initial`` is returned (it is not truncated).

    Parameters
    ----------
    lags, coefficients, noise
        Passed through unchanged to ``generate_next``.
    initial : iterable of float
        Seed values for the sequence.
    length : int
        Target number of elements in the output.

    Returns
    -------
    list
        A new list holding the seed values followed by the generated ones.
    """
    sequence = list(initial)
    missing = length - len(initial)
    for _ in range(missing):
        following = generate_next(sequence, lags, coefficients, noise)
        sequence.append(following)
    return sequence

In [19]:
# Build a synthetic multivariate dataset: LAT_DIM latent recurrence sequences
# are mixed by a random matrix F into REAL_DIM observed series.
# Every random draw below comes from the global NumPy RNG, so the order of the
# statements determines the generated numbers.
np.random.seed(0)
LAT_DIM = 2   # number of latent sequences (rows of F)
LENGTH = 200  # number of time steps per sequence
REAL_DIM = 5  # number of observed series (columns of F)
NOISE_COEFFICIENT_LAT = 1e-6  # noise scale passed to generate_sequence
NOISE_COEFFICIENT_Y = 1e-6    # noise scale added to the mixed series Y

# coefficients[k] multiplies the value lags[k] steps back (see generate_next).
# With a = 2*cos(sqrt(2)) - 1 the noise-free recurrence x_t = a*x_{t-1} + a*x_{t-2} - x_{t-3}
# has characteristic polynomial (z + 1)(z^2 - 2*cos(sqrt(2))*z + 1), i.e. all roots
# lie on the unit circle.
lags = [1, 2, 3]
coefficients = [-1.0 + 2.0 * np.cos(np.sqrt(2)), -1.0 + 2 * np.cos(np.sqrt(2)), -1.0]
                

# One latent sequence per latent dimension, each started from len(lags)
# values drawn uniformly from [-1, 1).
lat_sequences = []
for i in range(LAT_DIM):
    initial = np.random.uniform(low = -1.0, high = 1.0, size = len(lags))
    sequence = generate_sequence(lags, coefficients, initial, LENGTH, NOISE_COEFFICIENT_LAT)
    lat_sequences.append(sequence)

# Stack into an array of shape (LAT_DIM, LENGTH).
lat_sequences = np.asarray(lat_sequences) 



# Random mixing matrix of shape (LAT_DIM, REAL_DIM); Y ends up (REAL_DIM, LENGTH).
F = np.random.rand(LAT_DIM, REAL_DIM)
Y = lat_sequences.T.dot(F).T

# Add uniform [0, 1) observation noise scaled by NOISE_COEFFICIENT_Y.
Y = Y + NOISE_COEFFICIENT_Y * np.random.rand(*Y.shape)
# One column per observed series (integer column names 0..REAL_DIM-1) plus an
# integer 'timestamps' column 0..LENGTH-1.
dic = {}
for i in range(REAL_DIM):
    dic[i] = Y[i, :]
dic['timestamps'] = np.arange(Y.shape[1])

dataset = pd.DataFrame(dic)
# Reorder so that 'timestamps' is the first column.
columns_except_timestamps = [column for column in dataset.columns if column != 'timestamps']
dataset = dataset[['timestamps'] + columns_except_timestamps]
print(dataset.shape)
dataset.head()

(200, 6)


Unnamed: 0,timestamps,0,1,2,3,4
0,0,-0.053928,-0.063949,-0.139517,-0.172362,-0.085158
1,1,0.767399,0.751425,0.909421,1.057764,0.404478
2,2,0.088875,0.073456,0.013269,0.003153,-0.02223
3,3,-0.535282,-0.503658,-0.495393,-0.557664,-0.177869
4,4,-0.460217,-0.455394,-0.577661,-0.676193,-0.266785


In [20]:
import math
import os
def launch(dir_for_tmp_files, dataset, id, num_to_test = 50, steps = 100, verbose = False, eval = False,
           eta = 1, lambdaX = 1, lambdaW = 1, lambdaF = 1, standardize = True, lat_dim = 2,
           lags = [1, 5, 10, 20, 25, 100]):
    """Run the external ``./main`` solver on ``dataset`` and return its log lines.

    The frame is written to ``<dir_for_tmp_files>/current_launch<id>.csv``
    (``;``-separated, without the index), ``./main`` is executed from the
    current working directory with the options below, and the log file the
    solver is told to write is read back.

    Parameters
    ----------
    dir_for_tmp_files : str
        Existing directory that receives the dataset CSV and all solver outputs.
    dataset : pandas.DataFrame
        Must contain a ``'timestamps'`` column or, failing that, ``'timestamp'``.
    id : any
        Converted with ``str`` and used as a suffix of every file name.
        (Shadows the builtin; the name is kept for caller compatibility.)
    num_to_test : int
        The train range ends and the test range starts at
        ``timestamps[-num_to_test]``.
    steps, eta, lambdaX, lambdaW, lambdaF, lat_dim
        Forwarded verbatim as ``--steps``, ``--eta``, ``--lambdaX``,
        ``--lambdaW``, ``--lambdaF`` and ``--lat_dim``.
    verbose, eval, standardize : bool
        Forwarded as ``1``/``0`` flags.  (``eval`` shadows the builtin; the
        name is kept for caller compatibility.)
    lags : iterable of int
        Forwarded as the values of ``--lags``.  The default list is never
        mutated.

    Returns
    -------
    list of str
        The lines of ``<dir_for_tmp_files>/logs_file_<id>``, newlines included.

    Raises
    ------
    KeyError
        If the frame has neither timestamp column.
    OSError
        If ``./main`` cannot be started or the log file cannot be opened.

    Notes
    -----
    A non-zero exit status of the solver is reported with a ``RuntimeWarning``
    instead of an exception, so the (possibly stale) log file is still read.
    """
    # Function-scope imports keep this definition usable on its own.
    import subprocess
    import warnings

    dataset_path = dir_for_tmp_files + '/' + 'current_launch' + str(id) + '.csv'
    dataset.to_csv(dataset_path, sep = ';', index = False)

    # `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
    time_column = 'timestamps' if 'timestamps' in dataset.columns else 'timestamp'
    timestamps = dataset[time_column].values

    # NOTE(review): train_end and test_start are taken from the same row, and
    # num_to_test == 0 makes both point at the first row.  Whether ./main
    # treats these bounds as inclusive is not visible here -- confirm.
    train_start = math.ceil(timestamps[0])
    train_end = math.floor(timestamps[-num_to_test])
    test_start = math.ceil(timestamps[-num_to_test])
    test_end = math.floor(timestamps[-1])

    logs_path = dir_for_tmp_files + '/' + 'logs_file_' + str(id)

    # The command is an argument list, not a shell string, so paths containing
    # spaces or shell metacharacters reach the solver unchanged.
    command = [
        './main',
        '--dataset_path', dataset_path,
        '--separator', ';',
        '--train_start', str(train_start),
        '--train_end', str(train_end),
        '--test_start', str(test_start),
        '--test_end', str(test_end),
        '--steps', str(steps),
        '--verbose', '1' if verbose else '0',
        '--eval', '1' if eval else '0',
        '--predictions_out', dir_for_tmp_files + '/predictions_' + str(id) + '.csv',
        '--eta', str(eta),
        '--lambdaX', str(lambdaX),
        '--lambdaW', str(lambdaW),
        '--lambdaF', str(lambdaF),
        '--standardize', '1' if standardize else '0',
    ]
    command += ['--lags'] + [str(lag) for lag in lags]
    command += [
        '--lat_dim', str(lat_dim),
        '--factor_out', dir_for_tmp_files + '/factor_out_' + str(id),
        '--logs_file', logs_path,
    ]

    return_code = subprocess.run(command).returncode
    if return_code != 0:
        warnings.warn(
            './main exited with status ' + str(return_code)
            + '; the contents of ' + logs_path + ' may be missing or stale',
            RuntimeWarning,
        )

    with open(logs_path, 'r') as f:
        return f.readlines()

In [23]:
# Fit the solver on the synthetic dataset, using the same lags that generated
# it; the test range starts at the 10th-from-last timestamp.
logs = launch(
    dir_for_tmp_files = 'tmp',
    dataset = dataset,
    id = 1,
    num_to_test = 10,
    steps = 100,
    verbose = True,
    eval = True,
    eta = 0.01,
    lambdaX = 1,
    lambdaW = 1e-8,
    lambdaF = 1e-8,
    standardize = True,
    lat_dim = 2,
    lags = [1, 2, 3],
)

In [24]:
# Compare the recurrence coefficients used to generate the data with the ones
# the solver wrote to its W output file (one row of the printout per latent
# dimension after transposing).
print("real coefficients: ")
print(np.asarray(coefficients))
W_restored = pd.read_csv('tmp/factor_out_1_W.csv', header = None)
print("restored coefficients: ")
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
print(W_restored.values.T)

real coefficients: 
[-0.68811261 -0.68811261 -1.        ]
restored coefficients: 
[[-0.68781104 -0.68794362 -0.99966855]
 [-0.68712377 -0.68815798 -0.99898182]]


In [29]:
# Compare the mixing matrix used to generate the data with the one the solver
# wrote to its F output file.
print("real F: ")
print(np.asarray(F))
# NOTE(review): unlike the W file, this one is read without `header = None`,
# so its first line is consumed as column names -- confirm that the solver
# really writes a header row for F.
F_restored = pd.read_csv('tmp/factor_out_1_F.csv')
print("restored F: ")
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
print(F_restored.values)

real F: 
[[0.92797617 0.86686091 0.81615075 0.91145088 0.27633715]
 [0.36952354 0.3798939  0.56045059 0.66821823 0.28671668]]
restored F: 
[[-5.55674486 -4.00195229  3.60816067  4.72042094 10.39494811]
 [ 7.9743076   7.56093467  5.47900965  5.16657706  3.53981488]]
