In [5]:
import autograd.numpy as np
from autograd import grad
import pandas as pd
import sys
sys.path.append("../code/")
from basic import*
%matplotlib inline

### load the dataset

In [2]:
# The .npy file stores a pickled Python object (hence the `.item()` call below);
# NumPy >= 1.16.3 refuses to unpickle unless `allow_pickle=True` is passed.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
d = np.load("../data/data-2013-2017-missing.npy", allow_pickle=True).item()

# Sorted, de-duplicated ids present under both the 2015 and the 2016 keys.
# NOTE(review): the tensor built later in this notebook reads d[2016] and
# d[2017], not d[2015] -- confirm which pair of years is actually intended.
use_homes = np.intersect1d(list(d[2016].keys()), list(d[2015].keys()))

In [3]:
# Display the ids selected by the intersection above.
use_homes

array([  26,   59,   77,   86,   93,   94,  101,  114,  187,  434,  484,
        499,  503,  545,  624,  744,  781,  871,  946,  974, 1086, 1103,
       1169, 1192, 1202, 1283, 1403, 1415, 1463, 1500, 1507, 1589, 1617,
       1632, 1642, 1697, 1700, 1718, 1790, 1800, 1953, 2018, 2094, 2129,
       2156, 2171, 2199, 2233, 2365, 2378, 2472, 2532, 2557, 2575, 2638,
       2755, 2787, 2814, 2818, 2829, 2859, 2945, 2953, 2965, 3009, 3039,
       3044, 3134, 3268, 3310, 3367, 3392, 3456, 3482, 3500, 3527, 3538,
       3577, 3649, 3778, 3829, 3831, 3849, 3916, 3918, 3935, 3967, 4031,
       4154, 4213, 4220, 4297, 4298, 4336, 4342, 4352, 4357, 4373, 4375,
       4473, 4514, 4641, 4767, 4874, 4946, 4957, 4998, 5026, 5060, 5129,
       5218, 5275, 5317, 5357, 5403, 5545, 5568, 5677, 5718, 5785, 5809,
       5814, 5874, 5889, 5921, 5949, 5959, 5972, 6061, 6063, 6121, 6139,
       6165, 6348, 6412, 6423, 6460, 6498, 6691, 6692, 6730, 6990, 7016,
       7030, 7361, 7390, 7429, 7504, 7536, 7560, 76

In [6]:
# Order of the entries along the appliance axis of the tensor built below.
# Index 0 ('use') is treated differently from the rest throughout the notebook:
# slices of the form `[:, 1:, :]` select everything except it.
APPLIANCE_ORDER = [
    'use',
    'air1',
    'dishwasher1',
    'furnace1',
    'kitchenapp1',
    'microwave1',
    'refrigerator1',
]

### creating the tensor data

In [7]:
# Build a (home, appliance, month) tensor: month slots 0-11 are filled from
# d[2016] and slots 12-23 from d[2017]. Anything that cannot be filled stays NaN.
t = np.empty((len(use_homes), len(APPLIANCE_ORDER), 2*12))
t[:] = np.nan  # the `np.NaN` alias was removed in NumPy 2.0
for i, home in enumerate(use_homes):
    for j, appliance in enumerate(APPLIANCE_ORDER):
        try:
            t[i, j, :12] = d[2016][home][appliance].values
            # Only reached when the 2016 assignment succeeded.
            t[i, j, 12:] = d[2017][home][appliance].values
        except Exception:
            # Best effort: a failed lookup/assignment leaves the remaining
            # slots as NaN. `Exception` (rather than a bare `except:`) lets
            # KeyboardInterrupt / SystemExit propagate so the cell can be stopped.
            pass
t.shape

(214, 7, 24)

### STF routine

In [22]:
def factorise(tensor, r, random_seed=0, num_iter=400, eps=1e-8, lr=1):
    """Factorise a 3-way tensor into three non-negative rank-`r` factor matrices.

    The reconstruction is `einsum('Hr, Ar, Tr ->HAT', home, appliance, month)`
    and the objective is the RMSE over the entries of `tensor` that are not NaN.
    Gradients come from `multigrad`; each factor is updated with Adagrad-style
    per-element step sizes and then clipped back to the non-negative orthant.

    Parameters
    ----------
    tensor : 3-d array; NaN marks entries excluded from the cost.
    r : number of latent factors (columns of each factor matrix).
    random_seed : seed for the random initialisation of the factors.
    num_iter : number of gradient steps.
    eps : constant added to the squared-gradient accumulator at every step.
    lr : base learning rate.

    Returns
    -------
    (home, appliance, month) : arrays of shape (tensor.shape[k], r), k = 0, 1, 2.
    """
    # A private generator yields the same initial values as
    # `np.random.seed(random_seed)` followed by `np.random.rand`, but does not
    # reseed the global RNG that other cells draw from.
    rng = np.random.RandomState(random_seed)

    def cost(tensor, home, appliance,  month):
        pred = np.einsum('Hr, Ar, Tr ->HAT', home, appliance, month)
        mask = ~np.isnan(tensor)
        error = (pred - tensor)[mask].flatten()
        return np.sqrt((error ** 2).mean())

    # Differentiate w.r.t. the three factor arguments (positions 1, 2, 3).
    mg = multigrad(cost, argnums=[1, 2, 3])

    # Drawn in tensor-axis order: home, appliance, month.
    factors = [rng.rand(dim, r) for dim in tensor.shape]
    grad_sq_sums = [np.zeros_like(factor) for factor in factors]

    for _ in range(num_iter):
        # All gradients are evaluated at the current point before any update.
        gradients = mg(tensor, *factors)
        for factor, gradient, grad_sq_sum in zip(factors, gradients, grad_sq_sums):
            grad_sq_sum += eps + np.square(gradient)
            factor -= np.divide(lr, np.sqrt(grad_sq_sum)) * gradient
            # Projection to non-negative space
            factor[factor < 0] = 1e-8

    home, appliance, month = factors
    return home, appliance, month

### Creating the different subsets of data

In [9]:
# Split points along the home axis of `t`.
N_TRAIN = 10         # homes [0, 10) form the initial training set
N_TRAIN_POOL = 170   # homes [0, 170) are train + pool; the rest are test

# Test is every home from N_TRAIN_POOL onwards
test_set = t[N_TRAIN_POOL:].copy()
# In test set, we don't know the appliance data (only index 0, 'use', is kept)
test_set[:, 1:, :] = np.nan  # the `np.NaN` alias was removed in NumPy 2.0
# Train + Pool is the remaining set
train_pool_set = t[:N_TRAIN_POOL].copy()
# Initial train set is the first N_TRAIN homes
train_set = t[:N_TRAIN].copy()
# Initial pool set is everything between train and test
pool_set = t[N_TRAIN:N_TRAIN_POOL].copy()
pool_homes = use_homes[N_TRAIN:N_TRAIN_POOL]
# Train + Test set (useful for factorisation)
train_test_set = np.vstack([train_set, test_set])

### Creating the set of (home, appliance) pairs for each month that can contain data and thus can be used for acquiring labels

In [10]:
# p_m_h[month] lists every (home, appliance) pair from the pool whose entry in
# `t` is finite for that month index, i.e. the pairs a label can be acquired for.
# The row of each pool home in `t` does not depend on the month or appliance,
# so it is looked up once instead of inside the innermost loop.
pool_home_indices = np.searchsorted(use_homes, pool_homes)

p_m_h = {}
for month in range(12, 24):
    p_m_h[month] = []
    for home, pool_home_index in zip(pool_homes, pool_home_indices):
        # Start at 1 to skip APPLIANCE_ORDER[0].
        for appliance_num, appliance in enumerate(APPLIANCE_ORDER[1:], start=1):
            if np.isfinite(t[pool_home_index, appliance_num, month]):
                p_m_h[month].append((home, appliance))

In [11]:
# Peek at the first five candidate (home, appliance) pairs for month index 13.
p_m_h[13][:5]

[(499, 'air1'),
 (499, 'furnace1'),
 (499, 'kitchenapp1'),
 (499, 'refrigerator1'),
 (503, 'air1')]

### Randomly adding 5 (home, appliance) pairs every month from the pool

In [None]:
# Random-acquisition baseline: every month, draw a few (home, appliance) pairs
# from the pool, reveal their readings from that month onwards, refit the
# factorisation and record predictions for the test homes.
N_ACQUIRE = 5  # pairs acquired per month
# Private, seeded generator so the baseline is reproducible on a fresh kernel.
acquire_rng = np.random.RandomState(0)

pool_use = np.empty((0,) + t.shape[1:])
# Copy each per-month list: a plain dict.copy() shares the lists, so the
# removals below would silently edit `p_m_h` and break re-running this cell.
pool_set_iter = {m: list(pairs) for m, pairs in p_m_h.items()}
pred_df_random = {}
pool_remove_ix = {}
pool_remove = {}
gt_df = {}

for cur_iteration, month in enumerate(range(12, 24)):

    # Randomly choose pairs to remove from the pool and put in the training set.
    # Sampling is without replacement so the same pair is never acquired twice.
    candidates = pool_set_iter[month]
    n_pick = min(N_ACQUIRE, len(candidates))
    if n_pick:
        pool_remove_ix[cur_iteration] = acquire_rng.choice(len(candidates), size=n_pick, replace=False)
    else:
        pool_remove_ix[cur_iteration] = np.array([], dtype=int)
    pool_remove[month] = [candidates[x] for x in pool_remove_ix[cur_iteration]]

    # Remove these from future months pool too
    for month_f in range(month+1, 24):
        for x in pool_remove[month]:
            if x in pool_set_iter[month_f]:
                pool_set_iter[month_f].remove(x)

    # Getting data from `t` to create the pool_use_set
    pool_remove_homes = [x[0] for x in pool_remove[month]]
    pool_remove_homes_ix = np.searchsorted(use_homes, pool_remove_homes)

    cur_pool = t[pool_remove_homes_ix, :, :].copy()
    # Hide every appliance reading (past and future); index 0 stays visible ...
    cur_pool[:, 1:, :] = np.nan  # the `np.NaN` alias was removed in NumPy 2.0
    # ... then reveal only the chosen appliance, from the current month onwards.
    for q, home_ix in enumerate(pool_remove_homes_ix):
        appliance_num = APPLIANCE_ORDER.index(pool_remove[month][q][1])
        cur_pool[q, appliance_num, month:] = t[home_ix, appliance_num, month:]

    pool_use = np.concatenate([pool_use, cur_pool])

    # Row layout: acquired pool rows, then the train homes, then the test homes.
    pool_train_test = np.concatenate([pool_use, train_test_set[:, :, :] ])

    print(pool_train_test.shape)
    h, a, m = factorise(pool_train_test[:, :, :month], 3)
    print(cur_iteration, month, len(pool_train_test), pool_remove[month])

    pred = np.einsum('Hr, Ar, Tr ->HAT', h, a, m)
    # Keep the test rows only, appliances 1.., for the last fitted month (month-1).
    pred_df_random[month] = pred[len(train_set)+len(pool_use):,1:, month-1:month].reshape(-1, len(APPLIANCE_ORDER)-1)
    gt_df[month] = t[170:][:, 1:, month-1:month].reshape(len(test_set), len(APPLIANCE_ORDER)-1)

(59, 7, 24)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 12) (59, 3) (7, 3) (12, 3)
(59, 7, 

### active learning

In [None]:
# Active acquisition: every month, rank the pool's (home, appliance) pairs by
# how much the prediction varies across factorisation ranks (scaled by the
# appliance's share of `use` in the training homes) and acquire the top ones.
N_QUERY = 10          # pairs acquired per month
# NOTE(review): the random baseline above acquires 5 pairs per month -- confirm
# whether the two strategies are meant to use the same budget.
RANKS = range(2, 8)   # numbers of latent factors compared against each other

pool_use = np.empty((0,) + t.shape[1:])
gt_df = {}
pred_df_active = {}
pool_remove_active = {}
pool_set_active = {}
all_seen = []

# Pool homes with every appliance reading hidden, stacked on top of the train
# homes. Neither input changes inside the loop, so this is built once.
psc = pool_set.copy()
psc[:, 1:, :] = np.nan  # the `np.NaN` alias was removed in NumPy 2.0
pool_train = np.concatenate([psc, train_set[:, :, :] ])

for cur_iteration, month in enumerate(range(12, 24)):

    # Mean of each appliance relative to index 0 ('use') over the train homes
    # for this month; keyed by appliance index 1..6.
    month_mean = pd.DataFrame(train_set[:, :, month]).mean()
    month_contri = (month_mean/month_mean[0]).drop(0).to_dict()

    # Starting with current data and finding the variance amongst different #Latent factors
    pred_pool = {}
    for r in RANKS:
        print(month, r)
        h, a, m = factorise(pool_train[:, :, :month], r)
        pred_pool[r] = np.einsum('Hr, Ar, Tr ->HAT', h, a, m)[:len(pool_set)]

    # Finding the top-most `n` highly variance homes
    # measure = std/contri
    temp = []
    for appliance_num, appliance in enumerate(APPLIANCE_ORDER[1:]):
        y = pd.DataFrame({r:pd.Series(pred_pool[r][:, appliance_num+1, -1])for r in RANKS}).std(axis=1)/month_contri[appliance_num+1]
        y.index = [(x, appliance) for x in pool_homes]
        temp.append(y)

    # Skip pairs acquired in earlier months, then take the highest scores.
    pool_remove_active[month] = pd.concat(temp).sort_values(ascending=False).drop(all_seen).head(N_QUERY).index.tolist()

    all_seen.extend(pool_remove_active[month])
    all_seen = list(set(all_seen))

    # Getting data from `t` to create the pool_use_set
    pool_remove_homes_active = [x[0] for x in pool_remove_active[month]]
    pool_remove_homes_active_ix = np.searchsorted(use_homes, pool_remove_homes_active)

    cur_pool = t[pool_remove_homes_active_ix, :, :].copy()
    # Hide every appliance reading (past and future); index 0 stays visible ...
    cur_pool[:, 1:, :] = np.nan
    # ... then reveal only the chosen appliance, from the current month onwards.
    # This must iterate over the *active* selection: the previous code reused
    # `pool_remove_homes_ix` / `pool_remove` left over from the random cell, so
    # it revealed data for the wrong homes and appliances.
    for q, home_ix in enumerate(pool_remove_homes_active_ix):
        appliance_num = APPLIANCE_ORDER.index(pool_remove_active[month][q][1])
        cur_pool[q, appliance_num, month:] = t[home_ix, appliance_num, month:]

    pool_use = np.concatenate([pool_use, cur_pool])

    # Row layout: acquired pool rows, then the train homes, then the test homes.
    pool_train_test = np.concatenate([pool_use, train_test_set[:, :, :] ])

    h, a, m = factorise(pool_train_test[:, :, :month], 3)
    print(cur_iteration, month, len(pool_train_test), pool_remove_active[month])

    pred = np.einsum('Hr, Ar, Tr ->HAT', h, a, m)
    # Keep the test rows only, appliances 1.., for the last fitted month (month-1).
    pred_df_active[month] = pred[len(train_set)+len(pool_use):,1:, month-1:month].reshape(-1, len(APPLIANCE_ORDER)-1)
    gt_df[month] = t[170:][:, 1:, month-1:month].reshape(len(test_set), len(APPLIANCE_ORDER)-1)

In [None]:
def _mean_abs_error(predicted, truth):
    """Column-wise mean absolute difference between two 2-d arrays."""
    difference = pd.DataFrame(predicted) - pd.DataFrame(truth)
    return difference.abs().mean()


# One Series of per-appliance errors for every month key, for each strategy.
random_error = {
    month_key: _mean_abs_error(pred_df_random[month_key], gt_df[month_key])
    for month_key in range(12, 24)
}
active_error = {
    month_key: _mean_abs_error(pred_df_active[month_key], gt_df[month_key])
    for month_key in range(12, 24)
}


In [None]:
# Random-baseline errors as a table: one column per month key.
pd.DataFrame(random_error)

In [None]:
# Active-learning errors as a table: one column per month key.
pd.DataFrame(active_error)

In [None]:
# Rows of the error tables follow APPLIANCE_ORDER[1:] (the predictions were
# sliced with `1:` on the appliance axis), so row 5 is the last appliance.
APPLIANCE_ROW = 5
appliance_label = APPLIANCE_ORDER[1:][APPLIANCE_ROW]

# Descriptive names instead of `re` / `ac`; `re` is also the name of a
# standard-library module and is best not reused for data.
random_mae = pd.DataFrame(random_error).iloc[APPLIANCE_ROW]
active_mae = pd.DataFrame(active_error).iloc[APPLIANCE_ROW]

ax = pd.DataFrame({"Random":random_mae, "Active":active_mae}).plot(kind='bar')
ax.set_title("Mean absolute error on test homes: " + appliance_label)
ax.set_xlabel("Month index")
ax.set_ylabel("Mean absolute error");