# Sidekick - Mixture of Least Squares
We train a mixture of least squares models, experimenting with different numbers of components.

In [1]:
%matplotlib inline
from __future__ import print_function
import os
import sys
# Put the sibling directories first on the module search path so that the
# project-local imports further down (`dataset`, `model`) can be resolved.
# NOTE(review): which of the two directories holds which module is not visible here.
sys.path.insert(0, os.path.abspath('../utils/')) # Add sibling to Python path
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling to Python path
# One-off flush of whatever is currently buffered on stdout; this call does not
# change how output produced by later cells is buffered.
sys.stdout.flush() # Print output on the fly in Notebook
import matplotlib
# Notebook-wide plotting defaults: figure size is (width, height) in inches,
# font sizes are in points.
matplotlib.rcParams['figure.figsize'] = (18,8)
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['legend.fontsize'] = 16
from IPython.display import display
import numpy as np
import pickle as cp
import matplotlib.pyplot as plt
from math import floor
from dataset import Sidekick
from model import LeastSquaresMixture

# Directory holding the Sidekick data, relative to this notebook.
# NOTE(review): not referenced by any cell visible here -- confirm it is still needed.
DATA_DIR = "../data/sidekick"

def subsample(t0, t1, n_samples):
    """Return regularly spaced integer indices taken from the window [t0, t1).

    Args:
        t0: First index of the window (inclusive).
        t1: End of the window (exclusive).
        n_samples: Upper bound on the number of indices to return.

    Returns:
        A ``range`` starting at ``t0``. If the window holds no more than
        ``n_samples`` indices, every index is returned. Otherwise the stride is
        ``ceil((t1 - t0) / n_samples)``; because the stride is rounded up, the
        result may contain fewer than ``n_samples`` indices (e.g. a window of
        10 with ``n_samples=6`` yields 5 indices).
    """
    span = t1 - t0
    if span <= n_samples:
        # Window is small enough: keep every index.
        return range(t0, t1)
    # Round the stride up so that at most n_samples indices are produced.
    stride = int(np.ceil(span / float(n_samples)))
    return range(t0, t1, stride)

## Load and split data

In [2]:
# Load the Sidekick projects. light=True selects the reduced data set
# (the loader reports 1000 data points for it).
sk = Sidekick()
sk.load(light=True)
# split() is called with its defaults; with the light data set it yields
# 800 training and 200 test projects.
# NOTE(review): whether the split is shuffled or seeded is decided inside
# dataset.Sidekick and is not visible here -- confirm.
projects_train, projects_test = sk.split()

Loading light data set (1000 data points)...
Data loaded.


## Data processing

In [3]:
# N, N_train and seed are only needed by the (commented-out) code below that was
# used to generate the light data set; they are kept so it can be re-enabled.
N = 1000
N_train = int(floor(0.8*N))
seed = 2
t0 = 1           # first time index used as a feature (inclusive)
t1 = 500         # end of the feature window (exclusive)
n_samples = 20   # upper bound on the number of time indices used as features
T = 999          # time index of the money value used as regression target

# Time indices at which each project's money series is observed.
samples = subsample(t0, t1, n_samples)
t = len(samples)  # actual number of features; can be smaller than n_samples

# How the light data set was originally produced (kept for provenance):
#N_projects = sk.choose_n_projects(n=N, seed=seed)
#projects_train = N_projects[:N_train]
#projects_test = N_projects[N_train:]

#with open("../data/sidekick/light.pkl", 'wb') as f:
#    cp.dump(N_projects, f)

# Build the (n_projects, t) feature matrices. np.array(..., dtype=float) converts
# the values; the low-level np.ndarray(buffer=...) constructor would instead
# reinterpret the raw bytes and is only correct if the data is already float64.
# The reshape doubles as a shape check and keeps the 2-D shape for an empty list.
X_train = np.array([p.money[samples] for p in projects_train], dtype=float).reshape(len(projects_train), t)
y_train = np.expand_dims(np.array([p.money[T] for p in projects_train]), axis=1)
X_test = np.array([p.money[samples] for p in projects_test], dtype=float).reshape(len(projects_test), t)
y_test = np.expand_dims(np.array([p.money[T] for p in projects_test]), axis=1)

# Required to contain the prediction in a reasonable range
# The problem arises when evaluating the likelihood in the expression for gamma_nk
# Each feature column is scaled by its maximum over the training set. X_test is
# left unscaled here and must be divided by X_max before prediction.
X_max = np.max(X_train, axis=0)
# A zero (or negative) column maximum would silently turn the features into NaN/inf.
assert np.all(X_max > 0), "Cannot normalise: a feature column has a non-positive maximum"
X_train = X_train / X_max[np.newaxis, :]

print("Training on %s projects" % len(X_train))
print("Testing on %s projects" % len(X_test))
# Report the real feature count rather than the requested upper bound.
print("Number of features: %s" % X_train.shape[1])

Training on 800 projects
Testing on 200 projects
Number of features: 20


## Train Mixture of Least Squares

In [4]:
K = 5                  # number of mixture components
beta = 0.01            # NOTE(review): presumably the initial value of beta (the trained model reports a different one) -- confirm
epsilon = 1e-6         # NOTE(review): presumably the convergence tolerance -- confirm against model.LeastSquaresMixture
lam = 0.001            # NOTE(review): presumably the regularisation strength -- confirm
iterations = 1000      # maximum number of iterations per restart
random_restarts = 50   # number of random restarts

# Training uses random restarts, so fix the seed to make a Restart & Run All
# reproducible. `seed` is defined in the data-processing cell.
# NOTE(review): this assumes LeastSquaresMixture draws from NumPy's global RNG;
# if it uses another source, seed that one instead.
np.random.seed(seed)

mls = LeastSquaresMixture(X_train, y_train,
                          K=K, beta=beta, lam=lam,
                          iterations=iterations, epsilon=epsilon, random_restarts=random_restarts)
mls.train()
print("Model trained!")

Random restarts: [####                ] 22% (11/50)Hitting maximum iteration (1000)
Random restarts: [####################] 100% (50/50) Elapsed time: 0:03:39
Model trained!


## Display model

In [5]:
# Print the trained model's text summary (component count, likelihood, beta,
# lambda, mixing proportions pi and the weight matrix).
print(mls)

Model: LeastSquaresMixture (5 components)
Likelihood: -161.65608041889175
Beta: 29.137008038537427
Lambda: 0.001
Pi: [ 0.26382687  0.0835386   0.29267189  0.28788951  0.07207314]
Weights: (norm: [32.47141286143782, 16.549558806540496, 21.239780959951947, 34.87394442639949, 69.994670438545711])
[[  8.30961614e-02   4.17253962e-02   2.52860526e-02   8.65289912e-02
    8.35397806e-01]
 [ -7.23018243e-01  -3.12065858e-01   4.09074077e-01   2.72456557e+00
   -1.32699462e+01]
 [  1.48684391e+00   3.73473770e+00   1.20360753e+00  -3.01448461e+00
   -2.51438713e+01]
 [  2.21417434e+00   5.12511170e+00  -5.24762571e+00  -7.22596211e+00
   -2.18720047e+01]
 [  3.84926250e+00  -5.12234716e+00   4.67558814e-01  -6.72877621e+00
    2.65309706e+01]
 [ -1.01266337e+01   8.74715706e-02   5.77996109e+00   6.56889110e+00
    1.07631782e+01]
 [ -7.59951932e+00  -2.30378768e+00  -2.72168855e+00   6.20535920e+00
    2.36097482e+01]
 [ -4.55086279e+00  -4.72384623e+00  -2.46970295e+00   1.09423238e+01
    1

## Evaluation

In [6]:
# Scale the test features with the training-set column maxima (the same
# normalisation that was applied to X_train), then print the prediction for
# every test project next to its actual target value.
for x_scaled, y_actual in zip(X_test / X_max, y_test):
    # predict() returns a pair; only its first element is reported.
    y_pred, _ = mls.predict(list(x_scaled))
    print("Predicted: %s | Actual: %s" % (y_pred, y_actual))

Predicted: [ 0.36901265  0.3135137   0.27347273  0.37581619  1.17694726] | Actual: [ 1.0474]
Predicted: [ 0.10394068  0.05431004  0.03855352  0.12919134  0.87999036] | Actual: [ 0.01470588]
Predicted: [ 0.08309894  0.04172883  0.02531982  0.08659874  0.83555403] | Actual: [ 0.00041509]
Predicted: [ 0.24957133  0.17464034  0.14246885  0.24610418  1.0852868 ] | Actual: [ 0.12817187]
Predicted: [ 0.88901233  0.73081186  0.64656881  0.74395229  1.1544704 ] | Actual: [ 1.45708539]
Predicted: [ 10.41309868   8.85986577   8.37083317  12.43378643  12.41555004] | Actual: [ 11.01256378]
Predicted: [ 0.55420379  0.41081472  0.37251791  0.53890227  1.34125732] | Actual: [ 0.3639]
Predicted: [ 0.60906759  0.50041445  0.46496642  0.6697778   1.86081907] | Actual: [ 1.44375]
Predicted: [ 0.75050393  0.44411956  0.38923028  0.29387239  0.82764125] | Actual: [ 1.007]
Predicted: [ 2.44755672  2.17154014  2.11190667  3.94863048  6.92903291] | Actual: [ 1.37]
Predicted: [ 0.08309616  0.0417254   0.0252860