# Sidekick - Mixture of Least Squares
We train a mixture of least-squares models on the Sidekick data set, experimenting with different numbers of components.

In [1]:
%matplotlib inline
from __future__ import print_function
import os
import sys
sys.path.insert(0, os.path.abspath('../utils/')) # Add sibling to Python path
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling to Python path
sys.stdout.flush() # Print output on the fly in Notebook
import matplotlib
matplotlib.rcParams['figure.figsize'] = (18,8)
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['legend.fontsize'] = 16
from IPython.display import display
import numpy as np
import pickle as cp
import matplotlib.pyplot as plt
from math import floor
from dataset import Sidekick
from model import LeastSquaresMixture

DATA_DIR = "../data/sidekick"

def subsample(t0, t1, n_samples):
    """Pick at most ``n_samples`` evenly spaced integer indices from ``[t0, t1)``.

    The indices start at ``t0`` and advance by a constant integer stride of
    ``ceil((t1 - t0) / n_samples)``. Because the stride is rounded up, the
    result can contain fewer than ``n_samples`` indices (e.g. 100 steps with
    ``n_samples=60`` gives a stride of 2 and therefore 50 indices).

    Args:
        t0: First index (inclusive).
        t1: Last index (exclusive).
        n_samples: Upper bound on the number of indices to return.

    Returns:
        The result of ``range(...)`` over the selected indices: every index
        in ``[t0, t1)`` when ``n_samples`` covers the whole interval, and an
        empty sequence when ``n_samples`` is not positive.
    """
    if n_samples <= 0:
        # Nothing was requested; returning early also avoids dividing by
        # zero (or building a zero stride) further down.
        return range(t0, t0)
    t = t1 - t0
    if n_samples >= t:
        return range(t0, t1)
    # Integer ceiling division, without a round trip through floats.
    stride = int(-(-t // n_samples))
    return range(t0, t1, stride)

## Load and split data

In [2]:
# Load the reduced ("light") version of the Sidekick data set and split it
# into the training and test projects used by every cell below.
sk = Sidekick()
sk.load(light=True)
# NOTE(review): split() is called with its defaults; the train/test ratio and
# any shuffling or seeding are decided inside dataset.py — confirm there.
projects_train, projects_test = sk.split()

Loading light data set (1000 data points)...
Data loaded.


## Data processing

In [3]:
# --- Experiment configuration ---------------------------------------------
# N, N_train and seed are only referenced by the commented-out resampling
# code further down in this cell.
N = 1000
N_train = int(floor(0.8*N))
seed = 2
t0 = 1            # first time index used as a feature (inclusive)
t1 = 800          # last time index used as a feature (exclusive)
n_samples = 50    # requested number of time indices between t0 and t1
T = 999           # time index of `money` used as the regression target

# Time indices at which each project's `money` series is observed.
samples = subsample(t0, t1, n_samples)
# Actual number of features; subsample() may return fewer than n_samples.
t = len(samples)

# NOTE(review): the commented-out lines below appear to be how the light data
# set was originally drawn and saved — kept for provenance, confirm before
# deleting.
#N_projects = sk.choose_n_projects(n=N, seed=seed)
#projects_train = N_projects[:N_train]
#projects_test = N_projects[N_train:]

#with open("../data/sidekick/light.pkl", 'wb') as f:
#    cp.dump(N_projects, f)

# Build the design matrices with np.array(..., dtype=float), which converts
# the values to float. The previous np.ndarray(buffer=...) construction
# reinterpreted the raw bytes of the source array, which is only correct when
# that array already happens to be float64. The reshape keeps the explicit
# (n_projects, t) shape check of the original code.
X_train = np.array([p.money[samples] for p in projects_train], dtype=float).reshape(len(projects_train), t)
y_train = np.expand_dims(np.array([p.money[T] for p in projects_train]), axis=1)
X_test = np.array([p.money[samples] for p in projects_test], dtype=float).reshape(len(projects_test), t)
y_test = np.expand_dims(np.array([p.money[T] for p in projects_test]), axis=1)

# Required to contain the prediction in a reasonable range
# The problem arises when evaluating the likelihood in the expression for gamma_nk
X_max = np.max(X_train, axis=0)
# A feature whose training maximum is zero would turn the division below into
# 0/0 (NaN) and break any later division by X_max; leave such columns unscaled.
X_max[X_max == 0] = 1.0
# Only the training matrix is rescaled in this cell; X_test is left untouched.
X_train = X_train / X_max[np.newaxis, :]

print("Training on %s projects" % len(X_train))
print("Testing on %s projects" % len(X_test))
# Report the real column count rather than the requested n_samples.
print("Number of features: %s" % t)

Training on 800 projects
Testing on 200 projects
Number of features: 50


## Train Mixture of Least Squares

In [4]:
# Hyper-parameters of the mixture of least squares.
# NOTE(review): the meanings given for beta, lam and epsilon are assumptions
# based on their names — confirm against LeastSquaresMixture.train in model.py.
K = 3                   # number of mixture components
iterations = 100        # iteration budget passed to train()
random_restarts = 100   # number of random restarts passed to train()
epsilon = 1e-6          # presumably a convergence tolerance
beta = 0.01             # presumably the initial value of the model's beta
lam = 0.1               # presumably a regularisation strength

# NOTE(review): no random seed is set in this cell, so the random restarts
# may not be reproducible between runs — verify how model.py draws its
# initialisations.
mls = LeastSquaresMixture(X_train, y_train, K=K)
mls.train(
    beta=beta,
    lam=lam,
    iterations=iterations,
    epsilon=epsilon,
    random_restarts=random_restarts,
)
print("Model trained!")

Random restarts: [####################] 100% (100/100) Elapsed time: 0:01:17
Model trained!


## Visualization

In [5]:
# Print the fitted quantities of the trained model, one after another:
# the marginal likelihood, beta, pi and w attributes of `mls`.
for field in ("marginal_likelihood", "beta", "pi", "w"):
    print(getattr(mls, field))

-138.4352124984955
13.860490010179065
[ 0.02726923  0.96288334  0.00984744]
[[  1.68119163e-01   9.28654316e-02   5.46441732e+00]
 [  2.76368143e-02  -2.14177700e-01  -1.02451511e+00]
 [  5.62872065e-01   3.18771466e-01  -1.18602283e+00]
 [  4.29117375e-01   1.85107055e-01  -9.04453861e-01]
 [  3.94353086e-01   1.42204097e-01  -7.70272084e-01]
 [  3.93827659e-01  -3.75849607e-01   1.36905167e-01]
 [  4.37901554e-01  -3.62653922e-01  -1.05472997e-01]
 [  5.71929352e-01   2.72935602e-01  -6.63451388e-02]
 [  5.51978136e-01   2.11562294e-02  -4.08782651e-01]
 [  5.76930341e-01   2.44193822e-03   2.41439902e-01]
 [  5.98004220e-01  -9.06415311e-02   5.02275314e-01]
 [  5.57306327e-01   1.77282826e-01   6.33750778e-01]
 [  5.89058702e-01  -5.70118281e-02   3.96723253e-01]
 [  5.84058837e-01  -2.50432600e-01   5.51688079e-01]
 [  5.60719426e-01  -1.05537027e+00   5.32647254e-01]
 [  5.82593204e-01  -8.72134539e-01   1.22382170e-01]
 [  4.41193129e-01   6.84079929e-02  -9.71695055e-02]
 [  4.

## Evaluation

In [13]:
# Walk over the test projects, scale each feature vector with the training
# maxima (the training matrix was scaled the same way) and print the model's
# prediction next to the true target.
for i in range(len(X_test)):
    x_new = X_test[i] / X_max
    # predict() returns a pair; only its first element is reported here.
    y_new, _ = mls.predict(list(x_new))
    print("Predicted: %s | Actual: %s" % (y_new, y_test[i]))

Predicted: [ 0.62737724  1.02078797  6.10354412] | Actual: [ 1.0474]
Predicted: [ 0.06809869  0.71237923  5.73474671] | Actual: [ 0.01470588]
Predicted: [ 0.05232462  0.68517098  5.7180194 ] | Actual: [ 0.00041509]
Predicted: [ 0.19182241  0.83250293  5.83535253] | Actual: [ 0.12817187]
Predicted: [ 1.0201619   1.48000254  6.42072641] | Actual: [ 1.45708539]
Predicted: [  8.95543047  12.31929952  12.9719632 ] | Actual: [ 11.01256378]
Predicted: [ 0.45602537  1.13968912  6.02289436] | Actual: [ 0.3639]
Predicted: [ 1.29319801  1.50306064  6.49241181] | Actual: [ 1.44375]
Predicted: [ 0.42208534  0.97131498  6.05520356] | Actual: [ 1.007]
Predicted: [ 1.96570288  2.53267677  7.33142061] | Actual: [ 1.37]
Predicted: [ 0.05172586  0.68497706  5.71761605] | Actual: [ 0.]
Predicted: [ 0.22918665  0.91957172  5.8821902 ] | Actual: [ 0.14552632]
Predicted: [ 0.50006202  1.36661405  5.99213813] | Actual: [ 1.07073714]
Predicted: [ 0.65410612  1.43811728  6.21496433] | Actual: [ 1.03548387]
Pred