# Sidekick - Mixture of Least Squares
We train a mixture of least squares model, experimenting with different numbers of components.

In [1]:
%matplotlib inline
import os
import sys
# `dataset` and `model` (imported further down) are project-local modules, not
# installed packages, so the sibling directories are put at the front of the
# import path. NOTE(review): which directory holds which module is not visible
# from this notebook — confirm against the repository layout.
sys.path.insert(0, os.path.abspath('../utils/')) # Add sibling directory ../utils/ to the Python path
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling directory ../src/ to the Python path
sys.stdout.flush() # Flush stdout once, here. NOTE(review): intended to show output on the fly, but a single flush does not affect later prints
import matplotlib
# Notebook-wide plotting defaults: figure size and font sizes.
matplotlib.rcParams['figure.figsize'] = (18,8)
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['legend.fontsize'] = 16
from IPython.display import display
import numpy as np
import pickle as cp # In the visible cells `cp` is only referenced by commented-out code
import matplotlib.pyplot as plt
from math import floor
from dataset import Sidekick
from model import LeastSquaresMixture

# Location of the Sidekick data files (not referenced by the visible cells).
DATA_DIR = "../data/sidekick"

def subsample(t0, t1, n_samples):
    """Return at most ``n_samples`` evenly spaced integer indices in ``[t0, t1)``.

    The indices start at ``t0`` and advance by a constant stride equal to
    ``ceil((t1 - t0) / n_samples)``. Because the stride is rounded up, the
    result can contain fewer than ``n_samples`` indices, but never more.

    Parameters
    ----------
    t0 : int
        First index (inclusive).
    t1 : int
        Last index (exclusive).
    n_samples : int
        Maximum number of indices to return.

    Returns
    -------
    range
        ``range(t0, t1)`` when the interval holds no more than ``n_samples``
        indices (this includes an empty interval), otherwise a strided range.

    Raises
    ------
    ValueError
        If the interval is non-empty and ``n_samples`` is not positive.
    """
    span = t1 - t0
    if n_samples >= span:
        # Nothing to thin out: the whole interval already fits.
        return range(t0, t1)
    if n_samples <= 0:
        # Without this check a zero count divides by zero and a negative count
        # produces an empty range or a cryptic "step must not be zero" error.
        raise ValueError("n_samples must be a positive integer, got %r" % (n_samples,))
    # Ceiling division using floor division on the negated span; int() keeps the
    # stride an integer even if n_samples was passed as a float.
    stride = int(-(-span // n_samples))
    return range(t0, t1, stride)

## Load and split data

In [2]:
# Create the project-local Sidekick dataset wrapper (imported from `dataset`).
sk = Sidekick()
# Load the "light" variant of the data set; the recorded output of this cell
# reports 1000 data points.
sk.load(light=True)
# Split the loaded projects into a training and a test collection. The split
# rule lives in Sidekick.split() and is not visible here; the recorded run
# later reports 800 training and 200 test projects.
projects_train, projects_test = sk.split()

Loading light data set (1000 data points)...
Data loaded.


## Data processing

In [15]:
# N, N_train and seed are only used by the commented-out re-sampling code below.
N = 1000
N_train = int(floor(0.8*N))
seed = 2
t0 = 1           # first time index used as a feature (inclusive)
t1 = 800         # last time index used as a feature (exclusive)
n_samples = 50   # maximum number of time indices kept between t0 and t1
T = 999          # time index of the `money` value used as regression target

# Time indices at which each project's `money` series is read.
samples = subsample(t0, t1, n_samples)
t = len(samples)

#N_projects = sk.choose_n_projects(n=N, seed=seed)
#projects_train = N_projects[:N_train]
#projects_test = N_projects[N_train:]

#with open("../data/sidekick/light.pkl", 'wb') as f:
#    cp.dump(N_projects, f)

# Build the (n_projects, t) design matrices and (n_projects, 1) targets.
# np.array(..., dtype=float) converts the values; the previous
# np.ndarray(buffer=...) form reinterpreted the raw bytes as float64, which is
# only correct when the source data already is contiguous float64.
# The reshape keeps the explicit shape check the old constructor provided.
X_train = np.array([p.money[samples] for p in projects_train], dtype=float).reshape(len(projects_train), t)
y_train = np.expand_dims(np.array([p.money[T] for p in projects_train]), axis=1)
X_test = np.array([p.money[samples] for p in projects_test], dtype=float).reshape(len(projects_test), t)
y_test = np.expand_dims(np.array([p.money[T] for p in projects_test]), axis=1)

# Required to contain the prediction in a reasonable range
# The problem arises when evaluating the likelihood in the expression for gamma_nk
X_max = np.max(X_train, axis=0)
# A feature whose training maximum is 0 would turn the division into 0/0 = NaN
# (or x/0 = inf for test rows); scale such columns by 1 instead.
X_max[X_max == 0] = 1.0
X_train = X_train / X_max[np.newaxis, :]

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Training on %s projects" % len(X_train))
print("Testing on %s projects" % len(X_test))
# Report the real column count: subsample() may return fewer than n_samples indices.
print("Number of features: %s" % X_train.shape[1])

Training on 800 projects
Testing on 200 projects
Number of features: 50


## Train Mixture of Least Squares

In [12]:
# --- Model size ---
K = 3                  # number of mixture components, passed to the constructor

# --- Optimisation settings, all forwarded to train() by keyword ---
iterations = 100       # the recorded log counts 0..99 per restart, so this looks like a per-restart cap
random_restarts = 10   # NOTE(review): presumably independent re-initialisations; confirm in model.py
epsilon = 1e-6         # NOTE(review): presumably a convergence tolerance; confirm in model.py
beta = 0.01            # NOTE(review): presumably an initial value, since mls.beta differs after training
lam = 0.1              # NOTE(review): presumably a regularisation strength; confirm in model.py

# Fit the mixture on the scaled training features and the raw targets.
mls = LeastSquaresMixture(X_train, y_train, K=K)
mls.train(
    beta=beta,
    lam=lam,
    iterations=iterations,
    epsilon=epsilon,
    random_restarts=random_restarts,
)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Improved solution!
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Improved solution!
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


## Visualization

In [13]:
# Show the fitted parameters of the mixture model. Single-argument print(...)
# produces the same output on Python 2 and also runs on Python 3.
print(mls.beta)  # scalar in the recorded output
print(mls.pi)    # K values summing to 1 in the recorded output; presumably the mixing weights
print(mls.w)     # matrix with K columns in the recorded output; presumably one weight vector per component

21.4481909698
[ 0.91472513  0.0075      0.07777487]
[[ 0.05230827  5.71761604  0.68591947]
 [-0.25623272 -1.9801125   0.26092939]
 [ 0.56060672 -0.1781158   0.38694178]
 [-0.26526315  0.12083942  0.54883646]
 [ 0.29854169 -0.43308424  0.65753377]
 [-0.23970882 -0.26636136  0.43409235]
 [-0.09692515 -0.30545504  0.39386725]
 [ 0.15493572 -0.447968    0.5421059 ]
 [-0.05845014 -0.76700078  0.60397715]
 [-0.13877209 -0.25151622  0.58303529]
 [-0.23094685  0.13994951  0.55270921]
 [ 0.07084116  0.33290771  0.50715741]
 [ 0.0955462   0.19316133  0.47781605]
 [-0.01902633  0.34074852  0.43084139]
 [-0.8874569   0.4519902   0.48565974]
 [-0.96959991  0.12993388  0.52532706]
 [-0.03012968 -0.08643593  0.47075741]
 [-0.45111608 -0.23115251  0.53274621]
 [-0.54823066 -0.18724983  0.49855703]
 [-0.43510983 -0.0563338   0.49702731]
 [-0.20452385 -0.14764118  0.4511237 ]
 [-0.08307296 -0.1059424   0.47502477]
 [ 0.05893736 -0.07839556  0.48063715]
 [-0.15992458 -0.19892507  0.50151257]
 [-0.1869539

## Evaluation

In [14]:
# Print the model's prediction next to the true target for every test project.
# The test features are divided by X_max, the per-feature scale computed from
# the training data, once for the whole matrix instead of once per row.
for x_new, y_true in zip(X_test / X_max, y_test):
    # predict() receives a plain list; its second return value is not used here.
    # The recorded output shows K values per prediction, presumably one per component.
    y_new, _ = mls.predict(list(x_new))
    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print("Predicted: %s | Actual: %s" % (y_new, y_true))

Predicted: [ 0.62809113  6.10354412  1.02163308] | Actual: [ 1.0474]
Predicted: [ 0.0686812   5.73474671  0.71332779] | Actual: [ 0.01470588]
Predicted: [ 0.05290713  5.71801939  0.68611314] | Actual: [ 0.00041509]
Predicted: [ 0.19239827  5.83535252  0.83345835] | Actual: [ 0.12817187]
Predicted: [ 1.02082389  6.42072641  1.48098366] | Actual: [ 1.45708539]
Predicted: [  8.95615977  12.97196321  12.32201468] | Actual: [ 11.01256378]
Predicted: [ 0.45662503  6.02289435  1.14067979] | Actual: [ 0.3639]
Predicted: [ 1.29433661  6.49241181  1.50356512] | Actual: [ 1.44375]
Predicted: [ 0.42272841  6.05520355  0.97236765] | Actual: [ 1.007]
Predicted: [ 1.96585738  7.33142061  2.5339013 ] | Actual: [ 1.37]
Predicted: [ 0.05230827  5.71761604  0.68591947] | Actual: [ 0.]
Predicted: [ 0.22976991  5.8821902   0.92057683] | Actual: [ 0.14552632]
Predicted: [ 0.50061533  5.99213813  1.36782603] | Actual: [ 1.07073714]
Predicted: [ 0.65483228  6.21496433  1.43918136] | Actual: [ 1.03548387]
Pred