# Sidekick - Mixture of Least Squares
We train a mixture of least-squares models, experimenting with different numbers of components.

In [1]:
%matplotlib inline
# Notebook setup: imports, module search path, plotting defaults and data location.
from __future__ import print_function
import os
import sys
# Prepend the sibling directories to sys.path (paths are resolved relative to the
# current working directory). Presumably this is what makes the project-local
# `dataset` and `model` modules imported below resolvable -- TODO confirm.
sys.path.insert(0, os.path.abspath('../utils/')) # Add sibling to Python path
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling to Python path
# NOTE(review): this flushes stdout once, right here; a single call does not by
# itself make later output stream "on the fly".
sys.stdout.flush() # One-off flush of any buffered output
import matplotlib
# Global plotting defaults applied to every figure created in this notebook.
matplotlib.rcParams['figure.figsize'] = (18,8)
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['legend.fontsize'] = 16
from IPython.display import display
import numpy as np
# In the visible cells, `cp` is only referenced by commented-out code.
import pickle as cp
import matplotlib.pyplot as plt
from math import floor
# Project-local modules (not part of the standard library or NumPy).
from dataset import Sidekick
from model import LeastSquaresMixture
from collections import Counter

# Relative path to the Sidekick data directory; not referenced by the visible cells.
DATA_DIR = "../data/sidekick"

def subsample(t0, t1, n_samples):
    """Return at most ``n_samples`` evenly strided integer indices from ``[t0, t1)``.

    Parameters
    ----------
    t0 : int
        First index of the window (inclusive).
    t1 : int
        End of the window (exclusive).
    n_samples : int
        Maximum number of indices to return.

    Returns
    -------
    range
        ``range(t0, t1)`` when the window holds no more than ``n_samples``
        indices; otherwise ``range(t0, t1, stride)`` with
        ``stride = ceil((t1 - t0) / n_samples)``. Because the stride is rounded
        up, the result can contain *fewer* than ``n_samples`` indices, so callers
        should use ``len()`` of the result rather than ``n_samples``.
        An empty range is returned when ``n_samples <= 0`` or the window is empty.
    """
    span = t1 - t0
    # Requesting no samples yields no samples; this also avoids a division by
    # zero and a zero or negative stride in the computation below.
    if n_samples <= 0:
        return range(t0, t0)
    # The window is small enough to be returned in full.
    if n_samples >= span:
        return range(t0, t1)
    # float() keeps the division exact under Python 2 integer-division rules.
    stride = int(np.ceil(span / float(n_samples)))
    return range(t0, t1, stride)

## Load and split data

In [2]:
# Load the light version of the Sidekick data set (the recorded output reports
# 1000 data points) and split it into training and test projects.
sk = Sidekick()
sk.load(light=True)
# NOTE(review): split() is called with its defaults; the split ratio and whether it
# is randomised are not visible here (a later cell reports 800 / 200 projects).
projects_train, projects_test = sk.split()

Loading light data set (1000 data points)...
Data loaded.


## Data processing

In [7]:
# --- Configuration -----------------------------------------------------------
N = 1000                     # only used by the commented-out data-set generation below
N_train = int(floor(0.8*N))  # only used by the commented-out data-set generation below
seed = 2                     # only used by the commented-out data-set generation below
t0 = 1                       # first index of p.money used as a feature (inclusive)
t1 = 500                     # end of the feature window (exclusive)
n_samples = 250              # maximum number of indices taken from [t0, t1)
T = 999                      # index of p.money used as the regression target

samples = subsample(t0, t1, n_samples)
# Actual number of features; subsample() may return fewer than n_samples indices.
t = len(samples)

# Kept for provenance: how a fixed subset of projects was chosen and pickled.
#N_projects = sk.choose_n_projects(n=N, seed=seed)
#projects_train = N_projects[:N_train]
#projects_test = N_projects[N_train:]

#with open("../data/sidekick/light.pkl", 'wb') as f:
#    cp.dump(N_projects, f)

# --- Feature / target matrices -----------------------------------------------
# np.array(..., dtype=float) converts the values to float64. The previous
# np.ndarray(buffer=...) form reinterpreted the raw bytes of the buffer instead,
# which is only correct when the source data already is float64.
# The reshape pins the expected (n_projects, t) layout and fails loudly otherwise.
X_train = np.array([p.money[samples] for p in projects_train], dtype=float).reshape(len(projects_train), t)
y_train = np.expand_dims(np.array([p.money[T] for p in projects_train]), axis=1)
X_test = np.array([p.money[samples] for p in projects_test], dtype=float).reshape(len(projects_test), t)
y_test = np.expand_dims(np.array([p.money[T] for p in projects_test]), axis=1)

# Required to contain the prediction in a reasonable range
# The problem arises when evaluating the likelihood in the expression for gamma_nk
X_max = np.max(X_train, axis=0)
# A feature whose training maximum is 0 would produce NaN/inf when divided;
# dividing by 1 instead leaves such a column unchanged.
X_max[X_max == 0] = 1.0
X_train = X_train / X_max[np.newaxis, :]

print("Training on %s projects" % len(X_train))
print("Testing on %s projects" % len(X_test))
# Report the number of features actually extracted, not the number requested.
print("Number of features: %s" % t)

Training on 800 projects
Testing on 200 projects
Number of features: 250


## Train Mixture of Least Squares

In [8]:
# Hyper-parameters for the mixture (names kept at notebook level for later cells).
K, beta, lam = 5, 0.01, 0.01
epsilon = 1e-6
iterations, random_restarts = 1000, 20

# Collect the keyword arguments in one place so the constructor call stays short.
hyperparams = dict(
    K=K,
    beta=beta,
    lam=lam,
    iterations=iterations,
    epsilon=epsilon,
    random_restarts=random_restarts,
)

# Fit on the (already normalised) training data; the recorded run took several minutes.
mls = LeastSquaresMixture(X_train, y_train, **hyperparams)
mls.train()
print("Model trained!")

Random restarts (14/20): [##############      ] 70%Hitting maximum iteration (1000)
Random restarts (20/20): [####################] 100% Elapsed time: 0:07:41
Model trained!


## Display model

In [9]:
# Print the model's string summary (in the recorded output: likelihood, beta,
# lambda, mixing proportions pi and the norm of each component's weights).
print(mls)

Model: LeastSquaresMixture (5 components)
Likelihood: -50.1495934604906
Beta: 52.448182377211225
Lambda: 0.01
Pi: [ 0.10002485  0.24596787  0.17418854  0.25829067  0.22152806]
Weights: (norm: [20.725831366611295, 10.475542640297597, 5.7169088767106757, 9.2497470750125679, 6.8729804510566126])



## Evaluation

In [10]:
# Apply the same preprocessing as training: divide each feature by the maximum
# computed on the training set. The result is bound to a new name so that
# re-running this cell does not rescale the test data a second time.
X_test_scaled = X_test / X_max[np.newaxis, :]
# Per the recorded output, evaluate() returns a (RMSE, accuracy, Counter) tuple,
# which is displayed as the cell's result.
mls.evaluate(X_test_scaled, y_test, verbose=True)

Evaluating model LeastSquaresMixture (5 components)...
Data point (200/200): [####################] 100% Elapsed time: 0:00:05
Accuracy: 0.8
RMSE    : 12.8334579643
Chosen  : Counter({3: 103, 4: 37, 1: 32, 0: 21, 2: 7})


(12.833457964256112, 0.8, Counter({0: 21, 1: 32, 2: 7, 3: 103, 4: 37}))