# Sidekick - Mixture of Least Squares
We train a mixture of least squares models, experimenting with different numbers of components.

In [1]:
%matplotlib inline
from __future__ import print_function
import os
import sys
sys.path.insert(0, os.path.abspath('../utils/')) # Add sibling to Python path
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling to Python path
sys.stdout.flush() # Print output on the fly in Notebook
import matplotlib
matplotlib.rcParams['figure.figsize'] = (16,14)
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['legend.fontsize'] = 16
from IPython.display import display
import numpy as np
import pickle as cp
import matplotlib.pyplot as plt
from dataset import Sidekick
from model import LeastSquaresMixture
from math import floor

# Relative path to the Sidekick data directory.
# NOTE(review): no cell visible in this notebook reads DATA_DIR; Sidekick() is
# constructed without arguments, so it presumably locates its data on its own — confirm.
DATA_DIR = "../data/sidekick"

def subsample(t, granularity):
    """Pick evenly spaced integer time indices between 1 and ``t``.

    Parameters
    ----------
    t : int
        Last time index to include in the sample.
    granularity : float
        Fraction, in ]0, 1], of the ``t`` time steps to keep. The number of
        samples is ``ceil(granularity * t)``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of time indices. When only one sample is requested
        the array contains just ``t``; otherwise it runs from 1 to ``t``
        inclusive.

    Raises
    ------
    ValueError
        If ``granularity`` is outside ]0, 1].
    """
    if granularity > 1.0 or granularity <= 0:
        raise ValueError("granularity must be in ]0, 1]")
    t0 = 1
    n_samples = int(np.ceil(granularity * t))
    if n_samples == 1:
        # Return an integer array (not a bare list) so both branches hand the
        # caller the same type for indexing and further NumPy operations.
        return np.array([t], dtype=int)
    return np.linspace(t0, t, n_samples, dtype=int)

## Load and split data

In [2]:
# Load the Sidekick data set (load() prints progress messages).
sk = Sidekick()
sk.load()
# Partition the projects into a training and a test list.
# NOTE(review): threshold=0.7 presumably is the training fraction (the data-processing
# cell reports ~70% training projects) — confirm against Sidekick.split.
projects_train, projects_test = sk.split(threshold=0.7)
# Project count before any outlier filtering; used later to report the share removed.
total = len(projects_train) + len(projects_test)

Loading data set...
Data loaded.


## Data processing

In [3]:
seed = 2
t = 25             # Last time index that may be used as a feature
granularity = 0.1  # Fraction of those t time steps actually sampled
T = 999            # Index into p.money whose value is the regression target

samples = subsample(t, granularity)
# NOTE: from here on `t` is the number of sampled time steps (= number of features),
# no longer the last observed time index.
t = len(samples)

# Remove outliers: ignore projects whose pledged money (final, or at any sampled
# time step) is more than `outlier_threshold` times their goal.
outlier_threshold = 2


def _is_inlier(p):
    """Return True when neither p.money[T] nor any p.money[samples] exceeds the threshold."""
    return bool(np.all(p.money[T] <= outlier_threshold)
                and np.all(p.money[samples] <= outlier_threshold))


def _features_and_targets(projects):
    """Build the (n_projects, t) feature matrix and the (n_projects, 1) target column."""
    # np.array(..., dtype=float) converts the values; the low-level
    # np.ndarray(buffer=...) constructor would instead reinterpret raw bytes and is
    # only correct when the source data already is float64.
    # reshape keeps the (0, t) shape when `projects` is empty.
    X = np.array([p.money[samples] for p in projects], dtype=float).reshape(len(projects), t)
    y = np.expand_dims(np.array([p.money[T] for p in projects]), axis=1)
    return X, y


projects_train_filtered = [p for p in projects_train if _is_inlier(p)]
# To evaluate on the unfiltered test set instead, use: projects_test_filtered = projects_test
projects_test_filtered = [p for p in projects_test if _is_inlier(p)]

X_train, y_train = _features_and_targets(projects_train_filtered)
X_test, y_test = _features_and_targets(projects_test_filtered)

# Required to contain the prediction in a reasonable range
# The problem arises when evaluating the likelihood in the expression for gamma_nk
#X_max = np.max(X_train, axis=0)
#X_train = X_train / X_max[np.newaxis, :]
# Apply same preprocessing to testing set
#X_test = X_test / X_max[np.newaxis, :]

total_filtered = len(X_train) + len(X_test)
# Multiply by 100.0 first so the ratios are computed in floating point even where `/`
# on two ints truncates (only print_function is imported from __future__).
print("Removed %0.2f%% outliers" % (100.0 - 100.0 * total_filtered / total))
print("Training on %s projects (%0.2f%%)" % (len(X_train), 100.0 * len(X_train) / total_filtered))
print("Testing on %s projects (%0.2f%%)" % (len(X_test), 100.0 * len(X_test) / total_filtered))
print("Number of features: %s" % t)

Removed 7.11% outliers
Training on 10343 projects (70.04%)
Testing on 4425 projects (29.96%)
Number of features: 3


## Training

### Simple Training

In [34]:
K = 3                           # Number of mixture components
beta = 1 / np.var(y_train) / K  # Initial beta, shared by all components (updated during training)
epsilon = 1                     # Stop once the objective improves by less than this
lam = 0                         # Lambda passed to the model
iterations = 25                 # Upper bound on the number of EM iterations
random_restarts = None

# Training starts from randomly drawn weights, so fix the seed to make this cell
# reproducible on re-run.
# NOTE(review): assumes LeastSquaresMixture draws from NumPy's global RNG — confirm.
np.random.seed(seed)

mls1 = LeastSquaresMixture(X_train, y_train,
                          K=K, beta=beta, lam=lam,
                          iterations=iterations, epsilon=epsilon, random_restarts=random_restarts)
mls1.train(verbose=True)

print(mls1)

* Starting EM algorithm for mixture of K=3 least squares models
* Beta = [ 1.  1.  1.]
* Lambda = 0
* Running at most 25 iterations
* Stopping when complete likelihood improves less than 1
       Obj       pi1       pi2       w11       w12       w21       w22     beta1     beta2
      -inf      0.33      0.33      0.64      0.84      0.34      0.26      1.00      1.00
  -1439.76      0.33      0.33      0.43      0.14      0.36      0.11      1.22      1.22
   -382.31      0.33      0.33      0.42      0.14      0.40      0.14      1.48      1.48
    315.52      0.33      0.33      0.41      0.15      0.41      0.16      1.49      1.48
    323.74      0.33      0.33      0.41      0.16      0.41      0.18      1.49      1.48
    324.33      0.33      0.33      0.41      0.17      0.41      0.21      1.49      1.48
Model:        LeastSquaresMixture (3 components)
Likelihood:   324.33057832812347
Beta:         [ 1.4877509   1.48042175  1.49007769]
Lambda:       0
Pi:           [ 0.333707

In [25]:
K = 5                           # Number of mixture components
beta = 1 / np.var(y_train) / K  # Initial beta, shared by all components (updated during training)
#beta = 0.0001
epsilon = 1e0                   # Stop once the objective improves by less than this
lam = 0                         # Lambda passed to the model
iterations = 25                 # Upper bound on the number of EM iterations
random_restarts = None

# Training starts from randomly drawn weights, so fix the seed to make this cell
# reproducible on re-run.
# NOTE(review): assumes LeastSquaresMixture draws from NumPy's global RNG — confirm.
np.random.seed(seed)

mls2 = LeastSquaresMixture(X_train, y_train,
                          K=K, beta=beta, lam=lam,
                          iterations=iterations, epsilon=epsilon, random_restarts=random_restarts)
mls2.train(verbose=True)

print(mls2)

* Starting EM algorithm for mixture of K=5 least squares models
* Beta = [ 0.61506604  0.61506604  0.61506604  0.61506604  0.61506604]
* Lambda = 0
* Running at most 25 iterations
* Stopping when complete likelihood improves less than 1.0
       Obj       pi1       pi2       w11       w12       w21       w22     beta1     beta2
      -inf      0.20      0.20      0.50      0.01      0.98      0.45      0.62      0.62
  -3591.33      0.20      0.19      0.41      0.09      0.49      0.07      0.63      0.60
  -3233.79      0.21      0.19      0.41      0.10      0.42      0.10      0.92      0.85
  -1646.26      0.21      0.19      0.41      0.10      0.42      0.13      0.94      0.84
  -1617.75      0.21      0.18      0.41      0.09      0.42      0.15      0.95      0.82
  -1607.03      0.22      0.18      0.41      0.07      0.42      0.18      0.98      0.79
  -1586.98      0.23      0.17      0.41      0.05      0.42      0.21      1.01      0.75
  -1548.81      0.24      0.16   

In [28]:
K = 10                          # Number of mixture components
beta = 1 / np.var(y_train) / K  # Initial beta, shared by all components (updated during training)
#beta = 0.0001
epsilon = 1e0                   # Stop once the objective improves by less than this
lam = 0                         # Lambda passed to the model
iterations = 25                 # Upper bound on the number of EM iterations
random_restarts = None

# Training starts from randomly drawn weights, so fix the seed to make this cell
# reproducible on re-run.
# NOTE(review): assumes LeastSquaresMixture draws from NumPy's global RNG — confirm.
np.random.seed(seed)

mls3 = LeastSquaresMixture(X_train, y_train,
                          K=K, beta=beta, lam=lam,
                          iterations=iterations, epsilon=epsilon, random_restarts=random_restarts)
mls3.train(verbose=True)

print(mls3)

* Starting EM algorithm for mixture of K=10 least squares models
* Beta = [ 0.30753302  0.30753302  0.30753302  0.30753302  0.30753302  0.30753302
  0.30753302  0.30753302  0.30753302  0.30753302]
* Lambda = 0
* Running at most 25 iterations
* Stopping when complete likelihood improves less than 1.0
       Obj       pi1       pi2       w11       w12       w21       w22     beta1     beta2
      -inf      0.10      0.10      0.84      0.66      0.75      0.25      0.31      0.31
  -6632.16      0.10      0.10      0.44      0.12      0.43      0.11      0.30      0.30
  -6525.34      0.10      0.10      0.41      0.13      0.41      0.13      0.44      0.45
  -4701.62      0.10      0.10      0.41      0.13      0.41      0.13      0.44      0.45
  -4694.51      0.10      0.10      0.41      0.13      0.41      0.13      0.44      0.45
  -4693.07      0.10      0.10      0.41      0.13      0.41      0.13      0.44      0.45
  -4690.11      0.10      0.10      0.41      0.13      0.41  

## Testing

### Full Test Set

In [29]:
# Evaluate the K=3 model on the test set; verbose=True prints accuracy and RMSE figures.
# NOTE: every evaluation cell assigns these same four names, so they only ever hold
# the results of the most recently evaluated model.
rmse_failed, rmse_success, rmse, accuracy = mls1.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (3 components)...
Data point (4425/4425): [####################] 100% Elapsed time: 0:00:01
Accuracy     : 0.6607909604519774
RMSE         : 0.476572529804
RMSE failed  : 0.363958046235
RMSE success : 0.592610627489


In [32]:
# Evaluate the K=5 model on the test set; verbose=True prints accuracy and RMSE figures.
# NOTE: overwrites the four result names assigned by the previous evaluation cell.
rmse_failed, rmse_success, rmse, accuracy = mls2.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (5 components)...
Data point (4425/4425): [####################] 100% Elapsed time: 0:00:01
Accuracy     : 0.6612429378531074
RMSE         : 0.47674950404
RMSE failed  : 0.364561737375
RMSE success : 0.592453165677


In [33]:
# Evaluate the K=10 model on the test set; verbose=True prints accuracy and RMSE figures.
# NOTE: overwrites the four result names assigned by the previous evaluation cell.
rmse_failed, rmse_success, rmse, accuracy = mls3.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (10 components)...
Data point (4425/4425): [####################] 100% Elapsed time: 0:00:02
Accuracy     : 0.6583050847457627
RMSE         : 0.475436049003
RMSE failed  : 0.365833458882
RMSE success : 0.588976572683


In [35]:
# Second evaluation of mls1 on the test set.
# NOTE(review): this cell's execution count (35) directly follows the K=3 training
# cell (34), so it presumably scores a retrained mls1 — which would explain why its
# metrics differ from the first mls1 evaluation above. Confirm with Restart & Run All.
rmse_failed, rmse_success, rmse, accuracy = mls1.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (3 components)...
Data point (4425/4425): [####################] 100% Elapsed time: 0:00:07
Accuracy     : 0.6508474576271186
RMSE         : 0.47336519643
RMSE failed  : 0.372801056501
RMSE success : 0.579316591193


### Single Point

In [15]:
test = 12  # Row of the test set to inspect
x_new = X_test[test]
# NOTE(review): posteriors=True presumably makes predict() also return the component
# posteriors (possibly as a tuple) — confirm against LeastSquaresMixture.predict.
y_new = mls3.predict(x_new, posteriors=True)
y_true = y_test[test]
# Wrap the operand in a one-element tuple: "%s" % value raises TypeError when `value`
# itself is a tuple of a different length, whereas "%s" % (value,) formats any object.
print("Predicted:  %s" % (y_new,))
#print("Posteriors: %s" % ["%0.2f%%" % (p * 100) for p in y_posteriors])
print("Actual:     %s" % (y_true,))

Predicted:  0.844715677913
Actual:     [ 1.02310255]
