# Sidekick - Mixture of Least Squares
We train a mixture of least-squares models, experimenting with different numbers of components.

In [1]:
%matplotlib inline
from __future__ import print_function
import os
import sys
# Put the sibling directories at the front of the module search path.
# os.path.abspath resolves them against the current working directory.
sys.path.insert(0, os.path.abspath('../utils/')) # Add sibling to Python path
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling to Python path
sys.stdout.flush() # Print output on the fly in Notebook
import matplotlib
# Default figure size and font sizes for every plot of the notebook
matplotlib.rcParams['figure.figsize'] = (16,14)
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['legend.fontsize'] = 16
from IPython.display import display
import numpy as np
import pickle as cp
import matplotlib.pyplot as plt
# Project-local modules; presumably found through the sys.path entries added above — TODO confirm
from dataset import Sidekick
from model import LeastSquaresMixture
from math import floor

# NOTE(review): DATA_DIR is not referenced by any of the visible cells
DATA_DIR = "../data/sidekick"

def subsample(t, granularity):
    """Pick evenly spaced integer time indices between 1 and ``t``.

    Parameters
    ----------
    t : int
        Last time index to include.
    granularity : float
        Fraction of the ``t`` time steps to keep, in ]0, 1].

    Returns
    -------
    numpy.ndarray
        1-D integer array holding ``ceil(granularity * t)`` indices that run
        from 1 to ``t`` (both included). When a single sample is requested,
        the array only contains ``t``.

    Raises
    ------
    ValueError
        If ``granularity`` is outside ]0, 1].
    """
    if granularity > 1.0 or granularity <= 0:
        raise ValueError("granularity must be in ]0, 1]")
    t0 = 1
    n_samples = int(np.ceil(granularity * t))
    if n_samples == 1:
        # np.linspace with a single sample would return the start (t0), whereas
        # we want the latest observation. Return an array (not a list) so that
        # every code path yields the same type.
        return np.array([t], dtype=int)
    return np.linspace(t0, t, n_samples, dtype=int)

## Load and split data

In [2]:
# Create the data set wrapper and load the projects
sk = Sidekick()
sk.load()
# Split the projects into a training and a testing set, using a split threshold of 0.7
projects_train, projects_test = sk.split(threshold=0.7)
# Number of projects before any filtering; the data processing cell uses it
# to report the share of projects removed as outliers
total = len(projects_train) + len(projects_test)

Loading data set...
Data loaded.


## Data processing

In [62]:
seed = 2           # not read anywhere in this cell
t = 25             # observe each project up to this time index
granularity = 0.1  # fraction of the observed time steps kept as features
T = 999            # index in `money` of the total pledged amount (the regression target)

samples = subsample(t, granularity)
# From here on `t` is the number of features (sampled time steps), not the horizon
t = len(samples)


def filter_outliers(projects, sample_idx, final_idx, threshold):
    """Return the projects whose pledged money never exceeds `threshold`.

    A project is kept when both its total pledged money (`money[final_idx]`)
    and every sampled observation (`money[sample_idx]`) are at most
    `threshold` times its goal.
    """
    return [p for p in projects
            if np.all(p.money[final_idx] <= threshold)
            and np.all(p.money[sample_idx] <= threshold)]


def build_features(projects, sample_idx):
    """Stack the sampled observations of each project into an (n_projects, n_samples) float matrix."""
    rows = [p.money[sample_idx] for p in projects]
    # np.array converts the values to float; the explicit reshape keeps the
    # expected 2-D shape even when no project is left after filtering.
    return np.array(rows, dtype=float).reshape(len(rows), len(sample_idx))


def build_targets(projects, final_idx):
    """Return the total pledged money of each project as an (n_projects, 1) column."""
    return np.expand_dims(np.array([p.money[final_idx] for p in projects]), axis=1)


# Remove outliers
outlier_threshold = 2  # Ignore the projects whose pledged money is more than outlier_threshold times their goal
projects_train_filtered = filter_outliers(projects_train, samples, T, outlier_threshold)
projects_test_filtered = filter_outliers(projects_test, samples, T, outlier_threshold)

X_train = build_features(projects_train_filtered, samples)
y_train = build_targets(projects_train_filtered, T)
X_test = build_features(projects_test_filtered, samples)
y_test = build_targets(projects_test_filtered, T)

# Optional normalisation, currently disabled.
# Required to contain the prediction in a reasonable range
# The problem arises when evaluating the likelihood in the expression for gamma_nk
#X_max = np.max(X_train, axis=0)
#X_train = X_train / X_max[np.newaxis, :]
# Apply same preprocessing to testing set
#X_test = X_test / X_max[np.newaxis, :]

total_filtered = len(X_train) + len(X_test)
# float(...) keeps the ratios exact under Python 2 as well, where int / int truncates
print("Removed %0.2f%% outliers" % (100 - float(total_filtered) / total * 100))
print("Training on %s projects (%0.2f%%)" % (len(X_train), float(len(X_train)) / total_filtered * 100))
print("Testing on %s projects (%0.2f%%)" % (len(X_test), float(len(X_test)) / total_filtered * 100))
print("Number of features: %s" % t)

Removed 7.11% outliers
Training on 10343 projects (70.04%)
Testing on 4425 projects (29.96%)
Number of features: 3


## Training

### Simple Training

In [63]:
# --- Hyper-parameters of the first model (mls1) ---
iterations = 25          # upper bound on the number of EM iterations
epsilon = 1              # stop once the objective improves by less than this
random_restarts = None
K = 2                    # number of mixture components
lam = 0                  # reported as "Lambda" by the model
beta = 0.0001            # starting value of beta, shared by all components

# Build the mixture on the training data and fit it, logging every iteration
mls1 = LeastSquaresMixture(
    X_train,
    y_train,
    K=K,
    beta=beta,
    lam=lam,
    iterations=iterations,
    epsilon=epsilon,
    random_restarts=random_restarts,
)
mls1.train(verbose=True)

# Summary of the fitted model
print(mls1)

* Starting EM algorithm for mixture of K=2 least squares models
* Beta = [ 0.0001  0.0001]
* Lambda = 0
* Running at most 25 iterations
* Stopping when complete likelihood improves less than 1
       Obj       pi1       pi2       w11       w12       w21       w22     beta1     beta2
      -inf      0.50      0.50      0.62      0.93      0.32      0.42      0.00      0.00
 -47631.42      0.50      0.50      0.41      0.13      0.41      0.13      1.84      1.84
   1021.39      0.50      0.50      0.41      0.13      0.41      0.13      2.23      2.23
   1558.85      0.50      0.50      0.41      0.13      0.41      0.13      2.23      2.23
   1558.85      0.50      0.50      0.41      0.13      0.41      0.13      2.23      2.23
Model:        LeastSquaresMixture (2 components)
Likelihood:   1558.8542171807542
Beta:         [ 2.2287282   2.22873099]
Lambda:       0
Pi:           [ 0.49999969  0.50000031]
Weights norm: [3.0030869865858283, 3.0030395073199494]
[[ 0.41229054  0.41228747]
 

In [32]:
# --- Hyper-parameters of the second model (mls2) ---
# Same setup as mls1 except for a ten times larger starting beta (0.001 instead of 0.0001)
iterations = 25          # upper bound on the number of EM iterations
epsilon = 1.0            # stop once the objective improves by less than this
random_restarts = None
K = 2                    # number of mixture components
lam = 0                  # reported as "Lambda" by the model
beta = 0.001             # starting value of beta, shared by all components

# Build the mixture on the training data and fit it, logging every iteration
mls2 = LeastSquaresMixture(
    X_train,
    y_train,
    K=K,
    beta=beta,
    lam=lam,
    iterations=iterations,
    epsilon=epsilon,
    random_restarts=random_restarts,
)
mls2.train(verbose=True)

# Summary of the fitted model
print(mls2)

* Starting EM algorithm for mixture of K=2 least squares models
* Beta = [ 0.001  0.001]
* Lambda = 0
* Running at most 1000 iterations
* Stopping when complete likelihood improves less than 1.0
       Obj       pi1       pi2       w11       w12       w21       w22     beta1     beta2
      -inf      0.50      0.50      0.97      0.41      0.57      0.67      0.00      0.00
 -43940.95      0.47      0.53      0.15     -0.12      0.17     -0.12      0.00      0.00
 -41647.29      0.46      0.54      0.16     -0.13      0.16     -0.13      5.27      6.23
   6485.97      0.45      0.55      0.17     -0.13      0.15     -0.13      5.18      6.39
   6516.00      0.43      0.57      0.18     -0.14      0.14     -0.12      5.04      6.55
   6537.73      0.42      0.58      0.19     -0.15      0.14     -0.11      4.87      6.76
   6568.95      0.40      0.60      0.20     -0.17      0.13     -0.10      4.67      7.02
   6614.99      0.38      0.62      0.21     -0.18      0.13     -0.09      4

In [60]:
# --- Hyper-parameters of the third model (mls3) ---
K = 2                    # number of mixture components
# Starting beta is the inverse of `t`, which holds the number of features once
# the data processing cell has run. 1.0 forces true division: under Python 2,
# 1 / t would truncate to 0 for any t > 1.
beta = 1.0 / t
epsilon = 1e0            # stop once the objective improves by less than this
lam = 0                  # reported as "Lambda" by the model
iterations = 25          # upper bound on the number of EM iterations
random_restarts = None

# Build the mixture on the training data and fit it, logging every iteration
mls3 = LeastSquaresMixture(X_train, y_train,
                          K=K, beta=beta, lam=lam,
                          iterations=iterations, epsilon=epsilon, random_restarts=random_restarts)
mls3.train(verbose=True)

# Summary of the fitted model
print(mls3)

* Starting EM algorithm for mixture of K=2 least squares models
* Beta = [ 0.0125  0.0125]
* Lambda = 0
* Running at most 25 iterations
* Stopping when complete likelihood improves less than 1.0
       Obj       pi1       pi2       w11       w12       w21       w22     beta1     beta2
      -inf      0.50      0.50      0.95      0.32      0.98      0.49      0.01      0.01
 -40976.04      0.57      0.43      0.09      0.04      0.07      0.07      0.00      0.00
 -32628.10      0.60      0.40      0.08      0.05      0.08      0.05     13.42      8.97
  10034.80      0.62      0.38      0.06      0.03      0.11      0.06     14.10      8.51
  10153.33      0.65      0.35      0.05      0.02      0.13      0.08     15.04      8.01
  10310.35      0.69      0.31      0.05      0.02      0.15      0.10     16.14      7.38
  10494.93      0.72      0.28      0.05      0.02      0.18      0.11     17.47      6.65
  10716.22      0.76      0.24      0.04      0.02      0.22      0.13     19

## Testing

### Full Test Set

In [18]:
# Score mls1 on the filtered test set; evaluate() returns four metrics, unpacked in this order.
# NOTE: every evaluation cell overwrites these four names.
rmse_failed, rmse_success, rmse, accuracy = mls1.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (2 components)...
Data point (4425/4425): [####################] 100% Elapsed time: 0:00:01
Accuracy     : 0.7557062146892656
RMSE         : 0.306295859076
RMSE failed  : 0.174059819914
RMSE success : 0.420801102719


In [33]:
# Score mls2 on the filtered test set; evaluate() returns four metrics, unpacked in this order.
# NOTE: every evaluation cell overwrites these four names.
rmse_failed, rmse_success, rmse, accuracy = mls2.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (2 components)...
Data point (4424/4424): [####################] 100% Elapsed time: 0:00:02
Accuracy     : 0.7597197106690777
RMSE         : 0.312315056388
RMSE failed  : 0.172204418002
RMSE success : 0.431933022805


In [54]:
# Score mls3 on the filtered test set; evaluate() returns four metrics, unpacked in this order.
# NOTE: every evaluation cell overwrites these four names.
rmse_failed, rmse_success, rmse, accuracy = mls3.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (2 components)...
Data point (4423/4423): [####################] 100% Elapsed time: 0:00:02
Accuracy     : 0.8222925616097672
RMSE         : 0.241792560369
RMSE failed  : 0.186546328651
RMSE success : 0.299186789879


In [61]:
# Second evaluation of mls3 with the same call as the preceding cell.
# NOTE(review): the recorded metrics differ between the two cells, so mls3 was
# presumably retrained in between — TODO confirm and keep a single evaluation.
rmse_failed, rmse_success, rmse, accuracy = mls3.evaluate(X_test, y_test, verbose=True)

Evaluating model LeastSquaresMixture (2 components)...
Data point (4423/4423): [####################] 100% Elapsed time: 0:00:01
Accuracy     : 0.8182229256160977
RMSE         : 0.223827117287
RMSE failed  : 0.159197072235
RMSE success : 0.287365328636


### Single Point

In [15]:
# Compare the prediction of mls3 with the ground truth for a single test project
test = 12  # row index of the inspected project in X_test / y_test
x_new = X_test[test]
# NOTE(review): posteriors=True is requested, yet only one value is kept and the
# posterior print below is disabled — confirm what predict() returns in that mode
y_new = mls3.predict(x_new, posteriors=True)
y_true = y_test[test]  # one-element array, since y_test is a column of shape (n, 1)
print("Predicted:  %s" % y_new)
#print("Posteriors: %s" % ["%0.2f%%" % (p * 100) for p in y_posteriors])
print("Actual:     %s" % y_true)

Predicted:  0.844715677913
Actual:     [ 1.02310255]
