# Sidekick - Multi-Project Predictions
Scratch work to perform predictions for a project by considering multiple projects.

In [1]:
%matplotlib inline
import os
import sys
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling to Python path
sys.path.insert(0, os.path.abspath('../src/')) # Add sibling to Python path
sys.stdout.flush() # Print output on the fly in Notebook
import matplotlib
matplotlib.rcParams['figure.figsize'] = (18,8)
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['legend.fontsize'] = 16
from IPython.display import display
import numpy as np
import GPy
import cPickle as cp
import matplotlib.pyplot as plt
from math import floor
from dataset import Sidekick 

DATA_DIR = "../data/kickstarter-etter-cosn2013"



## Load data

In [2]:
# Create the dataset wrapper and load its contents; `sk` is used by the
# data-preparation cell to extract projects.
sk = Sidekick()
sk.load()

Loading projects...
Loading statuses...
Data loaded.


## Prepare data
Keep a fraction `T` of the projects for training and the remaining `1 - T` for testing (e.g. `T = 0.8` keeps 80% for training and 20% for testing).

In [91]:
# Draw a sample of projects and split it into a training and a test set.
projects = sk.extract_n_projects(n=1000)
num_observations = 1000  # number of time steps expected per project
T = 0.8  # fraction of the projects used for training

threshold = int(floor(T * len(projects)))
projects_train = projects[:threshold]
projects_test = projects[threshold:]
# Inputs: the time-step indices 0..num_observations-1 as a column vector.
X_train = np.arange(num_observations).reshape(num_observations, 1)
# Outputs: element [1] of every status entry, arranged with one row per time
# step and one column per training project.
Y_train = np.array([[status[1] for status in project['status']] for project in projects_train]).transpose()
# Each training project must supply exactly one value per time step.
assert X_train.shape[0] == Y_train.shape[0], \
    "X_train has %d rows but Y_train has %d" % (X_train.shape[0], Y_train.shape[0])


## Train model
Train a GP with a given kernel.

In [92]:
# RBF kernel over the single input dimension (the time-step index in X_train).
kernel = GPy.kern.RBF(input_dim=1)
# GP regression of Y_train on X_train using that kernel.
m = GPy.models.GPRegression(X_train, Y_train, kernel)
# Optimise the model parameters, restarting the optimiser 10 times.
# NOTE(review): no random seed is set in this notebook, so the restarts are
# presumably not reproducible run-to-run -- confirm whether that matters.
m.optimize_restarts(num_restarts=10)
# Show the fitted parameter table.
display(m)

Optimization restart 1/10, f = -2222508.1565
Optimization restart 2/10, f = -2222508.15655
Optimization restart 3/10, f = -2222508.15645
Optimization restart 4/10, f = -2222508.15655
Optimization restart 5/10, f = -2222508.15653
Optimization restart 6/10, f = -2222508.15655
Optimization restart 7/10, f = -2222508.15655
Optimization restart 8/10, f = -2222508.15643
Optimization restart 9/10, f = -2222508.15655
Optimization restart 10/10, f = -2222508.15644


GP_regression.,Value,Constraint,Prior,Tied to
rbf.variance,0.584333509992,+ve,,
rbf.lengthscale,11.6457237476,+ve,,
Gaussian_noise.variance,7.18772515605e-05,+ve,,


## Test unseen project
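We observe the first part of a project's pledged-money series (the fraction is set by `proportion` in the code below, e.g. 0.8 for 80%) and then predict a later point: ideally the last one, that is, the total amount of money pledged at the end of the project. If the predicted value is greater than 1, the project is predicted to be funded.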

In [129]:
# Pick one held-out project and predict its pledged value at a later time step
# from the first part of its history.
new_project = projects_test[50]

goal = new_project['project'][1]
proportion = 0.5  # fraction of the time steps that are observed
# NOTE(review): 600 is not the final time step (num_observations - 1), whereas
# evaluate_project predicts the final one -- confirm which is intended here.
last_observation = 600  # time step whose value is predicted
observe = int(floor(num_observations * proportion))

# Observed inputs: time-step indices 0..observe-1 as a column vector.
X_observation = np.arange(observe).reshape(observe, 1)
# Observed outputs as a float column vector. np.array(..., dtype=float)
# converts the values, whereas np.ndarray(buffer=...) would reinterpret the raw
# bytes of the buffer (wrong if the values happen to be integers). The explicit
# reshape fails loudly if the project has fewer than `observe` statuses.
Y_observation = np.array([status[1] for status in new_project['status'][:observe]], dtype=float).reshape(observe, 1)
# Replace the data held by the trained model with this project's observations.
m.set_XY(X=X_observation, Y=Y_observation)
mean, var = m.predict(np.array([[last_observation]]))
predicted_pledged = mean[0][0]
predicted_var = var[0][0]
actual_pledged = new_project['status'][last_observation][1]
# A value greater than 1 counts as a successful (funded) project.
print("Goal: %s" % goal)
print("Predicted success: %s (%.2f±%.2f)" % (str(predicted_pledged > 1), predicted_pledged, np.sqrt(predicted_var)))
print("Actual success: %s (%.2f)" % (str(actual_pledged > 1), actual_pledged))

Goal: 5000
Predicted success: False (0.00±0.76)
Actual success: False (0.12)


## Evaluation
Run an evaluation on the full test set.

In [None]:
def evaluate_project(project, model=None, n_observations=None, proportions=None):
    """Check success predictions for one project at several observation levels.

    For each proportion ``p`` the first ``floor(n_observations * p)`` status
    values of ``project`` are given to the model, the value at the final time
    step is predicted, and the predicted outcome (value > 1) is compared with
    the actual outcome.

    Parameters
    ----------
    project : dict
        Must provide ``project['status']``, a sequence whose entries hold the
        observed value at index 1, with at least ``n_observations`` entries.
    model : optional
        Object exposing ``set_XY(X=..., Y=...)`` and ``predict(X)`` returning
        ``(mean, var)``. Defaults to the notebook-level model ``m``.
    n_observations : int, optional
        Number of time steps per project. Defaults to the notebook-level
        ``num_observations``.
    proportions : iterable of float, optional
        Fractions of the series to observe. Defaults to
        ``np.linspace(0.01, 0.99, 20)``.

    Returns
    -------
    list of bool
        One entry per proportion: True when the predicted success equals the
        actual success.

    Notes
    -----
    ``model.set_XY`` replaces the data held by the model, so the model is left
    holding the last set of observations when this function returns.
    """
    if model is None:
        model = m
    if n_observations is None:
        n_observations = num_observations
    if proportions is None:
        proportions = np.linspace(0.01, 0.99, 20)

    statuses = project['status']
    # Predict the final time step rather than a hard-coded index.
    last_observation = n_observations - 1
    X_last = np.array([[last_observation]])
    # The actual outcome does not depend on how much was observed.
    actual_success = bool(statuses[last_observation][1] > 1)

    correct = []
    for p in proportions:
        # Number of observed time steps; always keep at least one data point.
        observe = max(1, int(floor(n_observations * p)))
        X_observation = np.arange(observe).reshape(observe, 1)
        # Convert to float explicitly; the reshape raises if the project has
        # fewer than `observe` statuses.
        Y_observation = np.array(
            [status[1] for status in statuses[:observe]], dtype=float
        ).reshape(observe, 1)
        # Give the model this project's observations (the argument, not a
        # notebook-level variable).
        model.set_XY(X=X_observation, Y=Y_observation)
        # Predict the end of the project.
        mean, _ = model.predict(X_last)
        predicted_success = bool(mean[0][0] > 1)
        # Compare outcome with outcome (not the raw predicted value).
        correct.append(predicted_success == actual_success)
    return correct

# Evaluate every held-out project. `result` collects one list per project, as
# returned by evaluate_project.
result = []
total = len(projects_test)
print total
for i, project in enumerate(projects_test):
    # Progress: percentage of test projects started so far.
    print "%.2f%%" % (i / float(total) * 100)
    result.append(evaluate_project(project))

200
0.00%
0.50%
1.00%
1.50%
2.00%
2.50%
3.00%
3.50%
4.00%
4.50%
5.00%
5.50%
6.00%
6.50%
7.00%
7.50%
8.00%
8.50%
9.00%
9.50%
10.00%
10.50%
11.00%
11.50%
12.00%
12.50%
13.00%
13.50%
14.00%
14.50%
15.00%
15.50%
16.00%
16.50%
17.00%
17.50%
18.00%
18.50%
19.00%
19.50%
20.00%
20.50%
21.00%
21.50%
22.00%
22.50%
23.00%
23.50%
24.00%
24.50%
25.00%
25.50%
26.00%
26.50%
27.00%
27.50%
28.00%
28.50%
29.00%
29.50%
30.00%
30.50%
31.00%
31.50%
32.00%
32.50%
33.00%
33.50%
34.00%
34.50%
35.00%
35.50%
36.00%
36.50%
37.00%
37.50%
38.00%
38.50%
39.00%
39.50%
40.00%
40.50%
41.00%
41.50%
42.00%
42.50%
43.00%
43.50%
44.00%
44.50%
45.00%
45.50%
46.00%
46.50%
47.00%
47.50%
48.00%
48.50%
49.00%
49.50%
50.00%
50.50%
51.00%
51.50%
52.00%
52.50%
53.00%
53.50%
54.00%
54.50%
55.00%
55.50%
56.00%
56.50%
57.00%
57.50%
58.00%
58.50%

In [132]:
# Scratch check of a proportion grid: 20 evenly spaced values from 0 to 0.99.
# Note that this grid starts at 0, whereas evaluate_project starts at 0.01.
np.linspace(0, 0.99, 20)

array([ 0.        ,  0.05210526,  0.10421053,  0.15631579,  0.20842105,
        0.26052632,  0.31263158,  0.36473684,  0.41684211,  0.46894737,
        0.52105263,  0.57315789,  0.62526316,  0.67736842,  0.72947368,
        0.78157895,  0.83368421,  0.88578947,  0.93789474,  0.99      ])