# Statistical analysis and examples

In [31]:
import sys
sys.path.append('../common')
from utils import weight3 as weight_fn, weight_by_index
from utils import bag_weight, score, mean_n_sigma, score_stats
from utils import MAX_WEIGHT, AVAILABLE_GIFTS, GIFT_TYPES, N_TYPES, N_BAGS

from copy import deepcopy
from collections import defaultdict

import numpy as np
np.random.seed(1)
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# Number of weight samples drawn per gift type.
N_TRIALS = 10000
# One row per trial, one column per gift type.
GIFT_WEIGHTS = np.zeros((N_TRIALS, N_TYPES))
for index in range(N_TYPES):
    # Draw exactly N_TRIALS samples so the column always matches the array
    # height, even when N_TRIALS is changed above.
    GIFT_WEIGHTS[:, index] = [weight_by_index(index) for i in range(N_TRIALS)]

In [9]:
def find_n(weights, max_n=500, max_weight=None):
    """Find the gift count that maximises the mean accepted weight.

    For each count ``n`` the sampled ``weights`` are multiplied by ``n``;
    samples whose total is not strictly below the weight limit contribute
    nothing, and the remaining totals are averaged over *all* samples.

    Parameters
    ----------
    weights : numpy array of sampled weights for one gift type.
    max_n : exclusive upper bound of the counts tried (counts run from 1
        to ``max_n - 1``).
    max_weight : weight limit; ``None`` (default) means the module-level
        ``MAX_WEIGHT``.

    Returns
    -------
    (best_n, best_mean_score) : the best count and its mean score, or
        ``(0, 0)`` when no count yields a positive score.
    """
    if max_weight is None:
        max_weight = MAX_WEIGHT
    n_samples = len(weights)
    best_mean_score = 0
    best_n = 0
    for n in range(1, max_n):
        mask = weights * n < max_weight
        if not mask.any():
            # No sample fits any more; the search stops here.
            break
        # "* 1.0" forces float division under Python 2.
        mean_score = np.sum(weights[mask] * n) * 1.0 / n_samples
        if mean_score > best_mean_score:
            best_mean_score = mean_score
            best_n = n

    return best_n, best_mean_score

# Per gift type, the exclusive upper bound on the count tried per bag:
# the best count returned by find_n, plus one.
# NOTE(review): dtype is uint8 while find_n can return values up to 499;
# a limit above 255 would not fit - confirm the limits stay small.
LIMIT_NB_GIFTS = np.zeros((N_TYPES), dtype=np.uint8)

for index in range(N_TYPES):
    n, s = find_n(GIFT_WEIGHTS[:,index])
    # Prints: gift type name, its index, best count, best mean score.
    print GIFT_TYPES[index], index, n, s
    LIMIT_NB_GIFTS[index] = n + 1

ball 0 20 37.4037848442
bike 1 2 20.6356442984
blocks 2 3 30.9060035842
book 3 13 14.8206683885
coal 4 1 23.440261052
doll 5 7 25.2166134843
gloves 6 50 17.6035621138
horse 7 6 27.4126121987
train 8 3 24.7003770899


In [10]:
# Show the per-type limits and the product of all limits.
LIMIT_NB_GIFTS, LIMIT_NB_GIFTS.prod()

(array([21,  3,  4, 14,  2,  8, 51,  7,  4], dtype=uint8), 80607744)

In [11]:
def compute_score(state):
    """Evaluate a bag composition against the sampled gift weights.

    ``state`` holds one gift count per type. For every trial row of
    GIFT_WEIGHTS the bag weight is the count-weighted sum of that row.

    Returns ``(score, rejected)``: the sum of the bag weights strictly
    below MAX_WEIGHT divided by N_TRIALS, and the fraction of trials whose
    bag weight is not strictly below MAX_WEIGHT.
    """
    bag_weights = (GIFT_WEIGHTS * state).sum(axis=1)
    accepted = bag_weights < MAX_WEIGHT
    n_rejected = N_TRIALS - accepted.sum()
    # "* 1.0" forces float division under Python 2.
    mean_weight = bag_weights[accepted].sum() * 1.0 / N_TRIALS
    return mean_weight, n_rejected * 1.0 / N_TRIALS


# Sanity check: score of a bag holding a single gift of type index 1.
state = np.zeros((N_TYPES), dtype=np.uint8)
state[1] = 1
compute_score(state)

(20.115728164252364, 0.0011999999999999999)

In [12]:
def huge_loop2(rejected_threshold):
    scores = []
    for n0 in range(LIMIT_NB_GIFTS[0]):
        print "n0: ", n0, "/", LIMIT_NB_GIFTS[0]
        state = np.array([n0, 0, 0, 0, 0, 0, 0, 0, 0])
        s, r = compute_score(state)
        if r > rejected_threshold:
            break
        for n1 in range(LIMIT_NB_GIFTS[1]):
            state = np.array([n0, n1, 0, 0, 0, 0, 0, 0, 0])
            s, r = compute_score(state)
            if r > rejected_threshold:
                break
            for n2 in range(LIMIT_NB_GIFTS[2]):
                state = np.array([n0, n1, n2, 0, 0, 0, 0, 0, 0])
                s, r = compute_score(state)
                if r > rejected_threshold:
                    break
                for n3 in range(LIMIT_NB_GIFTS[3]):
                    state = np.array([n0, n1, n2, n3, 0, 0, 0, 0, 0])
                    s, r = compute_score(state)
                    if r > rejected_threshold:
                        break
                    for n4 in range(LIMIT_NB_GIFTS[4]):
                        state = np.array([n0, n1, n2, n3, n4, 0, 0, 0, 0])
                        s, r = compute_score(state)
                        if r > rejected_threshold:
                            break                        
                        for n5 in range(LIMIT_NB_GIFTS[5]):
                            state = np.array([n0, n1, n2, n3, n4, n5, 0, 0, 0])
                            s, r = compute_score(state)
                            if r > rejected_threshold:
                                break                        
                            for n6 in range(LIMIT_NB_GIFTS[6]):
                                state = np.array([n0, n1, n2, n3, n4, n5, n6, 0, 0])
                                s, r = compute_score(state)
                                if r > rejected_threshold:
                                    break                        
                                for n7 in range(LIMIT_NB_GIFTS[7]):
                                    state = np.array([n0, n1, n2, n3, n4, n5, n6, n7, 0])
                                    s, r = compute_score(state)
                                    if r > rejected_threshold:
                                        break                        
                                    for n8 in range(LIMIT_NB_GIFTS[8]):
                                        state = np.array([n0, n1, n2, n3, n4, n5, n6, n7, n8])
                                        s, r = compute_score(state)
                                        if r > rejected_threshold:
                                            break                                    
                                        scores.append((state, s, r))
    return scores

In [13]:
import os
# Cache file for the exhaustive search run with rejected_threshold=0.75.
filename = "scores_r_075.npy"
if os.path.exists(filename):
    # The rows hold arrays next to floats, so the table is an object array;
    # NumPy >= 1.16.3 refuses to load those unless pickling is allowed.
    # Only load cache files you produced yourself: unpickling untrusted
    # data can execute arbitrary code.
    scores = np.load(filename, allow_pickle=True)
else:
    scores = huge_loop2(0.75)
    scores = np.array(scores)
    # Save result under the same name that is checked above
    np.save(filename, scores)

In [14]:
# Target score, expressed as a fraction of MAX_WEIGHT.
alpha = 0.745
goal_weight = MAX_WEIGHT * alpha
print goal_weight

# Rows whose score (column 1) exceeds the target: how many, and the first ten.
mask = scores[:, 1] > goal_weight
len(scores[mask]), scores[mask][:10, :]

37.25


(95, array([[array([6, 0, 1, 0, 0, 1, 1, 2, 0]), 37.369679272344804,
         0.053100000000000001],
        [array([7, 0, 1, 0, 0, 1, 0, 2, 0]), 37.527485719297054, 0.0613],
        [array([7, 0, 1, 1, 0, 1, 1, 1, 0]), 37.692940257865537, 0.0275],
        [array([7, 0, 1, 1, 0, 1, 2, 1, 0]), 37.344681205583392,
         0.060299999999999999],
        [array([8, 0, 1, 0, 0, 0, 1, 2, 0]), 37.552428587718268,
         0.030200000000000001],
        [array([8, 0, 1, 0, 0, 1, 1, 1, 0]), 38.087460032735109,
         0.020199999999999999],
        [array([8, 0, 1, 0, 0, 1, 2, 1, 0]), 37.604165717166701,
         0.055899999999999998],
        [array([8, 0, 1, 1, 0, 0, 0, 2, 0]), 37.423255074748297,
         0.043799999999999999],
        [array([8, 0, 1, 1, 0, 1, 0, 1, 0]), 37.992242875631888, 0.0332],
        [array([8, 0, 1, 1, 0, 1, 1, 1, 0]), 37.998889924227164,
         0.059499999999999997]], dtype=object))

In [17]:
def has_min_nb_gifts(state):
    """Tell whether the counts in `state` add up to at least 3 gifts."""
    total_gifts = np.sum(state)
    return total_gifts >= 3

def is_available(state, available_gifts, gift_types=GIFT_TYPES):
    """Check that the stock covers every count requested in `state`.

    `state` and `gift_types` are walked in parallel; `available_gifts` maps
    a gift type to its remaining stock. Returns False as soon as one type
    would go below zero, True otherwise.
    """
    return all(
        available_gifts[name] - wanted >= 0
        for wanted, name in zip(state, gift_types)
    )

def update_available_gifts(available_gifts, state, gift_types=GIFT_TYPES):
    """Subtract the counts in `state` from `available_gifts`, in place.

    Raises AssertionError when a type would go below zero; types processed
    before the failing one have already been decremented at that point.
    """
    for wanted, name in zip(state, gift_types):
        remaining = available_gifts[name] - wanted
        assert remaining >= 0, "Found state is not available : {}, {}".format(state, available_gifts)
        available_gifts[name] = remaining

        
def fill_bags(sorted_scores, available_gifts):
    filled_bags = np.zeros((N_BAGS, N_TYPES), dtype=np.uint8)    
    last_score_computation = -1
    good_bag_index = 0
    bag_index = 0
    while bag_index < N_BAGS and good_bag_index < len(sorted_scores):
    
        state = sorted_scores[good_bag_index, 0]
        if is_available(state, available_gifts) and has_min_nb_gifts(state):
            # print "bag index : ", bag_index
            update_available_gifts(available_gifts, state, GIFT_TYPES)
            filled_bags[bag_index, :] = state
            bag_index += 1
        else:
            good_bag_index += 1
                
        if bag_index > 0 and (bag_index % 100) == 0 and last_score_computation < bag_index:
            s, r = score(filled_bags, return_rejected=True)
            print(bag_index, ">>> Current score: ", s, s * N_BAGS *1.0 / bag_index, "rejected=", r)
            last_score_computation = bag_index

        if bag_index > 0 and (bag_index % 150) == 0 and last_score_computation < bag_index:
            print(bag_index, ">>> Currently available gifts : ", [(k, available_gifts[k]) for k in GIFT_TYPES])
            last_score_computation = bag_index
    
    print "good_bag_index", good_bag_index
    return filled_bags

In [18]:
# Order the rows by score (column 1), highest first.
sorted_scores = scores[scores[:, 1].argsort()[::-1]]

In [39]:
# Cutoff applied to column 2 (the rejection rate) of the score table.
r = 100
print "\n\n--- Start bags filling with : r=", r
# Keep only the rows whose column 2 is strictly below r.
mask = sorted_scores[:, 2] < r
# NOTE: this rebinds sorted_scores, so re-running the cell with a smaller r
# filters the already-filtered table.
sorted_scores = sorted_scores[mask]
print "\n---", len(sorted_scores), sorted_scores[:5, :]
# Work on a copy: fill_bags decrements the stock it is given.
available_gifts = deepcopy(AVAILABLE_GIFTS)
print "\n--------------\n"
filled_bags = fill_bags(sorted_scores, available_gifts)
#s, r = score(filled_bags, return_rejected=True)



--- Start bags filling with : r= 100

--- 138663 [[array([12,  0,  1,  0,  0,  0,  0,  1,  0]) 38.704855041006837
  0.038300000000000001]
 [array([12,  0,  1,  0,  0,  1,  0,  0,  0]) 38.47525620909795
  0.042599999999999999]
 [array([11,  0,  1,  0,  0,  0,  1,  1,  0]) 38.463477823310129
  0.031600000000000003]
 [array([9, 0, 1, 0, 0, 1, 0, 1, 0]) 38.41559619952325 0.0253]
 [array([9, 0, 1, 0, 0, 1, 1, 1, 0]) 38.408758012451742 0.0521]]

--------------

(100, '>>> Current score: ', 3980.6521540790004, 39806.521540790003, 'rejected=', 1.125)
(150, '>>> Currently available gifts : ', [('ball', 0), ('bike', 500), ('blocks', 850), ('book', 1200), ('coal', 166), ('doll', 941), ('gloves', 141), ('horse', 792), ('train', 942)])
(200, '>>> Current score: ', 7540.1366229750265, 37700.68311487513, 'rejected=', 6.0300000000000002)
(300, '>>> Current score: ', 11132.185160981773, 37107.283869939245, 'rejected=', 10.17)
(400, '>>> Current score: ', 14664.215831522937, 36660.539578807344, 'rejec

In [40]:
# Score the filled bags, also requesting the rejected and std outputs.
score(filled_bags, return_rejected=True, return_std=True)

(34712.887270504885,
 58.494999999999997,
 340.08907408361586,
 6.9907063305505828)

In [None]:
Kaggle: 35143.27282

In [None]:
def to_submission(state, available_gifts, gift_types):
    """Turn per-bag gift counts into lists of gift identifiers.

    Parameters
    ----------
    state : iterable of bags, each an iterable with one count per gift type.
    available_gifts : mapping gift type -> total stock (left untouched).
    gift_types : sequence of gift type names, in the order used by `state`.

    Returns
    -------
    One list of ``"<type>_<id>"`` strings per bag. For each type the ids are
    handed out from ``stock - 1`` downwards and never reused.

    Raises AssertionError when more gifts of a type are requested than are
    in stock.
    """
    remaining = [available_gifts[name] for name in gift_types]
    bags = []
    for bag in state:
        labels = []
        for type_pos, n_wanted in enumerate(bag):
            name = gift_types[type_pos]
            for _ in range(n_wanted):
                # The id of the next gift is the stock left after taking it.
                remaining[type_pos] -= 1
                assert remaining[type_pos] >= 0, "Gift index is negative"
                labels.append(name + '_%i' % remaining[type_pos])
        bags.append(labels)
    return bags

def write_submission(state, filename):
    """Write the bags to `filename` in the submission format.

    The file starts with the header line ``Gifts``; every bag then takes one
    line, its gift identifiers separated by single spaces.
    """
    with open(filename, 'w') as out:
        out.write("Gifts\n")
        out.writelines(' '.join(bag) + '\n' for bag in state)
    


In [828]:
from datetime import datetime

# The file name carries the current time, down to the minute.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
submission_file = '../results/submission_' + timestamp + '.csv'

# Gift ids are assigned from AVAILABLE_GIFTS itself, not from the copy that
# was handed to fill_bags.
submission = to_submission(filled_bags, AVAILABLE_GIFTS, GIFT_TYPES)
write_submission(submission, submission_file)

### Some results 

- submission_2017-01-27-15-51.csv
- 1000 max scores
- Local : 34722.394251084297
- Kaggle: 35143.27282


- submission_2017-01-27-16-00.csv
- 1000 max scores
- rejected = 0.075
- Local : 31405.637035241478
- Kaggle : 31645.38768


- submission_2017-01-27-16-36.csv
- 900 max scores
- n_gloves = 200
- Local : 34083
- Kaggle : 34040.37733


- submission_2017-01-28-01-42.csv
- 975 max scores
- n_gloves = 30
- Local : 34604.652758133743
- Kaggle : 35044.53273


- submission_2017-01-28-14-00.csv
- Sort decreasing 'rejected', 0.25
- Local : 31688
- Kaggle : 32222.66426


- submission_2017-01-28-14-52.csv
- Sort decreasing 'rejected', 'score': rejected=0.17
- Local : 33820.373412003297, 51.950000000000003
- Kaggle : 


- submission_2017-01-29-13-22.csv
- Sort decreasing 'pr_wins', source 'scores' with rejected=0.25
- Local : 33963.264414211175, 49.600000000000001
- Kaggle : 34122.79829

