# Statistical analysis and examples

In [1]:
import sys
sys.path.append('../common')
from utils import weight3 as weight_fn, weight_by_index
from utils import bag_weight, score, mean_n_sigma, score_stats
from utils import MAX_WEIGHT, AVAILABLE_GIFTS, GIFT_TYPES, N_TYPES, N_BAGS

from copy import deepcopy
from collections import defaultdict

import numpy as np
np.random.seed(2017)
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Number of weight draws kept per gift type.
N_TRIALS = 10000
# GIFT_WEIGHTS[t, k] holds the t-th value returned by weight_by_index(k).
GIFT_WEIGHTS = np.zeros((N_TRIALS, N_TYPES))
for index in range(N_TYPES):
    # Draw exactly N_TRIALS values so the column always matches the array's
    # first dimension, whatever N_TRIALS is set to.
    GIFT_WEIGHTS[:, index] = [weight_by_index(index) for i in range(N_TRIALS)]

In [3]:
def find_n(weights):
    """Find the gift count that maximises the mean accepted weight.

    For each count from 1 to 499, every sample in `weights` is multiplied by
    the count; samples whose product stays strictly below MAX_WEIGHT
    contribute their product, the others contribute nothing. The search
    stops at the first count for which no sample fits.

    Returns a `(best_count, best_mean_score)` pair, `(0, 0)` when nothing fits.
    """
    n_samples = len(weights)
    top_n, top_score = 0, 0
    for count in range(1, 500):
        fits = weights * count < MAX_WEIGHT
        if not fits.any():
            # Every sample already overflows; larger counts cannot do better.
            break
        mean_score = np.sum(weights[fits] * count) * 1.0 / n_samples
        if mean_score > top_score:
            top_n, top_score = count, mean_score

    return top_n, top_score

# Per-type upper bound (exclusive) on the gift count explored later on:
# the best single-type count reported by find_n, plus one.
# NOTE(review): uint8 tops out at 255 while find_n can return up to 499;
# the recorded values stay <= 51, but confirm before changing the weights.
LIMIT_NB_GIFTS = np.zeros((N_TYPES), dtype=np.uint8)

for index in range(N_TYPES):
    # n: best count for this type alone, s: the matching mean score.
    n, s = find_n(GIFT_WEIGHTS[:,index])
    print GIFT_TYPES[index], index, n, s
    LIMIT_NB_GIFTS[index] = n + 1

ball 0 20 37.6322158867
bike 1 2 20.7807671664
blocks 2 3 31.0021348597
book 3 13 14.7958669438
coal 4 1 23.2965570426
doll 5 6 25.2286629621
gloves 6 50 17.5340524804
horse 7 6 27.3275000842
train 8 3 25.2152724369


In [4]:
# Show the per-type limits and their product (size of the unpruned search grid).
LIMIT_NB_GIFTS, np.prod(LIMIT_NB_GIFTS)

(array([21,  3,  4, 14,  2,  7, 51,  7,  4], dtype=uint8), 70531776)

In [5]:
def compute_score(state):
    """Estimate the mean score and rejection ratio of one bag composition.

    `state` gives the number of gifts of each type in the bag. Every row of
    GIFT_WEIGHTS is treated as one trial: the bag weight is the per-type
    weights times the counts, summed over types. A trial is accepted when
    that weight is strictly below MAX_WEIGHT.

    Returns `(score, rejected)`: the accepted weights summed and divided by
    N_TRIALS, and the fraction of trials that were not accepted.
    """
    totals = np.sum(GIFT_WEIGHTS * state, axis=1)
    accepted = totals < MAX_WEIGHT
    n_rejected = N_TRIALS - np.sum(accepted)
    rejected_ratio = n_rejected * 1.0 / N_TRIALS
    mean_score = np.sum(totals[accepted]) * 1.0 / N_TRIALS
    return mean_score, rejected_ratio

# Sanity check: score a bag that contains a single gift of type index 1.
state = np.zeros((N_TYPES), dtype=np.uint8)
state[1] = 1
compute_score(state)

(19.958184702872448, 0.0016000000000000001)

In [6]:
# Target bag weight, expressed as a fraction (alpha) of MAX_WEIGHT.
alpha = 0.745
goal_weight = MAX_WEIGHT * alpha
print goal_weight

37.25


In [7]:
# Display the per-type count limits again before running the enumeration.
LIMIT_NB_GIFTS

array([21,  3,  4, 14,  2,  7, 51,  7,  4], dtype=uint8)

In [8]:
def huge_loop2(rejected_threshold):
    """Enumerate bag compositions whose rejection ratio stays acceptable.

    Walks the grid of per-type gift counts bounded by LIMIT_NB_GIFTS in
    depth-first order (type 0 is the outermost dimension). At each level the
    count of the current type is increased with all deeper types set to
    zero; as soon as compute_score reports a rejection ratio above
    `rejected_threshold`, the remaining counts of that level are skipped.

    The number of nested levels follows len(LIMIT_NB_GIFTS), so the routine
    works for any number of gift types.

    Returns a list of `(state, score, rejected)` tuples, one per fully
    specified composition that passed the threshold; `state` is an integer
    array with one count per gift type.
    """
    limits = [int(limit) for limit in LIMIT_NB_GIFTS]
    n_types = len(limits)
    scores = []
    if n_types == 0:
        return scores

    # Single working buffer; positions deeper than the current level are
    # always zero, matching a freshly built "prefix + zeros" state.
    state = np.zeros(n_types, dtype=int)

    def _explore(level):
        """Try every count of type `level`, recursing into deeper types."""
        is_last_level = level == n_types - 1
        for count in range(limits[level]):
            if level == 0:
                # Progress indicator for the outermost dimension.
                print("n0:  %d / %d" % (count, limits[0]))
            state[level] = count
            s, r = compute_score(state)
            if r > rejected_threshold:
                # Larger counts of this type are not explored any further.
                break
            if is_last_level:
                scores.append((state.copy(), s, r))
            else:
                _explore(level + 1)
        # Restore the zero so shallower levels see a clean suffix.
        state[level] = 0

    _explore(0)
    return scores

In [9]:
import os
# Cache of the enumeration result; delete the file to force a recomputation.
filename = "scores_r_075.npy"
if os.path.exists(filename):
    # The array holds Python objects (state array, score, rejected), so it is
    # stored pickled; NumPy >= 1.16.3 refuses to load it without this flag.
    # Only load cache files produced locally by this notebook.
    scores = np.load(filename, allow_pickle=True)
else:
    scores = huge_loop2(0.75)
    scores = np.array(scores)
    # Save result under the same name that is checked above.
    np.save(filename, scores)

In [10]:
# alpha = 0.745
# goal_weight = MAX_WEIGHT * alpha
# print goal_weight
# mask = scores[:, 1] > goal_weight
# # len(scores[mask]), scores[mask][:10, :]

In [11]:
def has_min_nb_gifts(state, min_nb_gifts=3):
    """Tell whether a bag holds enough gifts.

    `state` is the per-type gift count of one bag. The bag qualifies when the
    total count is at least `min_nb_gifts` (3 by default).
    """
    return np.sum(state) >= min_nb_gifts

def is_available(state, available_gifts, gift_types=GIFT_TYPES):
    """Check that the stock can cover every gift a bag asks for.

    `state` gives the requested count per gift type, in the order of
    `gift_types`; `available_gifts` maps a gift type to its remaining stock.
    Returns False as soon as one type would go below zero, True otherwise.
    """
    return all(
        available_gifts[gift_type] - count >= 0
        for count, gift_type in zip(state, gift_types)
    )

def update_available_gifts(available_gifts, state, gift_types=GIFT_TYPES):
    """Remove the gifts used by one bag from the stock, in place.

    `state` gives the count taken per gift type, in the order of
    `gift_types`. Types are processed one after the other; an AssertionError
    is raised at the first type whose stock would become negative (types
    handled before it have already been decremented).
    """
    for count, gift_type in zip(state, gift_types):
        remaining = available_gifts[gift_type] - count
        assert remaining >= 0, "Found state is not available : {}, {}".format(state, available_gifts)
        available_gifts[gift_type] = remaining

        
def fill_bags3(sorted_scores, available_gifts):
    """Fill bags by cycling through candidate states (experimental variant).

    `sorted_scores` is a 2-D array whose column 0 holds the per-type gift
    counts of each candidate; `available_gifts` maps a gift type to its
    remaining stock and is decremented in place for every bag filled.

    The loop ends when N_BAGS bags are filled or when `stop_loop` reaches
    zero, i.e. after len(sorted_scores) consecutive candidates were refused.
    Returns the (N_BAGS, N_TYPES) uint8 matrix of bag compositions; bags that
    were never filled stay all-zero.
    """
    filled_bags = np.zeros((N_BAGS, N_TYPES), dtype=np.uint8)    
    last_score_computation = -1
    good_bag_index = 0
    bag_index = 0
    
    # Countdown of consecutive refusals; reset on every successful fill.
    stop_loop = len(sorted_scores)
    # NOTE(review): n_start is never reassigned, so every "restart" goes back to 0.
    n_start = 0
    # NOTE(review): next_group is never read.
    next_group = False
    
    state = sorted_scores[good_bag_index, 0]
    # NOTE(review): current_n is only set here, from the first candidate.
    current_n = np.sum(state)
    while bag_index < N_BAGS and stop_loop > 0:
        
#         good_bag_index = np.random.randint(len(sorted_scores))        
        if is_available(state, available_gifts) and has_min_nb_gifts(state):
#             print "bag index : ", bag_index, stop_loop        
            update_available_gifts(available_gifts, state, GIFT_TYPES)
            filled_bags[bag_index, :] = state
            bag_index += 1
            stop_loop = len(sorted_scores)            
        else:
            stop_loop -= 1            
        
        # NOTE(review): the index can equal len(sorted_scores) - 1 when the
        # loop starts over, in which case the lookup below is out of range.
        good_bag_index += 1
        state = sorted_scores[good_bag_index, 0]
        n = np.sum(state)
        # Fewer gifts than the first candidate: jump back to n_start.
        if n < current_n:
            good_bag_index = n_start
        

        # NOTE(review): second index update in the same iteration; `state` is
        # not re-read afterwards, so the candidate tested next is the one
        # fetched above, not the one at the final index.
        if good_bag_index < len(sorted_scores)-1:
            good_bag_index += 1
        else:
            good_bag_index = n_start
                
        # Progress report every 100 bags, at most once per bag count.
        if bag_index > 0 and (bag_index % 100) == 0 and last_score_computation < bag_index:
            s, r = score(filled_bags, return_rejected=True)
            print(bag_index, ">>> Current score: ", s, s * N_BAGS *1.0 / bag_index, "rejected=", r)
            last_score_computation = bag_index

        # NOTE(review): shares its guard with the report above, so it is
        # skipped when bag_index is a multiple of both 100 and 150.
        if bag_index > 0 and (bag_index % 150) == 0 and last_score_computation < bag_index:
            print(bag_index, ">>> Currently available gifts : ", [(k, available_gifts[k]) for k in GIFT_TYPES])
            last_score_computation = bag_index
    
    print "good_bag_index", good_bag_index
    return filled_bags


def fill_bags(sorted_scores, available_gifts):
    """Fill bags greedily from a ranked list of candidate states.

    `sorted_scores` is a 2-D array whose column 0 holds the per-type gift
    counts of each candidate, best candidate first. The current candidate is
    reused for as many bags as the stock allows; once it can no longer be
    served (or has too few gifts) the next candidate is tried.

    `available_gifts` maps a gift type to its remaining stock and is
    decremented in place for every bag filled.

    Returns the (N_BAGS, N_TYPES) uint8 matrix of bag compositions; bags left
    over when the candidates run out stay all-zero.
    """
    filled_bags = np.zeros((N_BAGS, N_TYPES), dtype=np.uint8)
    # Each report has its own guard so that one does not silence the other
    # when the bag count is a multiple of both 100 and 150.
    last_score_report = -1
    last_gifts_report = -1
    good_bag_index = 0
    bag_index = 0

    while bag_index < N_BAGS and good_bag_index < len(sorted_scores):

        state = sorted_scores[good_bag_index, 0]
        if is_available(state, available_gifts) and has_min_nb_gifts(state):
            update_available_gifts(available_gifts, state, GIFT_TYPES)
            filled_bags[bag_index, :] = state
            bag_index += 1
        else:
            good_bag_index += 1

        # The guards keep a report from repeating while candidates are being
        # skipped and bag_index does not move.
        if bag_index > 0 and (bag_index % 100) == 0 and last_score_report < bag_index:
            s, r = score(filled_bags, return_rejected=True)
            print(bag_index, ">>> Current score: ", s, s * N_BAGS *1.0 / bag_index, "rejected=", r)
            last_score_report = bag_index

        if bag_index > 0 and (bag_index % 150) == 0 and last_gifts_report < bag_index:
            print(bag_index, ">>> Currently available gifts : ", [(k, available_gifts[k]) for k in GIFT_TYPES])
            last_gifts_report = bag_index

    print("good_bag_index %s" % good_bag_index)
    return filled_bags

In [12]:
import pandas as pd
# One row per enumerated bag composition: its state, score and rejection ratio.
df_scores = pd.DataFrame(data=scores, columns=['state', 'score', 'rejected'])
df_scores.head()

Unnamed: 0,state,score,rejected
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0.0,0.0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 1]",9.96169,0.0
2,"[0, 0, 0, 0, 0, 0, 0, 0, 2]",19.8436,0.0015
3,"[0, 0, 0, 0, 0, 0, 0, 0, 3]",24.8407,0.0882
4,"[0, 0, 0, 0, 0, 0, 0, 1, 0]",5.02266,0.0


In [14]:
# Score truncated to an integer, giving a coarse ranking key.
df_scores.loc[:, 'score_int'] = df_scores['score'].apply(int)

In [28]:
# Rejection ratio as a whole percentage (truncated), giving a coarse ranking key.
df_scores.loc[:, 'r'] = df_scores['rejected'].apply(lambda x: int(x*100))

In [45]:
# df_scores.loc[:, 'n'] = df_scores['state'].apply(np.sum)

In [83]:
# def mean_std(state, count=100):
#     w = []
#     for c in range(count):
#         m = 0
#         for i, v in enumerate(state):
#             if v > 0:
#                 m += np.sum([weight_by_index(i) for j in range(v)])
#         w.append(m)
#     return np.mean(w), np.std(w)

# def compute_n_bags(state, available_gifts=AVAILABLE_GIFTS):
#     out = 1000
#     for i, v in enumerate(state):
#         if v > 0:
#             n = int(np.floor(available_gifts[GIFT_TYPES[i]] / v))
#             out = min(out, n)
#     return out
    
# # compute_n_bags(np.array([19, 0, 0, 0, 0, 0, 1, 0, 0]), AVAILABLE_GIFTS)
# df_scores.loc[:, 'n_bags'] = df_scores['state'].apply(compute_n_bags)

In [15]:
# df_scores.loc[:, 'mean_std'] = df_scores['state'].apply(mean_std)

In [86]:
# df_scores.loc[:, 'n_bags_x_score'] = df_scores['score'] * df_scores['n_bags']
# df_scores.loc[:, 'rejected_x_n'] = df_scores['rejected'] * df_scores['n'] * df_scores['n_bags'] / N_BAGS

In [29]:
# Rank by rejection percentage first, then by integer score, both descending
# (the highest rejection bucket comes first).
df_sorted_scores = df_scores.sort_values(['r', 'score_int'], ascending=False)

In [325]:
# def to_remove(s):
#     a = np.array([1, 0, 1, 0, 0, 1, 1, 1, 0])
# #     if (s == a).all():
# #         return False    
#     res = a * s
#     return np.sum(res) != 0

In [17]:
# a = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0])
# b = np.array([0, 0, 1, 2, 0, 1, 0, 1, 1])
# # b = (b > 0).astype(np.int)
# res = a * b
# res, np.sum(res) == 0

In [43]:
# Selection thresholds; only `r` (maximum rejection ratio) is used by the
# active filter below, `s` and `n` only appear in the disabled variants.
s, n, r = 0.0, 1201, 0.15
# df = df_sorted_scores[(df_sorted_scores['n_bags'] < n) & (df_sorted_scores['rejected'] < r) & (df_sorted_scores['score'] > s)]
# df.loc[:, 'to_remove'] = df['state'].apply(to_remove)
# df = df[(df['to_remove'] == False)] 
# Keep the candidates whose rejection ratio is strictly below r.
df = df_sorted_scores[(df_sorted_scores['rejected'] < r)]
df.shape, df['score'].min(), df['score'].max(), df.head(20)

((17154, 5),
 0.0,
 38.704855041006837,
                                state    score rejected  score_int   r
 487999  [13, 0, 1, 0, 0, 0, 1, 1, 0]  36.3937   0.1455         36  14
 488077  [13, 0, 1, 0, 0, 1, 1, 0, 0]  36.2026   0.1489         36  14
 438718   [8, 0, 1, 0, 0, 0, 0, 1, 1]  35.1197   0.1415         35  14
 438982   [8, 0, 1, 0, 0, 1, 0, 0, 1]  35.1566   0.1404         35  14
 455271   [9, 0, 1, 1, 0, 2, 1, 0, 0]  35.3871   0.1433         35  14
 475961  [11, 0, 1, 0, 0, 0, 3, 1, 0]  35.0893   0.1459         35  14
 476224  [11, 0, 1, 1, 0, 0, 2, 1, 0]  35.8415   0.1436         35  14
 476327  [11, 0, 1, 1, 0, 1, 2, 0, 0]  35.7445    0.145         35  14
 482991  [12, 0, 1, 0, 0, 1, 2, 0, 0]  35.9639   0.1417         35  14
 485084  [13, 0, 0, 1, 0, 2, 0, 1, 0]  35.4829   0.1404         35  14
 488231  [13, 0, 1, 2, 0, 0, 1, 0, 0]  35.4455   0.1404         35  14
 489280  [14, 0, 0, 1, 0, 0, 2, 2, 0]  35.1364   0.1437         35  14
 496752  [17, 0, 0, 2, 0, 0, 0, 1, 0]

In [44]:
print "\n\n--- Start bags filling with : min_score=", s, "rejected=", r
# df = df_sorted_scores[df_sorted_scores['score'] > s]
print "\n---", df.shape, df['score'].min(), df['score'].max(), df.head()
available_gifts = deepcopy(AVAILABLE_GIFTS)
print "\n--------------\n"
filled_bags = fill_bags(df.as_matrix(), available_gifts)



--- Start bags filling with : min_score= 0.0 rejected= 0.15

--- (17154, 5) 0.0 38.704855041                                state    score rejected  score_int   r
487999  [13, 0, 1, 0, 0, 0, 1, 1, 0]  36.3937   0.1455         36  14
488077  [13, 0, 1, 0, 0, 1, 1, 0, 0]  36.2026   0.1489         36  14
438718   [8, 0, 1, 0, 0, 0, 0, 1, 1]  35.1197   0.1415         35  14
438982   [8, 0, 1, 0, 0, 1, 0, 0, 1]  35.1566   0.1404         35  14
455271   [9, 0, 1, 1, 0, 2, 1, 0, 0]  35.3871   0.1433         35  14

--------------

(100, '>>> Current score: ', 3911.632786916693, 39116.327869166933, 'rejected=', 8.6300000000000008)
(150, '>>> Currently available gifts : ', [('ball', 0), ('bike', 500), ('blocks', 850), ('book', 1146), ('coal', 166), ('doll', 908), ('gloves', 2), ('horse', 812), ('train', 934)])
(200, '>>> Current score: ', 7382.5777971091766, 36912.888985545884, 'rejected=', 19.66)
(300, '>>> Current score: ', 10890.960964941469, 36303.203216471564, 'rejected=', 30.18)
(400, '

In [45]:
# Result of utils.score on the filled bags, followed by the leftover stock.
print score(filled_bags, return_rejected=True), available_gifts

(32553.371954560582, 75.189999999999998) {'horse': 0, 'ball': 0, 'blocks': 1, 'doll': 0, 'train': 1, 'coal': 166, 'bike': 500, 'gloves': 0, 'book': 0}


In [34]:
# Same report as the previous cell, recorded from an earlier run (In [34]).
print score(filled_bags, return_rejected=True), available_gifts

(33520.969648865925, 128.52000000000001) {'horse': 0, 'ball': 0, 'blocks': 1, 'doll': 0, 'train': 337, 'coal': 166, 'bike': 100, 'gloves': 0, 'book': 312}


In [229]:
def custom_fill_bags(sorted_scores, available_gifts):
    """Fill bags from a hand-written list of candidate states.

    `sorted_scores` is a 2-D array whose column 0 holds the per-type gift
    counts of each candidate. With stage 1 disabled (commented out below),
    only stage 2 runs, and it starts at candidate index 1, so the candidate
    at index 0 is never used.
    NOTE(review): confirm that skipping index 0 is intended while stage 1 is
    disabled.

    `available_gifts` maps a gift type to its remaining stock and is
    decremented in place for every bag filled.

    Returns the (N_BAGS, N_TYPES) uint8 matrix of bag compositions; bags left
    over when the candidates run out stay all-zero.
    """
    filled_bags = np.zeros((N_BAGS, N_TYPES), dtype=np.uint8)
    # Each report has its own guard so that one does not silence the other
    # when the bag count is a multiple of both 100 and 150.
    last_score_report = -1
    last_gifts_report = -1
    good_bag_index = 0
    bag_index = 0

#     print "-- stage 1"
#     while bag_index < N_BAGS-100:    
#         state = sorted_scores[0, 0]
#         if is_available(state, available_gifts) and has_min_nb_gifts(state):
# #             print "bag index : ", bag_index, stop_loop        
#             update_available_gifts(available_gifts, state, GIFT_TYPES)
#             filled_bags[bag_index, :] = state
#             bag_index += 1
                
#         if bag_index > 0 and (bag_index % 100) == 0 and last_score_computation < bag_index:
#             s, r = score(filled_bags, return_rejected=True)
#             print(bag_index, ">>> Current score: ", s, s * N_BAGS *1.0 / bag_index, "rejected=", r)
#             last_score_computation = bag_index

#         if bag_index > 0 and (bag_index % 150) == 0 and last_score_computation < bag_index:
#             print(bag_index, ">>> Currently available gifts : ", [(k, available_gifts[k]) for k in GIFT_TYPES])
#             last_score_computation = bag_index

    print("-- stage 2")
    good_bag_index = 1
    while bag_index < N_BAGS and good_bag_index < len(sorted_scores):

        state = sorted_scores[good_bag_index, 0]
        if is_available(state, available_gifts) and has_min_nb_gifts(state):
            update_available_gifts(available_gifts, state, GIFT_TYPES)
            filled_bags[bag_index, :] = state
            bag_index += 1
        else:
            good_bag_index += 1

        # The guards keep a report from repeating while candidates are being
        # skipped and bag_index does not move.
        if bag_index > 0 and (bag_index % 100) == 0 and last_score_report < bag_index:
            s, r = score(filled_bags, return_rejected=True)
            print(bag_index, ">>> Current score: ", s, s * N_BAGS *1.0 / bag_index, "rejected=", r)
            last_score_report = bag_index

        if bag_index > 0 and (bag_index % 150) == 0 and last_gifts_report < bag_index:
            print(bag_index, ">>> Currently available gifts : ", [(k, available_gifts[k]) for k in GIFT_TYPES])
            last_gifts_report = bag_index

    print("good_bag_index %s" % good_bag_index)
    return filled_bags

In [284]:
# Probe a single hand-picked bag: 2 gifts of type index 1 and 1 of type index 6.
score(([0, 2, 0, 0, 0, 0, 1, 0, 0],), return_rejected=True)

(28.133028727270752, 0.25)

In [292]:
# Probe a single hand-picked bag: 1 gift of type index 4 and 2 of type index 6.
score(([0, 0, 0, 0, 1, 0, 2, 0, 0],), return_rejected=True)

(23.03569290913374, 0.040000000000000001)

In [287]:
# Hand-written candidates, one row each: [per-type counts, score, rejected].
# The rows mix a list with scalars, so the array must hold Python objects;
# stating dtype=object keeps the (n_candidates, 3) layout explicit and is
# required by NumPy versions that refuse to build ragged arrays implicitly.
custom_states = np.array([
    [[1, 0, 1, 1, 0, 1, 0, 1, 1], 34.5248, 0.0223],
    [[0, 0, 0, 0, 1, 0, 2, 0, 0], 22.731160986729527, 0.05],
], dtype=object)

In [232]:
# Inspect the bag composition matrix produced by the last fill.
filled_bags

array([[0, 0, 0, ..., 2, 0, 0],
       [0, 0, 0, ..., 2, 0, 0],
       [0, 0, 0, ..., 2, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [230]:
# Fill the bags from the hand-written candidates.
# `s` and `r` are whatever the kernel currently holds; the recorded output
# (min_score= 30, rejected= 0.5) differs from the values assigned in In [43].
print "\n\n--- Start bags filling with : min_score=", s, "rejected=", r
# df = df_sorted_scores[df_sorted_scores['score'] > s]
print "\n---", custom_states.shape, custom_states[:10,:]
# Work on a copy so the reference stock stays untouched.
available_gifts = deepcopy(AVAILABLE_GIFTS)
print "\n--------------\n"
filled_bags = custom_fill_bags(custom_states, available_gifts)



--- Start bags filling with : min_score= 30 rejected= 0.5

--- (2, 3) [[[1, 0, 1, 1, 0, 1, 0, 1, 1] 34.5248 0.0223]
 [[0, 0, 0, 0, 1, 0, 2, 0, 0] 22.731160986729527 0.05]]

--------------

-- stage 2
(100, '>>> Current score: ', 2350.8876627271939, 23508.87662727194, 'rejected=', 5.5199999999999996)
good_bag_index 2


In [15]:
# Re-rank every candidate by score alone, highest first.
df_sorted_scores = df_scores.sort_values(['score'], ascending=False)

In [17]:
print "\n\n--- Start bags filling with : "
# df = df_sorted_scores[df_sorted_scores['score'] > s]
print "\n---", 
available_gifts = deepcopy(AVAILABLE_GIFTS)
print "\n--------------\n"
filled_bags = fill_bags(df_sorted_scores.as_matrix(), available_gifts)

 

--- Start bags filling with : 

--- 
--------------

(100, '>>> Current score: ', 3987.3176082544755, 39873.176082544756, 'rejected=', 1.04)
(150, '>>> Currently available gifts : ', [('ball', 0), ('bike', 500), ('blocks', 850), ('book', 1200), ('coal', 166), ('doll', 941), ('gloves', 141), ('horse', 792), ('train', 942)])
(200, '>>> Current score: ', 7540.3574631145821, 37701.78731557291, 'rejected=', 6.0)
(300, '>>> Current score: ', 11113.099421386456, 37043.664737954852, 'rejected=', 11.0)
(400, '>>> Current score: ', 14647.710950296707, 36619.277375741767, 'rejected=', 16.68)
(450, '>>> Currently available gifts : ', [('ball', 0), ('bike', 500), ('blocks', 550), ('book', 1041), ('coal', 166), ('doll', 641), ('gloves', 0), ('horse', 192), ('train', 642)])
(500, '>>> Current score: ', 18216.310680597559, 36432.621361195117, 'rejected=', 22.68)
(600, '>>> Current score: ', 21765.786678096811, 36276.311130161354, 'rejected=', 28.449999999999999)
(700, '>>> Current score: ', 25364.1

In [30]:
# Result of utils.score on the bags filled from the score-ranked candidates.
print score(filled_bags, return_rejected=True)

(34718.508243033692, 58.640000000000001)


In [77]:
def another_score(bags):
    """Re-estimate the total score of a set of bags from the sampled weights.

    Runs 10 repetitions. In each one, every bag gets a random starting row of
    GIFT_WEIGHTS; for each gift type present in the bag, as many consecutive
    rows as there are gifts are summed (the starting row is pulled back when
    it would run past the end, and stays pulled back for the following
    types). A bag counts for its weight when that weight is strictly below
    MAX_WEIGHT, and for nothing otherwise.

    Returns `(mean + std, mean, std)` of the 10 totals.
    """
    totals = []
    for _ in range(10):
        total = 0
        for bag in bags:
            row = np.random.randint(GIFT_WEIGHTS.shape[0])
            bag_total = 0
            for type_index, count in enumerate(bag):
                if not count > 0:
                    continue
                # Keep the slice inside the sampled rows.
                row = min(GIFT_WEIGHTS.shape[0]-count, row)
                bag_total += np.sum(GIFT_WEIGHTS[row:row+count, type_index])
            total += bag_total if bag_total < MAX_WEIGHT else 0.0
        totals.append(total)
    mean_total = np.mean(totals)
    std_total = np.std(totals)
    return mean_total + std_total, mean_total, std_total

In [82]:
# Cross-check the filled bags with the resampling estimate: (mean + std, mean, std).
print another_score(filled_bags)

(35236.316366666593, 34944.79676573613, 291.51960093046631)


In [40]:
def to_submission(state, available_gifts, gift_types):
    """Turn per-bag gift counts into lists of gift identifiers.

    `state` is a sequence of bags, each a sequence of counts ordered like
    `gift_types`; `available_gifts` maps a gift type to its total stock.
    Identifiers look like "<type>_<number>" and are handed out from the
    highest number (stock - 1) downwards, each one used once.

    Returns one list of identifiers per bag. Raises AssertionError when a
    type is requested more often than its stock allows.
    """
    remaining = [available_gifts[name] for name in gift_types]
    bags_out = []
    for bag in state:
        labels = []
        for type_index, count in enumerate(bag):
            name = gift_types[type_index]
            for _ in range(count):
                # Consume the next identifier of this type, counting down.
                remaining[type_index] -= 1
                assert remaining[type_index] >= 0, "Gift index is negative"
                labels.append(name + '_%i' % remaining[type_index])
        bags_out.append(labels)
    return bags_out

def write_submission(state, filename):
    """Write the bags to `filename` in the submission text format.

    The file starts with a "Gifts" header line, followed by one line per bag
    containing its gift identifiers separated by single spaces.
    """
    with open(filename, 'w') as out:
        out.write("Gifts\n")
        out.writelines(' '.join(bag) + '\n' for bag in state)
    


In [41]:
from datetime import datetime
# Time-stamped output path (minute resolution) under ../results.
submission_file = '../results/submission_' + \
                  str(datetime.now().strftime("%Y-%m-%d-%H-%M")) + \
                  '.csv'  
        
# Identifiers are numbered from the full reference stock, not from the
# stock left over after filling.
submission = to_submission(filled_bags, AVAILABLE_GIFTS, GIFT_TYPES)    
write_submission(submission, submission_file)        

### Some results 

- submission_2017-01-29-13-59.csv
- random shuffle, s = 22
- Local : 33224.557601849941, 84.790000000000006
- Kaggle : 33534.45962
