In [1]:
"""
__author__ = 'ZFTurbo: https://kaggle.com/zfturbo'
Fork of ZFTurbo 'Mass hashes' code : https://www.kaggle.com/zfturbo/santander-product-recommendation/mass-hashes/code

Added personal recommendations based on previous user's choices

"""
import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.DEBUG)

from datetime import datetime
from operator import itemgetter

from copy import deepcopy
import numpy as np

# Project
from zfturbo_script_mass_hashes_personal_recommendations import read_data, get_profiles, \
            personal_recommendations_to_proba, \
            common_recommendations_to_proba, \
            get_target_labels, predict_score, process_row, \
            ZFTURBO_COMMON_WEIGHT    

### Define train/test files

In [2]:
# Training extract read by the cells below (the file name suggests it spans 2016-01 to 2016-05).
train_filename = "../data/train_ver2_201601-201605.csv"
# test_filename = "../data/test_ver2.csv"
# Leaving this as None disables the submission step at the end of the notebook.
test_filename = None

### Compute recommendations from training data

In [4]:
logging.info('--- Run solution ---')

nb_months_validation = 2

# Both `read_data` calls consume the same file handle, one after the other.
# The `with` block guarantees the handle is released even if one of them raises.
with open(train_filename, "r") as reader:
    target_labels = get_target_labels(reader.readline())

    # Read data and create recommendations structures

    # First pass: structures that will be scored against the validation rows.
    (personal_recommendations_validation,
     common_recommendations_validation,
     product_stats_validation) = read_data(reader, 201603, nb_months_validation,
                                           process_row, get_profiles)

    logging.debug("-- common_recommendations_validation : %s ", len(common_recommendations_validation))
    logging.debug("-- personal_recommendations_validation : %s ", len(personal_recommendations_validation))
    logging.debug("-- product_stats_validation : %s ", len(product_stats_validation))

    # Deep copies, so that the objects handed to the second `read_data` call
    # are not the same objects as the validation structures kept above.
    personal_recommendations = deepcopy(personal_recommendations_validation)
    common_recommendations = deepcopy(common_recommendations_validation)
    product_stats = deepcopy(product_stats_validation)

    # Second pass: one more month on top of the copies; the raw rows of that
    # month are returned as `validation_data`.
    (personal_recommendations,
     common_recommendations,
     product_stats,
     validation_data) = read_data(reader, 201605, 1,
                                  process_row,
                                  get_profiles,
                                  return_raw_data=True,
                                  personal_recommendations=personal_recommendations,
                                  common_recommendations=common_recommendations,
                                  product_stats=product_stats)

    logging.debug("-- common_recommendations : %s ", len(common_recommendations))
    logging.debug("-- personal_recommendations : %s ", len(personal_recommendations))
    logging.debug("-- product_stats : %s ", len(product_stats))

# NOTE(review): the validation structures were built from `nb_months_validation`
# months and the full ones from one month more, yet the second argument below is
# the other way round. If that argument is a number of months, these two values
# look swapped - confirm against `personal_recommendations_to_proba`.
personal_recommendations_to_proba(personal_recommendations, nb_months_validation)
personal_recommendations_to_proba(personal_recommendations_validation, nb_months_validation+1)

common_recommendations_to_proba(common_recommendations)
common_recommendations_to_proba(common_recommendations_validation)

# Sort product stats:
# from here on both names hold lists of (key, value) pairs, largest value first.
product_stats_validation = sorted(product_stats_validation.items(), key=itemgetter(1), reverse=True)
product_stats = sorted(product_stats.items(), key=itemgetter(1), reverse=True)

INFO:root:--- Run solution ---
INFO:root:- READ DATA : months to read ['2016-03', '2016-04']
INFO:root:-- Processed 100000 lines . Current month : 2016-03
INFO:root:-- Processed 200000 lines . Current month : 2016-03
INFO:root:-- Processed 300000 lines . Current month : 2016-03
INFO:root:-- Processed 400000 lines . Current month : 2016-03
INFO:root:-- Processed 500000 lines . Current month : 2016-03
INFO:root:-- Processed 600000 lines . Current month : 2016-03
INFO:root:-- Processed 700000 lines . Current month : 2016-03
INFO:root:-- Processed 800000 lines . Current month : 2016-03
INFO:root:-- Processed 900000 lines . Current month : 2016-03
INFO:root:-- Processed 1000000 lines . Current month : 2016-04
INFO:root:-- Processed 1100000 lines . Current month : 2016-04
INFO:root:-- Processed 1200000 lines . Current month : 2016-04
INFO:root:-- Processed 1300000 lines . Current month : 2016-04
INFO:root:-- Processed 1400000 lines . Current month : 2016-04
INFO:root:-- Processed 1500000 lin

### Search for the personal recommendations weight that maximizes the score

In [None]:
from scipy import optimize

In [None]:
# Search a minimum :
# f = lambda x: -predict_score(validation_data,
#                      get_profiles,
#                      personal_recommendations_validation,
#                      common_recommendations_validation,
#                      product_stats_validation,
#                      x)

# ret = optimize.fmin(f, 0.5, full_output=True, xtol=0.001, ftol=0.001)

In [None]:
# print "Found maximum score : ", ret[1], " at ", ret[0]
# map7 = ret[1]

In [5]:
# Score of the validation-period structures on `validation_data` when the
# personal recommendations weight (last argument) is 0.0.
map7 = predict_score(
    validation_data,
    process_row,
    get_profiles,
    personal_recommendations_validation,
    common_recommendations_validation,
    product_stats_validation,
    0.0,
)
print(map7)

DEBUG:root:-- predict_score : personal_recommendations_weight=0.0
DEBUG:root:--- predict_score : map7=0.0209900993714


0.0209900993714


In [None]:
# map7 = predict_score(validation_data,
#                      get_profiles,
#                      personal_recommendations_validation,
#                      common_recommendations_validation,
#                      product_stats_validation,
#                      0.1)
# print map7

In [None]:
# map7 = predict_score(validation_data,
#                      get_profiles,
#                      personal_recommendations_validation,
#                      common_recommendations_validation,
#                      product_stats_validation,
#                      0.2)
# print map7

In [None]:
# map7 = predict_score(validation_data,
#                      get_profiles,
#                      personal_recommendations_validation,
#                      common_recommendations_validation,
#                      product_stats_validation,
#                      0.3)
# print map7

In [None]:
# map7 = predict_score(validation_data,
#                      get_profiles,
#                      personal_recommendations_validation,
#                      common_recommendations_validation,
#                      product_stats_validation,
#                      0.7)
# print map7

In [None]:
from zfturbo_script_mass_hashes_personal_recommendations import compute_predictions, get_real_values, apk

def _predict_score(validation_data, get_profiles_func,
                  personal_recommendations,
                  common_recommendations,
                  product_stats,
                  personal_recommendations_weight,
                  max_rows=25):
    """Debug variant of `predict_score`: mean `apk` score over validation rows.

    For every scored row the predictions come from `compute_predictions` and
    the expected targets from `get_real_values`; the two are compared with
    `apk` and the scores are averaged.

    :param validation_data: iterable of raw rows; `row[1]` is printed in the
        per-row trace.
    :param get_profiles_func: forwarded unchanged to `compute_predictions`.
    :param personal_recommendations: forwarded to `compute_predictions` and
        `get_real_values`.
    :param common_recommendations: forwarded to `compute_predictions`.
    :param product_stats: forwarded to `compute_predictions`.
    :param personal_recommendations_weight: forwarded to `compute_predictions`.
    :param max_rows: score at most this many leading rows and print a trace
        line for each of them (default 25, the limit that used to be
        hard-coded). Pass `None` to score every row without the trace.
    :return: sum of the scores divided by the number of rows actually scored,
        or 0.0 when no row was scored.
    """
    logging.debug("-- predict_score : personal_recommendations_weight=%s" % personal_recommendations_weight)
    map7 = 0.0
    nb_scored = 0
    for i, row in enumerate(validation_data):
        if max_rows is not None and nb_scored >= max_rows:
            break

        predicted = compute_predictions(row, get_profiles_func,
                                        personal_recommendations,
                                        common_recommendations,
                                        product_stats,
                                        personal_recommendations_weight)

        real = get_real_values(row, personal_recommendations)
        score = apk(real, predicted)
        if max_rows is not None:
            # Single-argument print: same text under Python 2 and Python 3.
            print("-- i :  %s %s  score :  %s  | predicted :  %s , real :  %s"
                  % (i, row[1], score, predicted, real))
        map7 += score
        nb_scored += 1

    # Normalise by the rows that were really scored. Dividing by
    # len(validation_data) after an early stop shrank the result towards zero.
    if nb_scored > 0:
        map7 /= nb_scored

    logging.debug("--- predict_score : map7=%s" % map7)
    return map7

In [None]:
# map7 = _predict_score(validation_data,
#                      get_profiles,
#                      personal_recommendations_validation,
#                      common_recommendations_validation,
#                      product_stats_validation,
#                      0.0)
# print map7

### Check ZFTurbo code

In [None]:
from common import get_user, apk, get_real_values, get_choices

def zfturbo_compute_predictions(row, get_profiles_func,
                        best,
                        personal_recommendations,
                        product_stats):
    """Rank up to 7 targets for one row by rank-based votes across its profiles.

    Every profile of the row that is a key of `best` votes for the targets
    listed under it: the entry at position `rank` adds
    `24 - rank + len(profile)` to that target's tally. Targets flagged with 1
    in the user's `last_choice` get no votes. The 7 highest tallies are kept
    when strictly positive; if fewer than 7 remain, the list is completed in
    `product_stats` order, again skipping flagged or already selected targets.

    :param row: raw row, passed to `get_user` and `get_profiles_func`.
    :param get_profiles_func: callable returning the profiles of `row`.
    :param best: mapping profile -> ordered sequence whose items carry the
        target index at position 0.
    :param personal_recommendations: mapping user -> dict holding a
        'last_choice' entry indexable by target.
    :param product_stats: ordered sequence whose items carry the target index
        at position 0.
    :return: list of at most 7 target indices.
    """
    import heapq

    client = get_user(row)
    client_profiles = get_profiles_func(row)

    owned = None
    if client in personal_recommendations:
        owned = personal_recommendations[client]['last_choice']

    def _is_owned(target):
        # Users without history (owned is None) have nothing to exclude.
        return owned is not None and owned[target] == 1

    votes = [0] * 24
    for profile in client_profiles:
        if profile not in best:
            continue
        for rank, entry in enumerate(best[profile]):
            weight = 24 - rank + len(profile)
            target = entry[0]
            if not _is_owned(target):
                votes[target] += weight

    leaders = heapq.nlargest(7, range(len(votes)), votes.__getitem__)
    predicted = [target for target in leaders if votes[target] > 0]

    # add suggestions from product_stats:
    if len(predicted) < 7:
        for product in product_stats:
            target = product[0]
            if _is_owned(target) or target in predicted:
                continue
            predicted.append(target)
            if len(predicted) == 7:
                break

    return predicted

In [None]:
def zfturbo_predict_score(validation_data, get_profiles_func,
                  common_recommendations,
                  personal_recommendations,
                  product_stats,
                  max_rows=25):
    """Mean `apk` score of `zfturbo_compute_predictions` over validation rows.

    :param validation_data: iterable of raw rows; `row[1]` is printed in the
        per-row trace.
    :param get_profiles_func: forwarded to `zfturbo_compute_predictions`.
    :param common_recommendations: passed as the `best` argument of
        `zfturbo_compute_predictions`.
    :param personal_recommendations: forwarded to
        `zfturbo_compute_predictions` and `get_real_values`.
    :param product_stats: forwarded to `zfturbo_compute_predictions`.
    :param max_rows: score at most this many leading rows and print a trace
        line for each of them (default 25, the limit that used to be
        hard-coded). Pass `None` to score every row without the trace.
    :return: sum of the scores divided by the number of rows actually scored,
        or 0.0 when no row was scored.
    """
    logging.debug("-- zfturbo_predict_score")
    map7 = 0.0
    nb_scored = 0
    for i, row in enumerate(validation_data):
        if max_rows is not None and nb_scored >= max_rows:
            break

        predicted = zfturbo_compute_predictions(row, get_profiles_func,
                                        common_recommendations,
                                        personal_recommendations,
                                        product_stats)
        real = get_real_values(row, personal_recommendations)
        score = apk(real, predicted)
        if max_rows is not None:
            # Single-argument print: same text under Python 2 and Python 3.
            print("-- i :  %s %s  score :  %s  | predicted :  %s , real :  %s"
                  % (i, row[1], score, predicted, real))
        map7 += score
        nb_scored += 1

    # Normalise by the rows that were really scored. Dividing by
    # len(validation_data) after an early stop shrank the result towards zero.
    if nb_scored > 0:
        map7 /= nb_scored

    logging.debug("--- predict_score : map7=%s" % map7)
    return map7

In [None]:
import operator
def sort_common_recommendations(common_recommendations):
    """Turn each profile's {target: value} dict into a ranked list of pairs.

    :param common_recommendations: mapping profile -> dict of target -> value,
        where the inner dict may also hold a 'total' entry.
    :return: new dict mapping each profile to its (target, value) pairs sorted
        by decreasing value, with the 'total' pair left out.
    """
    def _ranked_pairs(target_values):
        ordered = sorted(target_values.items(), key=lambda pair: pair[1], reverse=True)
        # remove 'total'
        return [pair for pair in ordered if pair[0] != 'total']

    return {profile: _ranked_pairs(common_recommendations[profile])
            for profile in common_recommendations}
# Per-profile ranked (target, value) lists built from the validation-period structures.
best_validation = sort_common_recommendations(common_recommendations_validation)

In [None]:
#best_validation.items()[:100]
# print common_recommendations_validation[(3, 'ES', 'H', 2, 'NA', 1530074)]
# print best_validation[(3, 'ES', 'H', 2, 'NA', 1530074)]

Compare common prediction methods:

In [None]:
# zfturbo_compute_predictions(validation_data[20], get_profiles,
#                                         best_validation,
#                                         product_stats_validation)

# Step-by-step replay of `zfturbo_compute_predictions` on a single validation
# row, printing the intermediate tallies.
row = validation_data[115]
get_profiles_func = get_profiles


predicted = []
user = get_user(row)
profiles = get_profiles_func(row)

last_choice = None
if user in personal_recommendations_validation:
    last_choice = personal_recommendations_validation[user]['last_choice']

def _get_next_best_prediction(best, profiles, predicted, last_choice):
    """Return up to 7 targets ranked by rank-based votes, tracing each step.

    Each profile found in `best` adds `24 - i + len(profile)` to the tally of
    the target stored at position `i` of its list, unless `last_choice` flags
    that target with 1 or the target is already in `predicted`.
    """
    import heapq
    score = [0] * 24
    for h in profiles:
        if h in best:
            for i in range(len(best[h])):
                sc = 24 - i + len(h)
                index = best[h][i][0]
                if last_choice is not None:
                    if last_choice[index] == 1:
                        continue
                if index not in predicted:
                    score[index] += sc
            # Single-argument prints: same text under Python 2 and Python 3.
            print("--> score :  %s" % (score,))

    print("\n -- score :  %s" % (score,))

    final = []
    pred = heapq.nlargest(7, range(len(score)), score.__getitem__)
    print("\n -- pred :  %s" % (pred,))
    for i in range(7):
        if score[pred[i]] > 0:
            final.append(pred[i])
    print("\n -- final :  %s" % (final,))
    return final

predicted = _get_next_best_prediction(best_validation, profiles, predicted, last_choice)

print("\n- PREDICTED :  %s" % (predicted,))
# add suggestions from product_stats:
# Use the validation-period stats, like every other structure in this cell.
# `product_stats` also covers the month the validation rows come from.
if len(predicted) < 7:
    for product in product_stats_validation:
        # If user is not new
        if last_choice is not None and last_choice[product[0]] == 1:
            continue

        if product[0] not in predicted:
            predicted.append(product[0])
            if len(predicted) == 7:
                break

print("FINAL PREDICTED :  %s" % (predicted,))

In [None]:
[23, 4, 12, 7, 22, 21, 8]

In [None]:
# predicted1 = compute_predictions(row, get_profiles_func,
#                                 _personal_recommendations,
#                                 _common_recommendations,
#                                 _product_stats,
#                                 personal_recommendations_weight)


# Step-by-step replay of the blended (common + personal) ranking on a single
# validation row, printing the intermediate weights.
row = validation_data[156]
get_profiles_func = get_profiles
_personal_recommendations = personal_recommendations_validation
_common_recommendations = common_recommendations_validation
_product_stats = product_stats_validation
personal_recommendations_weight = 0.5


predicted = []
user = get_user(row)
profiles = get_profiles_func(row)

last_choice = None
if user in _personal_recommendations:
    last_choice = _personal_recommendations[user]['last_choice']

target_weights = None
total_length = 0.0
total_count = 0
# compute the total length of the participating profiles to define each profile's weight
for profile in profiles:
    if profile in _common_recommendations:
        total_length += len(profile)
        total_count += 1

if total_length > 0:
    target_weights = np.zeros(24)

# Mix between the rank-based score (zfturbo) and the probability-based one.
zfturbo_common_weight = 1.0
mine_common_weight = 1.0 - zfturbo_common_weight

for profile in profiles:
    if profile in _common_recommendations:
        profile_weight = len(profile) * 1.0 / total_length
        # _common_recommendations[profile].items() -> [(target, proba)]
        target_probas = sorted(_common_recommendations[profile].items(), key=itemgetter(1), reverse=True)

        target_total_score = (24.0 + len(profile)) * total_count
        # NOTE(review): non-integer keys are skipped below but still consume a
        # rank `i`, whereas `sort_common_recommendations` drops 'total' before
        # ranks are used - confirm whether this offset is intended.
        for i, (target, proba) in enumerate(target_probas):
            target_score = 24 - i + len(profile)
            if isinstance(target, int):
                p1 = proba * profile_weight * mine_common_weight
                p2 = target_score * 1.0 / target_total_score * zfturbo_common_weight
                target_weights[target] += p1 + p2
        # Single-argument prints: same text under Python 2 and Python 3.
        print("-> target_weights:  %s %s" % (target_weights[2], target_weights[23]))


personal_predictions = None
if user in _personal_recommendations:
    personal_predictions = _personal_recommendations[user]['recommendations']

print("Common : {}".format(target_weights))
print("Personal : {}".format(personal_predictions))

# Either side can be missing (no matching profile / unknown user); treat a
# missing side as all zeros instead of failing on `float * None`.
common_part = target_weights if target_weights is not None else np.zeros(24)
personal_part = personal_predictions if personal_predictions is not None else np.zeros(24)
suggestions = (1.0 - personal_recommendations_weight) * common_part + personal_recommendations_weight * personal_part
print(last_choice)
if last_choice is not None:
    # 1 where last_choice is 0 and 0 where it is 1: zeroes flagged targets.
    mask = np.abs(last_choice - 1)
    suggestions *= mask

print("\n\n Predictions : {}".format(suggestions))

print(np.argsort(suggestions)[::-1].tolist()[:7])

In [None]:
# Library prediction for the current `row`, using the validation-period
# structures and a personal recommendations weight of 0.0.
(get_profiles_func,
 _personal_recommendations,
 _common_recommendations,
 _product_stats,
 personal_recommendations_weight) = (get_profiles,
                                     personal_recommendations_validation,
                                     common_recommendations_validation,
                                     product_stats_validation,
                                     0.0)

compute_predictions(row, get_profiles_func,
                    _personal_recommendations, _common_recommendations, _product_stats,
                    personal_recommendations_weight)

In [None]:
# Score the ZFTurbo ranking on the validation rows.
map7 = zfturbo_predict_score(
    validation_data, get_profiles,
    best_validation,
    personal_recommendations_validation,
    product_stats_validation,
)

print(map7)

In [None]:
# Same scoring call as the cell above, kept as a separate run.
map7 = zfturbo_predict_score(
    validation_data, get_profiles,
    best_validation,
    personal_recommendations_validation,
    product_stats_validation,
)

print(map7)

In [None]:
ZFTURBO_COMMON_WEIGHT

In [None]:
# NOTE(review): this only rebinds the notebook-level name. It was brought in with
# `from zfturbo_script_mass_hashes_personal_recommendations import ...`, so functions
# defined in that module presumably keep reading the module's own value - confirm.
ZFTURBO_COMMON_WEIGHT = 0.0

In [None]:
# def _predict_score(validation_data, get_profiles_func,
#                   personal_recommendations,
#                   common_recommendations,
#                   product_stats,
#                   personal_recommendations_weight):

# Score the library ranking (1) and the ZFTurbo ranking (2) on the same rows.
get_profiles_func = get_profiles
_personal_recommendations = personal_recommendations_validation
_common_recommendations = common_recommendations_validation
_product_stats = product_stats_validation
personal_recommendations_weight = 0.0

# None: score every row silently. A positive integer: stop after that many
# rows and print every row on which the two scores differ.
max_rows = None

logging.debug("-- predict_score : personal_recommendations_weight=%s" % personal_recommendations_weight)
map7_1 = 0.0
map7_2 = 0.0
nb_scored = 0

for i, row in enumerate(validation_data):
    if max_rows is not None and nb_scored >= max_rows:
        break

    predicted1 = compute_predictions(row, get_profiles_func,
                                    _personal_recommendations,
                                    _common_recommendations,
                                    _product_stats,
                                    personal_recommendations_weight)

    predicted2 = zfturbo_compute_predictions(row, get_profiles_func,
                                        best_validation,
                                        _personal_recommendations,
                                        _product_stats)

    real = get_real_values(row, _personal_recommendations)
    score1 = apk(real, predicted1)
    score2 = apk(real, predicted2)
    if max_rows is not None and score1 != score2:
        # Single-argument prints: same text under Python 2 and Python 3.
        print("-- i :  %s %s" % (i, row[1]))
        print("--- p1 :  %s %s %s" % (score1, predicted1, real))
        print("--- p2 :  %s %s" % (score2, predicted2))
    map7_1 += score1
    map7_2 += score2
    nb_scored += 1

# Normalise by the rows that were really scored, so a row limit does not
# deflate the averages.
if nb_scored > 0:
    map7_1 /= nb_scored
    map7_2 /= nb_scored

print("%s %s" % (map7_1, map7_2))

In [None]:
2.33026234774e-05

In [None]:
[7, 23, 12, 17, 4, 8, 18]

### Write a submission

In [None]:
# A bare `return` is a SyntaxError at the top level of a cell, so the whole
# step is guarded instead.
if test_filename is not None:
    logging.info('- Generate submission')
    submission_file = '../results/submission_' + \
                      datetime.now().strftime("%Y-%m-%d-%H-%M") + \
                      '.csv'

    # NOTE(review): `write_submission` is not imported anywhere in the visible
    # part of this notebook - add it to the project import cell before running.

    # The test file is opened first, so a missing input does not leave an empty
    # submission file behind. Both handles are closed even if writing fails.
    with open(test_filename, "r") as reader:
        with open(submission_file, "w") as writer:
            # skip header:
            reader.readline()

            write_submission(writer, reader, target_labels, get_profiles, personal_recommendations, common_recommendations, product_stats)