In [80]:
"""
__author__ = 'ZFTurbo: https://kaggle.com/zfturbo'
Fork of ZFTurbo 'Mass hashes' code : https://www.kaggle.com/zfturbo/santander-product-recommendation/mass-hashes/code

Added personal recommendations based on each user's previous choices

"""
import logging
# Remove any handlers already attached to the root logger so that the
# basicConfig call below takes effect: basicConfig does nothing when the
# root logger already has handlers (for instance when this cell is re-run).
logging.getLogger().handlers = []
logging.basicConfig(level=logging.DEBUG)

from datetime import datetime
from operator import itemgetter

from copy import deepcopy
import numpy as np

# Project
from zfturbo_script_mass_hashes_personal_recommendations import read_data, \
            personal_recommendations_to_proba, \
            common_recommendations_to_proba, \
            get_target_labels, predict_score, process_row, \
            ZFTURBO_COMMON_WEIGHT, MINE_COMMON_WEIGHT    

### Define train/test files

In [3]:
# Path of the training CSV; it is opened further down and fed to read_data.
train_filename = "../data/train_ver2_201601-201605.csv"
# test_filename = "../data/test_ver2.csv"
# No test file is configured: the path above is disabled and None is used.
test_filename = None

### Compute recommendations from training data

In [12]:
def get_profiles(row):
    """Build the full list of profile tuples for one data row.

    The first 24 entries of ``row`` are unpacked positionally into the named
    fields below; any entries beyond the 24th are ignored.  Every profile is
    a tuple whose first element is a numeric profile id and whose remaining
    elements are a selection of those fields, copied unchanged (``renta`` is
    used as-is, no bucketing is applied).

    :param row: sliceable sequence holding at least 24 values.
    :return: list of 18 tuples: ids 0, 1, 2, 10, 11, 12 and 100 first, then
        the ZFTurbo ids 1001-1009, 10010 and 10011.
    :raises ValueError: if ``row`` holds fewer than 24 values.
    """
    # Fields prefixed with an underscore are unpacked only to keep the
    # positional layout readable; no profile uses them.
    (_fecha_dato, ncodpers, ind_empleado,  # 0
     pais_residencia, sexo, age,  # 3
     _fecha_alta, ind_nuevo, antiguedad,  # 6
     indrel, _ult_fec_cli_1t, indrel_1mes,  # 9
     _tiprel_1mes, indresi, _indext,  # 12
     _conyuemp, canal_entrada, _indfall,  # 15
     _tipodom, _cod_prov, nomprov,  # 18
     ind_actividad_cliente, renta, segmento) = row[:24]

    own_profiles = [
        (0, pais_residencia, nomprov, sexo, age, renta, segmento, ind_empleado),
        (1, pais_residencia, nomprov, renta, ind_empleado),
        (2, sexo, age, renta, segmento),
        (10, antiguedad, indrel_1mes, indrel, indresi,
         canal_entrada, ind_actividad_cliente, ind_nuevo),
        (11, antiguedad, indrel_1mes, indrel, indresi),
        (12, canal_entrada, ind_actividad_cliente, ind_nuevo),
        (100, sexo, age, renta, antiguedad, indrel, ind_actividad_cliente),
    ]

    # ZFTurbo profiles 1002-1009 all share this prefix and differ only in
    # the field(s) appended after it.
    shared_prefix = (pais_residencia, sexo, age, segmento)
    suffix_by_id = (
        (1002, (nomprov,)),
        (1003, (ncodpers,)),
        (1004, (antiguedad,)),
        (1005, (ind_nuevo,)),
        (1006, (ind_actividad_cliente,)),
        (1007, (canal_entrada,)),
        (1008, (ind_nuevo, canal_entrada)),
        (1009, (ind_empleado,)),
    )

    zfturbo_profiles = [
        (1001, pais_residencia, sexo, age, ind_nuevo, segmento,
         ind_empleado, ind_actividad_cliente, indresi),
    ]
    zfturbo_profiles.extend(
        (profile_id,) + shared_prefix + suffix
        for profile_id, suffix in suffix_by_id
    )
    zfturbo_profiles.append((10010, pais_residencia, sexo, renta, age, segmento))
    zfturbo_profiles.append((10011, sexo, age, segmento))

    return own_profiles + zfturbo_profiles

In [13]:
logging.info('--- Run solution ---')

# The handle is intentionally left open here: a later cell reads further
# from it through read_data, and another one closes it with reader.close().
reader = open(train_filename, "r")
# The first line of the file is handed to get_target_labels.
target_labels = get_target_labels(reader.readline())

# Build the recommendation structures used for validation.
nb_months_validation = 1
(personal_recommendations_validation,
 common_recommendations_validation,
 product_stats_validation) = read_data(
    reader,
    201603,
    nb_months_validation,
    process_row,
    get_profiles)

# Report the size of each structure (arguments are formatted by logging).
logging.debug("-- common_recommendations_validation : %s ",
              len(common_recommendations_validation))
logging.debug("-- personal_recommendations_validation : %s ",
              len(personal_recommendations_validation))
logging.debug("-- product_stats_validation : %s ",
              len(product_stats_validation))


INFO:root:--- Run solution ---
INFO:root:- READ DATA : months to read ['2016-03']
DEBUG:root:--- Time analysis : mean row processing : 2.85895442963e-05, mean update : 5.1945567131e-05
INFO:root:-- Processed 100000 lines : Elapsed 29.8093628883 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 1.4491935865e-05, mean update : 2.75006662315e-05
INFO:root:-- Processed 200000 lines : Elapsed 9.70020580292 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 9.41018007881e-06, mean update : 2.48728063473e-05
INFO:root:-- Processed 300000 lines : Elapsed 11.5550940037 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 7.24330155203e-06, mean update : 1.56429614568e-05
INFO:root:-- Processed 400000 lines : Elapsed 10.4632530212 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 5.29151886969e-06, mean update : 1.10523107583e-05
INFO:root:-- Processed 500000 lines : Elapsed 9.3608

In [14]:
# Both *_to_proba helpers are called for their side effects only (their
# return values are discarded); presumably they convert the accumulated
# structures in place -- TODO confirm against the helper module.
personal_recommendations_to_proba(
    personal_recommendations_validation,
    nb_months_validation)
common_recommendations_to_proba(common_recommendations_validation)

# Order the product stats by value, largest first.  This rebinds
# product_stats_validation from a mapping to a list of (key, value) pairs,
# so this cell cannot be re-run as-is (a list has no .items()).
product_stats_validation = sorted(
    product_stats_validation.items(),
    key=lambda pair: pair[1],
    reverse=True)

In [5]:
# Start from independent copies of the validation structures so that the
# *_validation objects are not modified by the read_data call below.
# NOTE(review): if the preceding cell has been run, these structures have
# already gone through the *_to_proba helpers and product_stats_validation
# is a sorted list of pairs rather than a mapping -- confirm that read_data
# is meant to continue from that state.
personal_recommendations = deepcopy(personal_recommendations_validation)
common_recommendations = deepcopy(common_recommendations_validation)
product_stats = deepcopy(product_stats_validation)

# Keep reading from the already-open file, extending the copied structures;
# return_raw_data=True adds a fourth return value, kept as validation_data.
(personal_recommendations,
 common_recommendations,
 product_stats,
 validation_data) = read_data(
    reader, 201605, 1, process_row, get_profiles,
    return_raw_data=True,
    personal_recommendations=personal_recommendations,
    common_recommendations=common_recommendations,
    product_stats=product_stats)

# Report the size of each structure (arguments are formatted by logging).
logging.debug("-- common_recommendations : %s ",
              len(common_recommendations))
logging.debug("-- personal_recommendations : %s ",
              len(personal_recommendations))
logging.debug("-- product_stats : %s ",
              len(product_stats))


INFO:root:--- Run solution ---
INFO:root:- READ DATA : months to read ['2016-03', '2016-04']
DEBUG:root:--- Time analysis : mean row processing : 2.53415942192e-05, mean update : 1.73092675209e-05
INFO:root:-- Processed 100000 lines : Elapsed 23.2783670425 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 1.24912408475e-05, mean update : 8.87111854593e-06
INFO:root:-- Processed 200000 lines : Elapsed 5.34472799301 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 8.27553859716e-06, mean update : 6.83052238478e-06
INFO:root:-- Processed 300000 lines : Elapsed 5.60310220718 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 6.27681536086e-06, mean update : 5.12922325688e-06
INFO:root:-- Processed 400000 lines : Elapsed 5.63736987114 s. Current month : 2016-03
DEBUG:root:--- Time analysis : mean row processing : 5.0020987162e-06, mean update : 4.08363700558e-06
INFO:root:-- Processed 500000 lines : El

In [None]:
# Same conversion as for the validation structures, with one more month
# passed to the personal helper (nb_months_validation + 1).
personal_recommendations_to_proba(
    personal_recommendations,
    nb_months_validation + 1)
common_recommendations_to_proba(common_recommendations)

# Order the product stats by value, largest first.  product_stats is
# rebound from a mapping to a list of (key, value) pairs, so this cell
# cannot be re-run as-is.
product_stats = sorted(
    product_stats.items(),
    key=lambda pair: pair[1],
    reverse=True)

In [9]:
# Release the training-file handle opened earlier with open(train_filename, "r").
reader.close()

### Compute the score 

In [72]:
def _get_profiles(row):
    """Build only the ZFTurbo profile tuples for one data row.

    Variant of ``get_profiles`` used for scoring experiments: the profiles
    with ids 0, 1, 2, 10, 11, 12 and 100 are left out, and only the ids
    1001-1009, 10010 and 10011 are produced.  The first 24 entries of
    ``row`` are unpacked positionally; any further entries are ignored.

    :param row: sliceable sequence holding at least 24 values.
    :return: list of 11 tuples, each starting with its numeric profile id.
    :raises ValueError: if ``row`` holds fewer than 24 values.
    """
    # Fields prefixed with an underscore are unpacked only to keep the
    # positional layout readable; no enabled profile uses them.
    (_fecha_dato, ncodpers, ind_empleado,  # 0
     pais_residencia, sexo, age,  # 3
     _fecha_alta, ind_nuevo, antiguedad,  # 6
     _indrel, _ult_fec_cli_1t, _indrel_1mes,  # 9
     _tiprel_1mes, indresi, _indext,  # 12
     _conyuemp, canal_entrada, _indfall,  # 15
     _tipodom, _cod_prov, nomprov,  # 18
     ind_actividad_cliente, renta, segmento) = row[:24]

    # Profiles 1002-1009 share this prefix and differ only in what follows.
    shared_prefix = (pais_residencia, sexo, age, segmento)
    suffix_by_id = (
        (1002, (nomprov,)),
        (1003, (ncodpers,)),
        (1004, (antiguedad,)),
        (1005, (ind_nuevo,)),
        (1006, (ind_actividad_cliente,)),
        (1007, (canal_entrada,)),
        (1008, (ind_nuevo, canal_entrada)),
        (1009, (ind_empleado,)),
    )

    selected = [
        (1001, pais_residencia, sexo, age, ind_nuevo, segmento,
         ind_empleado, ind_actividad_cliente, indresi),
    ]
    for profile_id, suffix in suffix_by_id:
        selected.append((profile_id,) + shared_prefix + suffix)
    selected.append((10010, pais_residencia, sexo, renta, age, segmento))
    selected.append((10011, sexo, age, segmento))

    return selected

In [81]:
#ZFTURBO_COMMON_WEIGHT = 0.5
#MINE_COMMON_WEIGHT = 1.0 - ZFTURBO_COMMON_WEIGHT

In [89]:
# Score elements 10000-19999 of validation_data (the fourth value returned
# by read_data when called with return_raw_data=True) against the
# *_validation recommendation structures, using the ZFTurbo-only profiles.
# The trailing 0.0 is reported by predict_score as
# personal_recommendations_weight in this cell's recorded log output.
map7 = predict_score(validation_data[10000:20000],
                     process_row,
                     _get_profiles,
                     personal_recommendations_validation,
                     common_recommendations_validation,
                     product_stats_validation,
                     0.0)
# Call form of print: identical output under Python 2 for a single value,
# and valid syntax under Python 3 (the bare `print map7` statement is not).
print(map7)

DEBUG:root:-- predict_score : personal_recommendations_weight=0.0
DEBUG:root:--- predict_score : map7=0.0163794047619


0.0163794047619


In [None]:
0.0148373611111

ZFTurbo : 0.0148373611111
Mine : 0.0144725793651
Both : 0.0149396230159



0 : 0.0140291468254
1 : 0.013452281746
2 : 0.0147912698413

10 : 0.0121430555556
11 : 0.0125974404762
12 : 0.0143200992063

100 : 0.0127614484127

1001 : 0.0152238293651
1002 : 0.0145385119048
1003 : 0.00232928571429
1004 : 0.0128844246032
1005 : 0.0143448015873
1006 : 0.0151566865079
1007 : 0.0143549801587
1008 : 0.0144190277778
1009 : 0.0145933730159
10010 : 0.0146814484127
10011 : 0.0147400396825




ZFTurbo profiles : 
DEBUG:root:-- predict_score : personal_recommendations_weight=0.0
DEBUG:root:--- predict_score : map7=0.0209900993714

My profiles :
DEBUG:root:-- predict_score : personal_recommendations_weight=0.0
DEBUG:root:--- predict_score : map7=0.0197835926963

Both : 
DEBUG:root:-- predict_score : personal_recommendations_weight=0.0
DEBUG:root:--- predict_score : map7=0.0255465612777

0.0250303816124