# Decision trees tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load training and validation data as 
    month : [ Features | Targets| Difference | Last Choice Targets  ]
    

In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline



In [2]:
import sys
sys.path.append("../common")

from dataset import load_trainval, LC_TARGET_LABELS, TARGET_LABELS_FRQ, TARGET_LABELS_DIFF
from utils import to_yearmonth, TARGET_LABELS, TARGET_LABELS2
from utils import target_str_to_labels, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices

In [None]:
# Client profile columns that get_XY copies into X and that the 'fm0' features mask selects.
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [None]:
# Year-months (YYYYMM) requested from load_trainval; the commented lines are alternative setups.
# train_yearmonths_list = [201504, 201505, 201604]
train_yearmonths_list = [201505, 201602, 201605]
# train_yearmonths_list = [201505]
#val_yearmonth = [201605]
# Number of clients passed to load_trainval as `train_nb_clients`.
train_nb_clients = 150000
# train_nb_clients = 1500
# NOTE: the variant that also returns `val_df` is commented out, so only `train_df` exists below.
#train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients, val_nb_clients=1500)
train_df = load_trainval(train_yearmonths_list, train_nb_clients=train_nb_clients)

INFO:root:- Load training data : 
INFO:root:- Load data : [201504, 201505, 201601, 201602, 201604, 201605]


Display loaded data

In [None]:
train_df[['fecha_dato', 'ncodpers'] + TARGET_LABELS_FRQ.tolist()].head(10)

Useful structures

In [None]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """
    Flag the rows whose client id ('ncodpers') is present in both selections.

    :df1: a pd.DataFrame with a 'ncodpers' column
    :mask1: a boolean mask applied to `df1` (first selection)
    :mask2: a boolean mask for the second selection; applied to `df2` when it is given, otherwise to `df1`
    :df2: (optional) a second pd.DataFrame with a 'ncodpers' column

    :return: a boolean pd.Series over all rows of `df1`; when `df2` is given,
        a pair of boolean pd.Series `(over df1, over df2)`
    """
    second_frame = df1 if df2 is None else df2
    first_ids = set(df1[mask1]['ncodpers'].unique())
    second_ids = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(first_ids & second_ids)

    in_first = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return in_first
    return in_first, df2['ncodpers'].isin(shared_ids)

In [None]:
# Distinct 'fecha_dato' values present in the training frame.
train_months = train_df['fecha_dato'].unique()
# val_months = val_df['fecha_dato'].unique()

# months = list(set(train_df['fecha_dato'].unique()) | set(val_df['fecha_dato'].unique()))
months = train_df['fecha_dato'].unique()
# Lookup table: to_yearmonth(value) -> original 'fecha_dato' value.
months_ym_map = dict((to_yearmonth(m), m) for m in months)
    

### Train a model

In [None]:
from utils import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

In [None]:
# Target-derived columns that get_XY copies from df1 into X next to `features`.
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']

In [None]:
def _previous_yearmonth(yearmonth):
    """Return the YYYYMM integer of the month preceding `yearmonth` (January rolls back to December)."""
    year, month = divmod(yearmonth, 100)
    if month == 1:
        return (year - 1) * 100 + 12
    return yearmonth - 1


def get_XY(current_month, df1, next_year_month, df2, months_ym_map):
    """
    Build aligned features / targets / last-choice frames for the clients present in
    `df1` at `current_month` and in `df2` at both `next_year_month` and the month before it.

    :current_month: year-month key (YYYYMM integer, as used in `months_ym_map`) of the feature rows
    :df1: a pd.DataFrame holding the rows of `current_month`
    :next_year_month: year-month key (YYYYMM integer) of the target rows
    :df2: a pd.DataFrame holding the rows of `next_year_month` and of its previous month
    :months_ym_map: a dictionary `{yearmonth: 'fecha_dato' value}`

    :return: a tuple `(X, Y, clients_last_choice)`
        - `X` : 'ncodpers', 'fecha_dato', `target_features`, `features` and TARGET_LABELS_FRQ columns of `df1`;
          when `df2` has target columns, the previous month TARGET_LABELS_FRQ of `df2` are appended with a '_prev' suffix
        - `Y` : 'ncodpers', 'fecha_dato', 'targets_str' and TARGET_LABELS columns of `df2` re-indexed like `X`,
          or None when `df2` has no target columns
        - `clients_last_choice` : 'ncodpers', 'fecha_dato' and LC_TARGET_LABELS columns of `df2`,
          or None when `df2` has no last-choice columns

    A KeyError is raised when one of the three months is missing from `months_ym_map`.
    Rows of the three selections are assumed to come in the same client order; this is checked with asserts.
    """
    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    # `next_year_month - 1` is not a valid year-month for January (201601 -> 201600),
    # so the previous month is computed with a year rollover.
    previous_month = _previous_yearmonth(next_year_month)
    next_year_prev_month_mask = df2['fecha_dato'] == months_ym_map[previous_month]

    # get common clients from df1 at this month and df2 at next year month
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    # ... and restrict them further to the clients also present the month before
    common_clients_mask2, common_clients_mask3 = get_common_clients(df2, common_clients_mask2 & next_year_month_mask, next_year_prev_month_mask, df2)

    rows1 = common_clients_mask1 & month_mask
    rows2 = common_clients_mask2 & next_year_month_mask
    rows3 = common_clients_mask3 & next_year_prev_month_mask

    c1 = df1[rows1]['ncodpers'].values
    c2 = df2[rows2]['ncodpers'].values
    c3 = df2[rows3]['ncodpers'].values
    assert (c1 == c2).all() and (c2 == c3).all(), "Problem with common clients"

    X = df1[rows1][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]

    if TARGET_LABELS[0] in df2.columns and TARGET_LABELS_FRQ[0] in df2.columns:
        Y = df2[rows2][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS]
        assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
        Y.index = X.index

        # Add TARGET_LABELS_FRQ to X:
        target_labels_frq = df2[rows3][['ncodpers'] + TARGET_LABELS_FRQ.tolist()]
        assert (X['ncodpers'].values == target_labels_frq['ncodpers'].values).all(), "There is a problem in alignment"
        target_labels_frq = target_labels_frq[TARGET_LABELS_FRQ]
        target_labels_frq.columns = [c + '_prev' for c in TARGET_LABELS_FRQ]
        target_labels_frq.index = X.index
        X = pd.concat([X, target_labels_frq], axis=1)
    else:
        Y = None

    if LC_TARGET_LABELS[0] in df2.columns:
        clients_last_choice = df2[rows2][['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS.tolist()]
    else:
        clients_last_choice = None

    return X, Y, clients_last_choice


In [None]:
# Feature month (YYYYMM) and the same month one year later, which provides the targets.
current_month = 201505
next_year_month = current_month + 100

# Pick the frame holding each month.
# NOTE(review): `val_df` is only produced by the commented-out load_trainval call, so the
# fallback raises NameError when a month is missing from `train_months` - confirm before relying on it.
df1 = train_df if months_ym_map[current_month] in train_months else val_df
#df1 = train_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
#df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
print X.shape
X.head(10)

In [None]:
print Y.shape
Y.head(10)

In [None]:
print clients_last_choice.shape
clients_last_choice.head(10)

## Another train/predict + CV implementation

### Input

- `X` : `[nb_samples, nb_features]` shaped pd.DataFrame
    - `features_masks_dict` : `{fm1_name: features_mask_1, fm2_name: features_mask_2, ...}` where each `features_mask_i` is a list of feature column names. Masks may overlap.
    
- `Y` : `[nb_samples, nb_labels]` shaped pd.DataFrame
    - `labels_masks_dict` : `{lm1_name: labels_mask_1, lm2_name: labels_mask_2, ...}` where each `labels_mask_i` is a list of label column names. Masks may overlap.

- `samples_masks_list` : `[samples_mask_1, samples_mask_2, ...]` where each `samples_mask_i` is a function that takes `X` and returns a boolean row mask. Used only for training.


- Set of models `models` : list of functions to create a model, e.g. `[create_RF, create_NN, create_GBT]`


### Training phase




In [None]:
# Row filters applied to the training frame: keep only rows whose 'targets_diff' is not 0.
samples_masks_list = [
   lambda x:  ~x['targets_diff'].isin([0]), 
]

# Names given by get_XY to the previous-month frequency columns appended to X.
TARGET_LABELS_FRQ_PREV = [c + '_prev' for c in TARGET_LABELS_FRQ]

# Named column subsets of X to train on; only 'fm0' is enabled, the rest are kept as alternatives.
features_masks_dict = {
#     'fm_all': None,
    'fm0': features + target_features + TARGET_LABELS_FRQ.tolist(),# + TARGET_LABELS_FRQ_PREV,
#     'fm1': ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
#     'fm2': target_features,
#     'fm3': ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
#     'fm4': ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
#     'fm5': ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],
#     'fm6': TARGET_LABELS_FRQ,
}

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

def create_RF(input_shape, output_shape):
    """Build an unfitted random forest (25 trees, depth limited to 5); both shape arguments are ignored."""
    forest_params = dict(n_estimators=25, max_depth=5)
    return RandomForestClassifier(**forest_params)

def create_ET(input_shape, output_shape):
    """Build an unfitted extra-trees ensemble (25 trees, depth limited to 5); both shape arguments are ignored."""
    trees_params = dict(n_estimators=25, max_depth=5)
    return ExtraTreesClassifier(**trees_params)

def create_GB(input_shape, output_shape):
    """Build an unfitted gradient boosting classifier with default settings; both shape arguments are ignored."""
    booster = GradientBoostingClassifier()
    return booster

models_dict = {
    'rf': create_RF,
    'et': create_ET,
#     'gb': create_GB,
}

In [None]:
# Groups of target positions (indices into TARGET_LABELS) that are trained together as one labels mask.
# Note that index 18 belongs to two of the active groups.
common_groups = [
#     [2, ],
   [2, 18],
   [18, 23],
#     [23, ],
#     [18, ],
#     [12, ],
#     [21, ],
#     [22, ],
    [3, 4, 7, 8],
#     [17, ],
]

def flatten(array):
    """Concatenate the items of every element of `array` into one new flat list (one level deep)."""
    return [value for chunk in array for value in chunk]

others = list(set(range(24)) - set(flatten(common_groups)))
NP_TARGET_LABELS = np.array(TARGET_LABELS)

for i, a in enumerate(TARGET_LABELS2):
    print i, a
    
s = set({})
labels_masks_dict = {}
for i, g in enumerate(common_groups):
    labels_masks_dict['lm_%i' % i] = NP_TARGET_LABELS[g]
    s |= set(g)
labels_masks_dict['lm_others'] = NP_TARGET_LABELS[others]
s |= set(others)

assert len(s) == len(TARGET_LABELS), "Sum is not equal 24, s=%i" % s
print labels_masks_dict

In [None]:
# Restrict the 'gb' model to labels masks that contain exactly one label, with any features mask
# (None in the first position of a pair); models absent from this dictionary are not restricted.
_single_label_masks = [key for key in labels_masks_dict if len(labels_masks_dict[key]) == 1]
models_pipelines = {'gb': [(None, key) for key in _single_label_masks]}
models_pipelines

In [None]:
from sklearn.preprocessing import StandardScaler

def prepare_to_fit(X_train, Y_train):
    """
    Convert training frames to arrays for an estimator.

    :X_train: a pd.DataFrame of features; its values are standardized with a newly fitted StandardScaler
    :Y_train: a pd.DataFrame of labels; its values are returned unchanged
    :return: a tuple `(x_train, y_train)` of ndarrays
    """
    scaler = StandardScaler()
    return scaler.fit_transform(X_train.values), Y_train.values

def prepare_to_test(X_val, Y_val=None):
    """
    Convert validation/test frames to arrays for an estimator.

    NOTE(review): a new StandardScaler is fitted on `X_val` itself, so the statistics of the
    training data are not reused here.

    :X_val: a pd.DataFrame of features
    :Y_val: (optional) a pd.DataFrame of labels
    :return: a tuple `(x_val, y_val)` where `y_val` is None when `Y_val` is not given
    """
    x_val = StandardScaler().fit_transform(X_val.values)
    if Y_val is None:
        return x_val, None
    return x_val, Y_val.values

In [None]:
def _pipeline_allows(models_pipelines, model_name, features_mask_name, labels_mask_name):
    """
    Tell whether `model_name` may be trained on the given features mask / labels mask pair.

    A model that is not listed in `models_pipelines` (or when `models_pipelines` is None) is allowed
    everywhere. Otherwise at least one of its `(feature_mask_name, label_mask_name)` entries has to
    match, None acting as a wildcard for its position.
    """
    if models_pipelines is None or model_name not in models_pipelines:
        return True
    # pipelines = [(feature_mask_name, label_mask_name), ...]
    for _features_mask_name, _labels_mask_name in models_pipelines[model_name]:
        assert not (_features_mask_name is None and _labels_mask_name is None), \
            "Feature_mask_name and label_mask_name can not be both None"
        features_ok = _features_mask_name is None or _features_mask_name == features_mask_name
        labels_ok = _labels_mask_name is None or _labels_mask_name == labels_mask_name
        if features_ok and labels_ok:
            return True
    return False


def train_all(X_train, Y_train,
              samples_masks_list,
              features_masks_dict,
              labels_masks_dict,
              models_dict,
              **kwargs):
    """
    Method to train a set of estimators from `models_dict`
    on the data obtained after applying all combinations of
    - samples mask from `samples_masks_list`,
    - features mask from `features_masks_dict` and
    - labels mask from `labels_masks_dict`

    :X_train: a pd.DataFrame of training dataset containing features, `(nb_samples, nb_features)`
    :Y_train: a pd.DataFrame of training dataset containing labels, `(nb_samples, nb_labels)`
    :samples_masks_list: a list, e.g. `[samples_mask_1, samples_mask_2, ...]` where each samples_mask_i is a function
        taking `X_train` and returning a boolean row mask. Used only for training.
        If an empty list is provided, all samples are used for training. The list is never modified.
    :features_masks_dict: a dictionary, e.g. `{fm1_name: features_mask_1, fm2_name: features_mask_2, ...}` where each
        `features_mask_i` is a list of feature column names. Masks may overlap.
        Feature mask can be None to indicate all features.
    :labels_masks_dict: a dictionary, e.g. `{lm1_name: labels_mask_1, lm2_name: labels_mask_2, ...}` where each
        `labels_mask_i` is a list of labels column names. Masks may overlap.
        Label mask can be None to indicate all labels.
    :models_dict: a dictionary of functions to create a model, e.g. `{'rf': create_RF, 'nn': create_NN, 'gbt': create_GBT}`.
        Each function is called as `create(input_shape=..., output_shape=...)`.

    In `kwargs` it is possible to define :
        :verbose: True/False, log the feature ranking of every fitted estimator exposing `feature_importances_`
        :models_pipelines: (optional) a dictionary, e.g. `{model_name: [(feature_mask_name, label_mask_name), ...]}`.
        It defines specific connection between a model and features/labels to train on. Useful, when a model can not train on
        all types of labels. It is possible to specify only one mask name `feature_mask_name` or `label_mask_name` with None, e.g. (None, label_mask_name).
        If models_pipelines is defined and a model is not added into models_pipelines, it will be used on all combinations of feature mask/label mask.
        Any other keyword argument is ignored.

    :return: a list of trained estimators, e.g. `[([features_mask_name, labels_mask_name, model_name], estimator_object, fit_accuracy), ...]`
        where `fit_accuracy` is `estimator.score` computed on the training data itself
    """
    logging.info("---------------")
    logging.info("-- Train all --")
    verbose = kwargs.get('verbose', False)
    models_pipelines = kwargs.get('models_pipelines')

    # Work on a private copy: appending the default mask to the argument would leak into the caller's list.
    samples_masks = list(samples_masks_list)
    if not samples_masks:
        samples_masks.append(lambda df: df.index.isin(df.index[:]))

    estimators = []

    for samples_mask in samples_masks:
        mask = samples_mask(X_train)
        X_train_ = X_train[mask]
        Y_train_ = Y_train[mask]

        for features_mask_name in features_masks_dict:
            features_mask = features_masks_dict[features_mask_name]
            X_train__ = X_train_[features_mask] if features_mask is not None else X_train_
            for labels_mask_name in labels_masks_dict:
                labels_mask = labels_masks_dict[labels_mask_name]
                Y_train__ = Y_train_[labels_mask] if labels_mask is not None else Y_train_
                logging.info("-- Process : sample_mask={}/{}, features_mask={}, labels_mask={}"
                             .format(len(X_train_), len(X_train), features_mask_name, labels_mask_name))
                x_train, y_train = prepare_to_fit(X_train__, Y_train__)
                logging.info("--- Train data shapes : {}, {}".format(x_train.shape, y_train.shape))

                if y_train.shape[1] == 1:
                    # avoid DataConversionWarning
                    y_train = y_train.ravel()

                for model_name in models_dict:
                    logging.info("-- Create the model : %s" % model_name)

                    if not _pipeline_allows(models_pipelines, model_name, features_mask_name, labels_mask_name):
                        continue

                    estimator = models_dict[model_name](input_shape=x_train.shape, output_shape=y_train.shape)
                    logging.info("--- Fit the model")
                    estimator.fit(x_train, y_train)
                    acc = estimator.score(x_train, y_train)
                    logging.info("--- Score : fit accuracy : %f" % acc)
                    estimators.append(([features_mask_name, labels_mask_name, model_name], estimator, acc))

                    if verbose:
                        logging.info("\n\n\t -- Feature ranking : -- \n\n")
                        logging.info("--- Estimator : {}, {}, {}".format(features_mask_name, labels_mask_name, model_name))
                        importances = getattr(estimator, 'feature_importances_', None)
                        if importances is None:
                            # Not every model type exposes importances; nothing to rank then.
                            continue
                        # Names come from the frame actually fitted, so a None features mask is handled too.
                        feature_names = list(X_train__.columns)
                        indices = np.argsort(importances)[::-1]
                        for rank, index in enumerate(indices):
                            logging.info("%d. feature %d '%s' (%f)" % (rank + 1, index, feature_names[index], importances[index]))
    return estimators

In [None]:
def probas_to_indices(Y_probas, **kwargs):
    """
    Convert a frame of label scores into per-sample lists of label positions, best score first.

    Columns of `Y_probas` that contain only nulls are ignored. Scores below the threshold are
    treated as 0.0 and only strictly positive scores are kept. `Y_probas` itself is not modified.

    :Y_probas: a pd.DataFrame of shape `(nb_samples, nb_labels)` with numeric scores

    In `kwargs` it is possible to define :
        :threshold: minimal score to keep a label, 0.5 by default
        :n_highest: maximal number of labels kept per sample, 7 by default
        Any other keyword argument is ignored.

    :return: an ndarray with one entry per sample holding the column positions (in `Y_probas.columns`)
        of the kept labels. It is a 2D array when all samples keep the same number of labels,
        otherwise a 1D object array of lists.
    """
    threshold = kwargs.get('threshold', 0.5)
    n_highest = kwargs.get('n_highest', 7)

    all_columns = Y_probas.columns
    has_values = Y_probas.notnull().any()
    kept_columns = has_values[has_values].index
    # `.values` replaces the removed `DataFrame.as_matrix()`; astype(float) also copies the data,
    # so the thresholding below never touches the caller's frame.
    y_probas = Y_probas[kept_columns].values.astype(float)

    y_probas[y_probas < threshold] = 0.0
    ranked = np.argsort(y_probas, axis=1)[:, ::-1][:, :n_highest]

    # Positions of the kept columns among all columns of the input frame
    index_map = np.where(all_columns.isin(kept_columns))[0]
    out = [[index_map[j] for j in row if y_probas[i, j] > 0.0] for i, row in enumerate(ranked)]

    if len(set(len(row) for row in out)) <= 1:
        return np.array(out)
    # Rows of different lengths can not form a regular array: store them in an object array explicitly.
    ragged = np.empty(len(out), dtype=object)
    for i, row in enumerate(out):
        ragged[i] = row
    return ragged

In [None]:
def merge_predictions(Y_probas, y_probas, labels_mask, mode='sum', **kwargs):
    """
    Merge new predictions into the columns `labels_mask` of `Y_probas`, in place.

    :Y_probas: a pd.DataFrame accumulating predictions, `(nb_samples, nb_labels)`
    :y_probas: new predictions for the columns `labels_mask`
    :labels_mask: column names of `Y_probas` to update
    :mode: 'sum' adds the new values to the stored ones, 'max' keeps the element-wise maximum.
        When the target columns contain only nulls, the new values are stored as they are.
        Any other mode raises an Exception.

    Extra keyword arguments are accepted and ignored.

    :return: the updated `Y_probas` (same object)
    """
    untouched = Y_probas[labels_mask].isnull().all().all()
    if mode not in ('max', 'sum'):
        raise Exception("Existing data merge is not yet implemented")

    if untouched:
        merged = y_probas
    elif mode == 'max':
        merged = np.maximum(Y_probas.loc[:, labels_mask], y_probas)
    else:
        merged = Y_probas.loc[:, labels_mask] + y_probas

    Y_probas.loc[:, labels_mask] = merged
    return Y_probas


In [None]:
def predict_all(estimators, X_val, features_masks_dict, labels_masks_dict, labels, **kwargs):
    """
    Method to compute predictions using `estimators` from a test dataset `X_val`

    Every estimator predicts (with `estimator.predict`) the labels of its labels mask from the columns of
    its features mask. Predictions are multiplied by the fit accuracy of the estimator and merged
    into one frame with `merge_predictions`.

    :estimators: a list of object of type ([features_mask_name, labels_mask_name, model_name], estimator_object, fit_accuracy)
    :X_val: a pd.DataFrame of shape `(nb_samples, nb_features)`
    :features_masks_dict: a dictionary of features masks (see train_all method). A None mask means all columns of `X_val`.
    :labels_masks_dict: a dictionary of labels masks (see train_all method). A None mask means all `labels`.
    :labels: a list of all available labels for the output

    In `kwargs` it is possible to define :
        :transform_proba_func: a function to transform computed probabilities into a custom form.
        Function signature should be `foo(Y_probas, **kwargs)`
        :return_probas: True/False, used only together with `transform_proba_func`
        :mode: merge mode forwarded to `merge_predictions`
        All keyword arguments are forwarded to `merge_predictions` and `transform_proba_func`.

    :return:
        if `transform_proba_func` is not defined, merged predictions `Y_probas` are returned as
        a pd.DataFrame of shape (nb_samples, len(labels)).

        if `transform_proba_func` is defined, then the output of `transform_proba_func` is returned,
        or the tuple `(output of transform_proba_func, Y_probas)` when `return_probas` is True.

    """
    logging.info("-----------------")
    logging.info("-- Predict all --")
    return_probas = kwargs.get('return_probas', False)
    transform_proba_func = kwargs.get('transform_proba_func')

    Y_probas = pd.DataFrame(index=X_val.index, columns=labels)
    for estimator in estimators:
        # estimator is ([features_mask_name, labels_mask_name, model_name], estimator_object, fit_accuracy)
        features_mask_name, labels_mask_name, model_name = estimator[0]
        model, fit_accuracy = estimator[1], estimator[2]
        features_mask = features_masks_dict[features_mask_name]
        labels_mask = labels_masks_dict[labels_mask_name]
        logging.info("-- Process : model={}, features_mask={}, labels_mask={}".format(model_name, features_mask_name, labels_mask_name))

        # train_all accepts None masks (all features / all labels), so they are accepted here as well
        X_val_ = X_val[features_mask] if features_mask is not None else X_val
        if labels_mask is None:
            labels_mask = list(labels)

        x_val, _ = prepare_to_test(X_val_)
        logging.debug("--- Test data shapes : {}".format(x_val.shape))

        y_probas = np.asarray(model.predict(x_val))
        logging.debug("--- Predicted data shape : {}".format(y_probas.shape))
        # Any boolean/integer output is converted to float before weighting. The removed
        # `np.int` / `np.float` aliases are not used and every integer width is covered.
        if y_probas.dtype.kind in 'biu':
            y_probas = y_probas.astype(float)
        if len(y_probas.shape) == 1:
            y_probas = y_probas.reshape((y_probas.shape[0], 1))
        # multiply by accuracy :
        y_probas = y_probas * fit_accuracy
        Y_probas = merge_predictions(Y_probas, y_probas, labels_mask, **kwargs)

    if transform_proba_func is not None:
        if return_probas:
            return transform_proba_func(Y_probas, **kwargs), Y_probas
        else:
            return transform_proba_func(Y_probas, **kwargs)
    return Y_probas


In [None]:
# Size of the fitting subset; the hold-out subset takes the next ll//2 index labels.
ll = 50000
# Rows whose index label is among the first `ll` labels of X.
mask = X.index.isin(X.index[:ll])

X1 = X[mask]
Y1 = Y[mask]
clc = clients_last_choice[mask]
print X1.shape, Y1.shape, clc.shape

# Rows whose index label is among the following ll//2 labels of X.
mask = X.index.isin(X.index[ll:ll+ll//2])
X2 = X[mask]
Y2 = Y[mask]
clc2 = clients_last_choice[mask]
print X2.shape, Y2.shape, clc2.shape

In [None]:
# One shared set of keyword arguments passed to train_all, predict_all and cross_val_score2.
# The first four entries are the mask/model arguments of train_all; 'labels' is the label list of
# predict_all; 'threshold' and 'n_highest' are read by probas_to_indices; 'mode' by merge_predictions.
_kwargs = {'samples_masks_list': samples_masks_list, 
            'features_masks_dict': features_masks_dict, 
            'labels_masks_dict': labels_masks_dict, 
            'models_dict': models_dict,
            'labels': TARGET_LABELS,
            'transform_proba_func': probas_to_indices,
            'threshold': 0.0,
            'n_highest': 7,
            'mode': 'sum',
            'verbose': False,
            'models_pipelines': models_pipelines,
            # with a transform function set, predict_all then returns (predictions, Y_probas)
            'return_probas': True
          }

In [None]:
estimators = train_all(X1, Y1, **_kwargs)

#print estimators

In [None]:
y_preds, Y_probas = predict_all(estimators, X2, **_kwargs)
#print y_preds[:5]

In [None]:
Y_probas.head()

In [None]:
y_val = targets_str_to_indices(Y2[TARGET_LABELS].values)

logging.info("- Compute max map7 score")
map7_score(y_val, y_val, clc2[LC_TARGET_LABELS].values)
logging.info("- Compute map7 score")
map7_score(y_val, y_preds, clc2[LC_TARGET_LABELS].values)


In [None]:
from utils import targets_to_labels, targets_indices_to_labels, remove_last_choice
from collections import defaultdict

In [None]:
limit = 100
count = 0

not_predicted_predicted = defaultdict(int)
for last_choice, targets, products, proba in zip(clc2[LC_TARGET_LABELS].values, y_val, y_preds, Y_probas.values):
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
    
    if len(added_products) == 0:
        continue
        
    if len(set(added_products) & set(predictions)) > 0:
#         print "Predicted : ", added_products, predictions
#         print set(added_products) & set(predictions)
        continue

    count += 1
    if count < limit:
        print "--- Count = ", count
        print targets_indices_to_labels(added_products, TARGET_LABELS2)#, targets_indices_to_labels(targets, TARGET_LABELS2)
        print targets_indices_to_labels(predictions, TARGET_LABELS2), targets_indices_to_labels(products, TARGET_LABELS2)#, proba
    
    for p in added_products:
        not_predicted_predicted[TARGET_LABELS2[p]] += 1
    

In [None]:
print not_predicted_predicted

In [None]:
print not_predicted_predicted

In [None]:
# NOTE(review): `y_probas` and `target_groups` are not defined anywhere in the visible notebook,
# so this cell raises NameError as written - presumably left over from an earlier version.
print y_probas[:10, target_groups[0]]
print Y[np.array(TARGET_LABELS)[target_groups[0]]].head(10)

### Run KFold Cross-validation 

In [None]:
# CROSS VALIDATION
from sklearn.model_selection import KFold
def cross_val_score2(data,
                     nb_folds=5,
                     **kwargs):
    """
    Run a K-fold cross-validation of `train_all` / `predict_all` and score every fold with `map7_score`.

    :data: a tuple `(x_df, y_df, clients_last_choice)` where `x_df` and `y_df` are pd.DataFrame with
        the same number of rows and `clients_last_choice` is a 2D ndarray with one row per sample
    :nb_folds: number of folds of the (unshuffled) KFold split
    :kwargs: keyword arguments forwarded to both `train_all` and `predict_all`

    :return: an ndarray with the score of every fold
    """
    logging.info("- Cross validation : ")
    x_df, y_df, clients_last_choice = data
    kf = KFold(n_splits=nb_folds)
    scores = []

    # predict_all returns a pair only when probabilities are requested AND a transform function is set;
    # testing the mere presence of 'return_probas' broke the unpacking for `return_probas=False`.
    returns_pair = bool(kwargs.get('return_probas', False)) and kwargs.get('transform_proba_func') is not None

    for count, (train_index, test_index) in enumerate(kf.split(range(x_df.shape[0])), 1):
        logging.info("\n\n\t\t-- Fold : %i / %i\n" % (count, nb_folds))

        # KFold yields row positions, hence positional indexing
        X_train, X_val = x_df.iloc[train_index], x_df.iloc[test_index]
        Y_train, Y_val = y_df.iloc[train_index], y_df.iloc[test_index]
        clc_val = clients_last_choice[test_index, :]

        estimators = train_all(X_train, Y_train, **kwargs)
        if returns_pair:
            y_preds, _ = predict_all(estimators, X_val, **kwargs)
        else:
            y_preds = predict_all(estimators, X_val, **kwargs)

        y_val = targets_str_to_indices(Y_val[TARGET_LABELS].values)
        logging.info("- Compute map7 score")
        scores.append(map7_score(y_val, y_preds, clc_val))

    return np.array(scores)

In [None]:
# Cross-validate on the full X / Y pair with the shared `_kwargs` configuration.
nb_folds = 5
results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                            nb_folds=nb_folds, **_kwargs)

# Printed columns: number of folds | min | mean | median | max | standard deviation of the fold scores
print "Cross-Validation \n %i | %f | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), np.median(results), results.max(), results.std())



### 201505 -> 201605 

#### Single profiles:

Profiles   [0, 100] 
Cross-Validation (normalized)
 3 | 0.011683 | 0.012817 | 0.014958 | 0.00152 

Cross-Validation (not normalized)
 3 | 0.009244 | 0.010407 | 0.011922 | 0.00112 
 
 
Profiles :  [1, 101]
Cross-Validation (normalized)
 3 | 0.006793 | 0.009161 | 0.012219 | 0.00227 

Cross-Validation (not normalized)
 3 | 0.004787 | 0.009852 | 0.014922 | 0.00414


Profiles :  [112, 12]
Cross-Validation (normalized)
 3 | 0.008856 | 0.012124 | 0.016443 | 0.00318 

Cross-Validation (not normalized)
 3 | 0.007298 | 0.010140 | 0.014101 | 0.00289 

Compute cross-validation across several months

In [None]:
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")
    
    ym1 = ym + 100    
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2) 
    results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

## Train model for predictions

In [None]:
current_month = 201505
next_year_month = current_month + 100

df1 = train_df
#df1 = val_df
df2 = train_df #if months_ym_map[next_year_month] in train_months else val_df
#df2 = val_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
estimators = train_all(X, Y, **_kwargs)

In [None]:
# `_kwargs` sets both 'transform_proba_func' and 'return_probas', so predict_all returns a pair here
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

Check score on the data 2016/05

In [None]:
logging.info("- Compute map7 score")
print map7_score(y_val, y_preds, clients_last_choice[LC_TARGET_LABELS].values)
logging.info("- Compute max map7 score")
print map7_score(y_val, y_val, clients_last_choice[LC_TARGET_LABELS].values)

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
full_train_df, test_df = load_train_test([201506])

In [None]:
full_train_df.head()

In [None]:
test_df.head()

In [None]:
# Distinct 'fecha_dato' values of the full training frame and of the test frame.
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

# Lookup table over the months of both frames: to_yearmonth(value) -> original 'fecha_dato' value.
months = list(set(full_train_months) | set(test_months))
months_ym_map = dict((to_yearmonth(m), m) for m in months)

In [None]:
current_month = 201506
next_year_month = current_month + 100

df1 = full_train_df
df2 = test_df
X, _, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
print X.shape, test_df.shape

In [None]:
X.head(10)

In [None]:
clients_last_choice.head(10)

In [None]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """
    Build the submission frame from predicted product indices.

    :predicted_added_products: per-client sequences of predicted product indices
    :clients: client ids, one per prediction, stored in the 'ncodpers' column
    :clc: per-client last choice data passed to `remove_last_choice` together with the predictions
    :target_labels: a list of product names indexed by product index

    :return: a pd.DataFrame with the columns 'ncodpers' and 'added_products', the latter holding
        the space-separated names of the products left by `remove_last_choice`
    """
    joined_names = []
    for done, (products, last_choice) in enumerate(zip(predicted_added_products, clc), 1):
        kept = remove_last_choice(products, last_choice)
        joined_names.append(' '.join([target_labels[i] for i in kept]))
        # progress message every 100000 clients
        if done % 100000 == 0:
            logging.info("Elapsed : %i", done)

    return pd.DataFrame(data={'ncodpers': clients, 'added_products': joined_names},
                        columns=['ncodpers', 'added_products'])

In [None]:
# `predict_with_model` and `profiles` are not defined in this notebook: predict with predict_all and
# the shared configuration instead, asking for the indices only and keeping the 0.5 threshold.
y_pred = predict_all(estimators, X, **dict(_kwargs, return_probas=False, threshold=0.5))

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
# get_XY fills `clients_last_choice` with the LC_TARGET_LABELS columns
submission = get_submission(y_pred, clients, clients_last_choice[LC_TARGET_LABELS].values, TARGET_LABELS)

In [None]:
# Make sure every test client has a row in the submission.
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
if submission_clients != test_clients:
    missing_clients = list(test_clients - submission_clients)
    # No prediction exists for these clients: use an empty product string, like the space-joined
    # strings built by get_submission (np.zeros put the float 0.0 into this text column).
    missing_added_products = [''] * len(missing_clients)
    submission = pd.concat([submission,
                            pd.DataFrame(data={
                                'ncodpers': missing_clients,
                                'added_products': missing_added_products
                            }, columns=['ncodpers', 'added_products'])])

Get submission DataFrame and write csv file

In [None]:
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')
# The file name carries the current date and time, down to the minute.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
submission_file = ''.join(['../results/submission_', str(timestamp), '.csv'])

submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
with open('../results/submission_2016-11-17-16-37.csv', 'r') as r:
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    