In [4]:
!pip install xgboost

import pandas as pd
import numpy as np
import xgboost as xgb

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/8f/15/606f81a2b8a8e82eaa10683cb3f3074905ec65d3bcef949e3f0909f165a5/xgboost-0.80-py2.py3-none-manylinux1_x86_64.whl (15.8MB)
[K    100% |████████████████████████████████| 15.8MB 3.2MB/s eta 0:00:01
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
Installing collected packages: xgboost
Successfully installed xgboost-0.80
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [5]:
%%time

np.random.seed(2018)

trn = pd.read_csv('train_ver2.csv', low_memory=False)
tst = pd.read_csv('test_ver2.csv', low_memory=False)

CPU times: user 1min 17s, sys: 11.3 s, total: 1min 29s
Wall time: 2min 38s


In [6]:
%%time

prods = trn.columns[24:].tolist()

trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]

for col in trn.columns[24:]:
    tst[col] = 0
    
df = pd.concat([trn, tst], axis=0)

features = []

categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext',
                   'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'nomprov', 'segmento']

for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols


CPU times: user 26.5 s, sys: 23 s, total: 49.5 s
Wall time: 49.5 s


In [7]:
%%time

df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)


CPU times: user 3.35 s, sys: 16 ms, total: 3.37 s
Wall time: 3.36 s


In [8]:
%%time
df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)


CPU times: user 4.04 s, sys: 532 ms, total: 4.58 s
Wall time: 4.57 s


In [9]:
%%time
df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)


CPU times: user 1.57 s, sys: 48 ms, total: 1.62 s
Wall time: 1.61 s


In [10]:
%%time

features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'indrel_1mes', 'ind_actividad_cliente']


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.44 µs


In [11]:
%%time
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']


CPU times: user 17.3 s, sys: 696 ms, total: 18 s
Wall time: 18 s


In [12]:
%%time
df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x:0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']


CPU times: user 6.02 s, sys: 492 ms, total: 6.51 s
Wall time: 6.51 s


In [13]:
%%time
df.fillna(-99, inplace=True)

def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")]
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date

df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)

df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers','int_date'] else col for col in df.columns ]
df_lag['int_date'] += 1

df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how='left')

del df, df_lag
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
    
df_trn.fillna(-99, inplace=True)

features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]



CPU times: user 1min, sys: 37.7 s, total: 1min 38s
Wall time: 1min 38s


In [14]:
%%time
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']

del df_trn

X=[]
Y=[]

for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)
    
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y


vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

## - TESTING


CPU times: user 3.85 s, sys: 2 s, total: 5.84 s
Wall time: 5.84 s


In [15]:
%%time
## XXGBoost model training


param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
}

X_trn = XY_trn.as_matrix(columns=features)
Y_trn = XY_trn.as_matrix(columns=['y'])

dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

CPU times: user 124 ms, sys: 144 ms, total: 268 ms
Wall time: 266 ms


In [16]:
X_vld = XY_vld.as_matrix(columns=features)
Y_vld = XY_vld.as_matrix(columns=['y'])

dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

In [17]:
%%time
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

import pickle


[0]	train-mlogloss:2.73433	eval-mlogloss:2.74233
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:2.48344	eval-mlogloss:2.49555
[2]	train-mlogloss:2.30469	eval-mlogloss:2.31939
[3]	train-mlogloss:2.15884	eval-mlogloss:2.17532
[4]	train-mlogloss:2.03811	eval-mlogloss:2.05536
[5]	train-mlogloss:1.9436	eval-mlogloss:1.96203
[6]	train-mlogloss:1.86333	eval-mlogloss:1.88254
[7]	train-mlogloss:1.7903	eval-mlogloss:1.81012
[8]	train-mlogloss:1.73062	eval-mlogloss:1.75126
[9]	train-mlogloss:1.67667	eval-mlogloss:1.69764
[10]	train-mlogloss:1.62686	eval-mlogloss:1.64816
[11]	train-mlogloss:1.58347	eval-mlogloss:1.60491
[12]	train-mlogloss:1.54526	eval-mlogloss:1.56676
[13]	train-mlogloss:1.5097	eval-mlogloss:1.53164
[14]	train-mlogloss:1.47852	eval-mlogloss:1.50091
[15]	train-mlogloss:1.4491	eval-mlogloss:1.47187
[16]	train-mlogloss:1.42437	eval-mlogloss:1.44772
[17]	train-ml

[161]	train-mlogloss:1.00085	eval-mlogloss:1.08825
[162]	train-mlogloss:1.00026	eval-mlogloss:1.0882
[163]	train-mlogloss:0.999756	eval-mlogloss:1.08825
[164]	train-mlogloss:0.999207	eval-mlogloss:1.0882
[165]	train-mlogloss:0.998629	eval-mlogloss:1.08815
[166]	train-mlogloss:0.998034	eval-mlogloss:1.08812
[167]	train-mlogloss:0.997484	eval-mlogloss:1.08807
[168]	train-mlogloss:0.996733	eval-mlogloss:1.08804
[169]	train-mlogloss:0.996099	eval-mlogloss:1.08798
[170]	train-mlogloss:0.995525	eval-mlogloss:1.08791
[171]	train-mlogloss:0.995027	eval-mlogloss:1.08789
[172]	train-mlogloss:0.994355	eval-mlogloss:1.08792
[173]	train-mlogloss:0.993913	eval-mlogloss:1.08795
[174]	train-mlogloss:0.993255	eval-mlogloss:1.0879
[175]	train-mlogloss:0.992711	eval-mlogloss:1.08793
[176]	train-mlogloss:0.992209	eval-mlogloss:1.08791
[177]	train-mlogloss:0.991645	eval-mlogloss:1.08794
[178]	train-mlogloss:0.991089	eval-mlogloss:1.08799
[179]	train-mlogloss:0.990435	eval-mlogloss:1.08796
[180]	train-mlogl

In [18]:
pickle.dump(model, open("xgb.baseline.pkl", "wb"))
best_ntree_limit = model.best_ntree_limit

In [21]:
def apk(actual, predicted, k=7, default=0.0):
    # MAP@7
    if len(predicted) > k:
        predicted = predicted[:k]
        
    score = 0.0
    num_hits = 0.0
    
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / ( i + 1.0)
            
    if not actual:
        return default
    
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    return np.mean([apk(a,p,k,default) for a,p in zip(actual, predicted)])

In [23]:
vld = trn[trn['fecha_dato'] == vld_date]
ncodpers_vld = vld.as_matrix(columns=['ncodpers'])

for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    
    vld[padd] = vld[prod] - vld[prev]

add_vld = vld.as_matrix(columns=[prod + '_add' for  prod in prods])
add_vld_list = [list() for i in range(len(ncodpers_vld))]

count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1
            
print(mapk(add_vld_list, add_vld_list, 7, 0.0))

X_vld = vld.as_matrix(columns = features)
Y_vld = vld.as_matrix(columns=['y'])

dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
preds_vld = model.predict(dvld, ntree_limit=best_ntree_limit)

preds_vld = preds_vld - vld.as_matrix(columns=[prod + '_prev' for prod in prods])

result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y,p,ip in y_prods])
    
print(mapk(add_vld_list, result_vld, 7,0.0))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.04266379915553903
0.036435336167892435


In [None]:
X_all = XY.as_matrix(columns=features)
Y_all = XY.as_matrix(columns=['y'])
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]

best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))

model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)

print("Feature importance:")
for kv in sorted([(k, v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)
    
X_tst = tst.as_matrix(columns=feature)
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)
ncodpers_tst = tst.as_matrix(columns=['ncodpers'])
preds_tst = preds_tst - tst.as_matrix(columns=[prod + '_prev' for prod in prods])


sumit_file = open('xgb.baseline.2015-06-28', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y,pmip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse = True)[:7]
    y_prods = [p for y,p, ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

[0]	train-mlogloss:2.70996
[1]	train-mlogloss:2.45557
[2]	train-mlogloss:2.28324
[3]	train-mlogloss:2.14185
[4]	train-mlogloss:2.02824
[5]	train-mlogloss:1.93225
[6]	train-mlogloss:1.85379
[7]	train-mlogloss:1.7844
[8]	train-mlogloss:1.72256
[9]	train-mlogloss:1.66696
[10]	train-mlogloss:1.61826
[11]	train-mlogloss:1.57553
[12]	train-mlogloss:1.5374
[13]	train-mlogloss:1.50176
[14]	train-mlogloss:1.47008
[15]	train-mlogloss:1.44189
[16]	train-mlogloss:1.41677
[17]	train-mlogloss:1.39205
