In [38]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [39]:
np.random.seed(2019)

In [40]:
trn = pd.read_csv('../Dataset/train_ver2.csv/train_ver2.csv')
tst = pd.read_csv('../Dataset/test_ver2.csv/test_ver2.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


## 1. 데이터전처리

In [41]:
prods = trn.columns[24:].tolist()

trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

### 상품을 하나도 구매하지 않은 고객 데이터 제거

In [42]:
no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]

In [43]:
no_product.head()

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [44]:
trn.head()

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,2015-01-28,1375586,N,ES,H,35,2015-01-12,0.0,6,1.0,...,0,0,0,0,0,0,0,0,0,0
1,2015-01-28,1050611,N,ES,V,23,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0,0,0
2,2015-01-28,1050612,N,ES,V,23,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0,0,0
3,2015-01-28,1050613,N,ES,H,22,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0,0,0
4,2015-01-28,1050614,N,ES,V,23,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
for col in trn.columns[24:]:
    tst[col] = 0
df = pd.concat([trn, tst], axis=0)

features = []

### 범주형 범수 label encoding

In [46]:
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols

In [47]:
[val for val in df['age'].unique().astype('str').tolist() if val.endswith('NA')]

[' NA']

In [48]:
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)

df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)

features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'indrel_1mes', 'ind_actividad_cliente']

## 2. 피처 엔지니어링

In [49]:
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']

df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']

df.fillna(-99, inplace=True)

def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")]
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date

df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)

df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers', 'int_date'] else col for col in df.columns]
df_lag['int_date'] += 1

df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how="left")

del df, df_lag

for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
df_trn.fillna(-99, inplace=True)

features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]

## 3. 교차검증

### - 학습 : 2016-01-28 ~ 2016.02.28 / 검증 : 2016-05-28

In [50]:
use_dates = ['2016-01-28', '2016-02-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']
del df_trn

X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y

vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

In [51]:
XY.head()

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_valo_fin_ult1_prev,ind_viv_fin_ult1_prev,ind_nomina_ult1_prev,ind_nom_pens_ult1_prev,ind_recibo_ult1_prev,fecha_alta_month_prev,fecha_alta_year_prev,ult_fec_cli_1t_month_prev,ult_fec_cli_1t_year_prev,y
10597872,2016-05-28,194160,0,0,0,42,2000-09-25,0.0,-68,1.0,...,0.0,0.0,0.0,0.0,0.0,9.0,2000.0,0.0,0.0,0
7658069,2016-01-28,1474324,0,0,1,43,2015-10-09,1.0,3,1.0,...,0.0,0.0,0.0,0.0,1.0,10.0,2015.0,0.0,0.0,1
7628180,2016-01-28,1432311,0,0,1,26,2015-08-07,1.0,5,1.0,...,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0,2
7628198,2016-01-28,1432232,0,0,1,33,2015-08-07,0.0,19,1.0,...,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0,2
7628482,2016-01-28,1432080,0,0,0,23,2015-08-07,1.0,5,1.0,...,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0,2


## 3. XGBoost  모델 훈련

In [52]:
param = {
    'booster' : 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'min_child_weight': 10,
    'colsmaple_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2019
}

In [53]:
len(features)

70

In [54]:
len(np.unique(features))

70

In [58]:
XY_trn.shape

(122825, 105)

In [59]:
XY_vld.shape

(37897, 105)

In [62]:
X_trn = XY_trn.as_matrix(columns=features)
Y_trn = XY_trn.as_matrix(columns=['y'])
dtrn= xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

X_vld = XY_vld.as_matrix(columns=features)
Y_vld = XY_vld.as_matrix(columns=['y'])
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)



  """Entry point for launching an IPython kernel.
  
  """
  


[0]	train-mlogloss:1.86337	eval-mlogloss:1.89301
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.66423	eval-mlogloss:1.69168
[2]	train-mlogloss:1.53238	eval-mlogloss:1.55829
[3]	train-mlogloss:1.43848	eval-mlogloss:1.46318
[4]	train-mlogloss:1.36804	eval-mlogloss:1.39287
[5]	train-mlogloss:1.31397	eval-mlogloss:1.33838
[6]	train-mlogloss:1.27168	eval-mlogloss:1.29697
[7]	train-mlogloss:1.23733	eval-mlogloss:1.26367
[8]	train-mlogloss:1.20965	eval-mlogloss:1.23646
[9]	train-mlogloss:1.18643	eval-mlogloss:1.21464
[10]	train-mlogloss:1.16743	eval-mlogloss:1.19714
[11]	train-mlogloss:1.15212	eval-mlogloss:1.18275
[12]	train-mlogloss:1.13871	eval-mlogloss:1.17056
[13]	train-mlogloss:1.12714	eval-mlogloss:1.16057
[14]	train-mlogloss:1.11700	eval-mlogloss:1.15249
[15]	train-mlogloss:1.10826	eval-mlogloss:1.14543
[16]	train-mlogloss:1.10075	eval-mlogloss:1.13964
[17]	trai

In [63]:
import pickle

pickle.dump(model, open("xgb.baseline.pkl", "wb"))
best_ntree_limit = model.best_ntree_limit

## 4. 검증 MAP@7

In [70]:
def apk(actual, predicted, k=7, default=0.0):
    if len(predicted) > k:
        predicted = predicted[:k]
        
    score = 0.0
    num_hits = 0.0
    
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
            
    if not actual:
        return default
    
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    return np.mean([apk(a, p, k, default) for a, p in zip(actual, predicted)])

In [72]:
vld = trn[trn['fecha_dato'] == vld_date]
ncodpers_vld = vld.as_matrix(columns=['ncodpers'])

for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev]
add_vld = vld.as_matrix(columns=[prod + '_add' for prod in prods])
add_vld_list = [list() for i in range(len(ncodpers_vld))]

count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1
            
print(mapk(add_vld_list, add_vld_list, 7, 0.0))

X_vld = vld.as_matrix(columns=features)
Y_vld = vld.as_matrix(columns=['y'])
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
preds_vld = model.predict(dvld, ntree_limit=best_ntree_limit)

preds_vld = preds_vld - vld.as_matrix(columns=[prod + '_prev' for prod in prods])

result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y,p,ip) for y, p, ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y, p, ip in y_prods])
    
print(mapk(add_vld_list, result_vld, 7, 0.0))

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
  


0.04266379915553903




0.036307454166184965


## 5. 테스트 데이터 예측

In [74]:
X_all = XY.as_matrix(columns=features)
Y_all = XY.as_matrix(columns=['y'])
dall = xgb.DMatrix(X_all, label=Y_all, feature_names = features)
watch_list = [(dall, 'train')]

best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))

model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)

print("Feature importance:")
for kv in sorted([(k, v) for k, v in model.get_fscore().items()], key=lambda kv:kv[1], reverse=True):
    print(kv)
    




  """Entry point for launching an IPython kernel.
  


[0]	train-mlogloss:1.86308
[1]	train-mlogloss:1.66298
[2]	train-mlogloss:1.53061
[3]	train-mlogloss:1.43589
[4]	train-mlogloss:1.36584
[5]	train-mlogloss:1.31136
[6]	train-mlogloss:1.26909
[7]	train-mlogloss:1.23529
[8]	train-mlogloss:1.20787
[9]	train-mlogloss:1.18543
[10]	train-mlogloss:1.16689
[11]	train-mlogloss:1.15109
[12]	train-mlogloss:1.13837
[13]	train-mlogloss:1.12708
[14]	train-mlogloss:1.11743
[15]	train-mlogloss:1.10918
[16]	train-mlogloss:1.10147
[17]	train-mlogloss:1.09484
[18]	train-mlogloss:1.08935
[19]	train-mlogloss:1.08440
[20]	train-mlogloss:1.07962
[21]	train-mlogloss:1.07567
[22]	train-mlogloss:1.07188
[23]	train-mlogloss:1.06809
[24]	train-mlogloss:1.06505
[25]	train-mlogloss:1.06230
[26]	train-mlogloss:1.05959
[27]	train-mlogloss:1.05743
[28]	train-mlogloss:1.05455
[29]	train-mlogloss:1.05206
[30]	train-mlogloss:1.04965
[31]	train-mlogloss:1.04736
[32]	train-mlogloss:1.04536
[33]	train-mlogloss:1.04314
[34]	train-mlogloss:1.04090
[35]	train-mlogloss:1.03902
[3

  


TypeError: sequence item 0: expected str instance, tuple found

In [79]:
X_tst = tst.as_matrix(columns=features)
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)
ncodpers_tst = tst.as_matrix(columns=['ncodpers'])
preds_tst= preds_tst - tst.as_matrix(columns=[prod + '_prev' for prod in prods])

  """Entry point for launching an IPython kernel.
  after removing the cwd from sys.path.
  """


In [88]:
submit_file = open('./xgb.baseline.2015-06-28', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y, p, ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y, p, ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

In [98]:
with open('./xgb.baseline_predict', 'w') as f:
    f.write('ncodpers,added_products\n')
    for ncodper, pred in zip(ncodpers_tst, preds_tst):
        y_prods = [(y,p,ip) for y, p, ip in zip(pred, prods, range(len(prods)))]
        y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
        y_prods = [p for y, p, ip in y_prods]
        f.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

In [99]:
baseline = pd.read_csv('./xgb.baseline_predict')
baseline.shape

(929615, 2)