In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import polars as pl
import gc
import polars as pl


np.random.seed(2018)

In [3]:
# データを呼び出します。
trn = pd.read_csv('../data/input/train_ver2.csv')
tst = pd.read_csv('../data/input/test_ver2.csv')

# trn = pl.read_csv('../data/input/train_ver2.csv', null_values=[' NA', 'NA','     NA', 'P'])
# tst = pl.read_csv('../data/input/test_ver2.csv', null_values=[' NA', 'NA','     NA'])

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
print(trn.shape)
print(tst.shape)

(13647309, 48)
(929615, 24)


In [5]:
# import numpy as np

def apk(actual, predicted, k=7, default=0.0):
    # AP@7なので、最大7個まで使用します。
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # 点数を付与する条件は次のとおり :
        # 予測値が正答に存在し (‘p in actual’)
        # 予測値に重複がなければ (‘p not in predicted[:i]’) 
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    # 正答値が空白である場合、ともかく 0.0点を返します。
    if not actual:
        return default

    # 正答の個数(len(actual))として average precisionを求めます。
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    # list of listである正答値(actual)と予測値(predicted)から顧客別 Average Precisionを求め, np.mean()を通して平均を計算します。
    return np.mean([apk(a, p, k, default) for a, p in zip(actual, predicted)]) 

In [6]:
## データの前処理 ##

# 製品の変数を別途に保存しておきます。
prods = trn.columns[24:].tolist()

# 製品変数の欠損値をあらかじめ0に代替しておきます。
trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

# 24個の製品を1つも保有していない顧客のデータを除去します。
no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]


## あとで消す
# メモリに乗らないのでサンプリング
trn = trn.sample(1000000)
print(trn.shape)

# 訓練データとテストデータを統合します。テストデータにない製品変数は0で埋めます。
for col in trn.columns[24:]:
    tst[col] = 0
df = pd.concat([trn, tst], axis=0)

(1000000, 48)


In [7]:
# 学習に使用する変数を入れるlistです。
features = []

# カテゴリ変数を .factorize() 関数に通して label encodingします。
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols

# 数値型変数の特異値と欠損値を -99に代替し、整数型に変換します。
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)

df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)

# 学習に使用する数値型変数を featuresに追加します。
features += ['age','antiguedad','renta','ind_nuevo','indrel','indrel_1mes','ind_actividad_cliente']

# (特徴量エンジニアリング) 2つの日付変数から年度と月の情報を抽出します。
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']

df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']

# それ以外の変数の欠損値をすべて -99に代替します。
df.fillna(-99, inplace=True)


In [8]:
# (特徴量エンジニアリング) lag-1 データを生成します。
# コード 2-12と類似したコードの流れです

# 日付を数字に変換する関数です。 2015-01-28は 1, 2016-06-28は 18に変換します。
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")] 
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date

# 日付を数字に変換し int_dateに保存します。
df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)

# データをコピーし, int_date 日付に1を加え lagを生成します。変数名に _prevを追加します。
df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers', 'int_date'] else col for col in df.columns ]
df_lag['int_date'] += 1

In [9]:
# 原本データと lag データを ncodperと int_date を基準として合わせます。lag データの int_dateは 1 だけ押されているため、前の月の製品情報が挿入されます。
df_trn = df.merge(df_lag, on=['ncodpers','int_date'], how='left')

# メモリの効率化のために、不必要な変数をメモリから除去します。
del df, df_lag
gc.collect()

46

In [10]:
# 前の月の製品情報が存在しない場合に備えて、0に代替します。
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
df_trn.fillna(-99, inplace=True)

# lag-1 変数を追加します。
features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]

In [11]:
df_trn.head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_tjcr_fin_ult1_prev,ind_valo_fin_ult1_prev,ind_viv_fin_ult1_prev,ind_nomina_ult1_prev,ind_nom_pens_ult1_prev,ind_recibo_ult1_prev,fecha_alta_month_prev,fecha_alta_year_prev,ult_fec_cli_1t_month_prev,ult_fec_cli_1t_year_prev
0,2015-09-28,1082739,0,0,0,25,2012-10-17,0.0,35,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
1,2015-12-28,735499,0,0,0,44,2007-12-01,0.0,73,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
2,2015-07-28,1385732,0,0,0,27,2015-03-03,1.0,5,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
3,2016-04-28,1080490,0,0,1,26,2012-10-15,0.0,42,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
4,2015-04-28,1073250,0,0,0,22,2012-10-02,0.0,33,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
5,2015-07-28,459441,0,0,0,41,2004-02-14,0.0,-119,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
6,2016-05-28,1417088,0,0,0,20,2015-07-25,0.0,10,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
7,2015-02-28,818008,0,0,1,35,2008-11-19,0.0,80,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
8,2016-04-28,1506967,0,0,1,21,2015-11-19,1.0,5,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,-99.0,-99.0
9,2015-11-28,1331542,0,0,1,26,2014-10-13,0.0,13,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,2014.0,0.0,0.0


In [12]:
###
### Baseline モデル以後、多様な特徴量エンジニアリングをここに追加します。
###


## モデル学習
# 学習のため、データを訓練、検証用に分離します。
# 学習には 2016-01-28 ~ 2016-04-28 のデータだけを使用し、検証には 2016-05-28 のデータを使用します。
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']
del df_trn



In [13]:
# 訓練データから新規購買件数だけを抽出します。
X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y

# 訓練、検証データに分離します。
vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

In [14]:
# XGBoost モデルの parameterを設定します。
param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
    }

# 訓練、検証データを XGBoost 形態に変換します。
X_trn = XY_trn[features].values
Y_trn = XY_trn['y'].values
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

X_vld = XY_vld[features].values
Y_vld = XY_vld['y'].values
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

# XGBoost モデルを訓練データで学習させます！
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=10, evals=watch_list, early_stopping_rounds=20)

# 学習したモデルを保存します。
import pickle
pickle.dump(model, open("xgb.baseline.pkl", "wb"))
best_ntree_limit = model.best_ntree_limit

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:2.78055	eval-mlogloss:2.79252
[1]	train-mlogloss:2.60405	eval-mlogloss:2.62112
[2]	train-mlogloss:2.47473	eval-mlogloss:2.49480
[3]	train-mlogloss:2.37417	eval-mlogloss:2.39668
[4]	train-mlogloss:2.29027	eval-mlogloss:2.31460
[5]	train-mlogloss:2.21904	eval-mlogloss:2.24500
[6]	train-mlogloss:2.15778	eval-mlogloss:2.18517
[7]	train-mlogloss:2.10410	eval-mlogloss:2.13279
[8]	train-mlogloss:2.05691	eval-mlogloss:2.08673
[9]	train-mlogloss:2.01527	eval-mlogloss:2.04621


In [15]:
# MAP@7 評価基準のための準備作業です。
# 顧客識別番号を抽出します。
vld = trn[trn['fecha_dato'] == vld_date]
ncodpers_vld = vld['ncodpers'].values
# 検証データから新規購買を求めます。
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev]    
add_vld = vld[[prod + '_add' for prod in prods]].values
add_vld_list = [list() for i in range(len(ncodpers_vld))]

# 顧客別新規購買正答値を add_vld_listに保存し、総 countを count_vldに保存します。
count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":


In [16]:
# 検証データから得ることのできる MAP@7 の最高点をあらかじめ求めておきます。(0.042663)
print(mapk(add_vld_list, add_vld_list, 7, 0.0))

0.9151528755351045


In [17]:
# 検証データに対する予測値を求めます。
X_vld = vld[features].values
# Y_vld = vld['y'].values
# dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
dvld = xgb.DMatrix(X_vld, feature_names=features)
preds_vld = model.predict(dvld, ntree_limit=best_ntree_limit)

# 前の月に保有していた商品は新規購買が不可能なので、確率値からあらかじめ1を引いておきます。
preds_vld = preds_vld - vld[[prod + '_prev' for prod in prods]].values



In [18]:
print(ncodpers_vld.shape)
ncodpers_vld

(62371,)


array([1417088,  289106, 1466296, ..., 1303736,  575408, 1400690])

In [19]:
print(preds_vld.shape)
preds_vld

(62371, 24)


array([[ 0.01961433,  0.019596  ,  0.54556876, ...,  0.01962773,
         0.01966964,  0.02018902],
       [ 0.02344646,  0.02342454,  0.190479  , ...,  0.02348462,
         0.02354543,  0.02390127],
       [ 0.01964287,  0.01962451,  0.54636246, ...,  0.01971952,
         0.01976213,  0.02046088],
       ...,
       [ 0.02217513,  0.0221544 ,  0.35454074, ...,  0.02221122,
         0.02240133,  0.02624249],
       [ 0.02030805,  0.02028907, -0.91057862, ...,  0.03747312,
         0.04041447,  0.2178639 ],
       [ 0.02420148,  0.02417886,  0.20082623, ...,  0.04796703,
         0.04985204,  0.11236265]])

In [20]:
# 検証データの予測上位7個を抽出します。
result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y,p,ip in y_prods])

In [21]:
# 検証データの MAP@7の点数を求めます。(0.036466)
print(mapk(add_vld_list, result_vld, 7, 0.0))

0.776599666973404


In [74]:
# XGBoost モデルを全体の訓練データで学習します。
X_all = XY[features].values
Y_all = XY['y'].values
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]
# ツリーの個数を増加したデータの量に比例して増やします。
best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))
# XGBoost モデル再学習！
model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)

# 変数の重要度を出力してみます。予想していた変数が上位に来ていますか？
print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)

# Kaggleに提出するため、テストデータに対する予測値を求めます。
X_tst = tst[features].values
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)
ncodpers_tst = tst['ncodpers'].values
preds_tst = preds_tst - tst[[prod + '_prev' for prod in prods]].values



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:2.78286
[1]	train-mlogloss:2.60728
[2]	train-mlogloss:2.47849
[3]	train-mlogloss:2.37827
[4]	train-mlogloss:2.29468
[5]	train-mlogloss:2.22362
[6]	train-mlogloss:2.16257
[7]	train-mlogloss:2.10908
[8]	train-mlogloss:2.06201
[9]	train-mlogloss:2.02054
[10]	train-mlogloss:1.98320
[11]	train-mlogloss:1.95005
[12]	train-mlogloss:1.91980
[13]	train-mlogloss:1.89287
[14]	train-mlogloss:1.86801
Feature importance:
('antiguedad', 4995.0)
('age', 4864.0)
('renta', 3643.0)
('nomprov', 2781.0)
('canal_entrada', 2477.0)
('fecha_alta_month', 2435.0)
('fecha_alta_year', 2347.0)
('segmento', 1018.0)
('sexo', 861.0)
('tiprel_1mes', 620.0)
('ind_actividad_cliente', 566.0)
('in



In [None]:
make_submission(io.BytesIO() if cv else gzip.open("tmp/%s.lightgbm.csv.gz" % str_date, "wb"), Y_test_lgbm - Y_prev, C)

In [57]:
def make_submission(f, Y_test, C):
    
    Y_ret = []
    
    # ファイルの最初の行にheaderを書き込みます。
    f.write("ncodpers,added_products\n".encode('utf-8'))
    # 顧客識別番号(C)と、予測結果(Y_test)の for loop
    for c, y_test in zip(C, Y_test):
        # (確率値、金融変数名、金融変数id)の tupleを求めます。
        # y_prods = [(y,p,ip) for y,p,ip in zip(y_test, products, range(len(products)))]
        y_prods = [[y,p,ip] for y,p,ip in zip(y_test, products, range(len(products)))]
        # 確率値をもとに、上位7個の結果だけを抽出します。
        y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
        # 金融変数idを Y_retに保存します。
        Y_ret.append([ip for y,p,ip in y_prods])
        y_prods = [p for y,p,ip in y_prods]
        # ファイルに “顧客識別番号、7個の金融変数”を書き込みます。
        f.write(("%s,%s\n" % (int(c), " ".join(y_prods))).encode('utf-8'))
    # 上位7個の予測値を返します。
    return Y_ret

In [65]:
ncodpers_tst.shape

(929615,)

In [66]:
preds_tst.shape

(929615, 24)

In [79]:
str_date = '20230227'

# 提出ファイルを生成します。
submit_file = open('xgb.baseline.2015-06-28', 'w')
# submit_file = open("%s.lightgbm.csv" % str_date, "wb")
submit_file.write('ncodpers,added_products\n')
# submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

In [82]:
# test_preds = []
# for row in sample.values:
#     id = row[0]
#     p = train_preds[id]
#     test_preds.append(' '.join(p))

# sample['added_products'] = test_preds
# sample.to_csv('collab_sub.csv', index=False)

# submit_file = pd.DataFrame()
# C_list = []
# test_preds = []
# for ncodper, pred in zip(ncodpers_tst, preds_tst):
#     y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
#     y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
#     y_prods = [p for y,p,ip in y_prods]
#     C_list.append(ncodper)
#     test_preds.append(' '.join(y_prods))
 
# submit_file['ncodpers'] = C_list
# submit_file['added_products'] = test_preds
# submit_file.to_csv('collab_sub.csv', index=False)
    
# submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

In [None]:
def make_submission(ncodpers_tst, preds_tst)

    submit_file = pd.DataFrame()
    C_list = []
    test_preds = []
    for ncodper, pred in zip(ncodpers_tst, preds_tst):
        y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
        y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
        y_prods = [p for y,p,ip in y_prods]
        C_list.append(ncodper)
        test_preds.append(' '.join(y_prods))

    submit_file['ncodpers'] = C_list
    submit_file['added_products'] = test_preds
    submit_file.to_csv('../output/collab_sub.csv', index=False)

In [83]:
submit_file.shape

(929615, 2)

In [29]:
products = prods.copy()

In [28]:
preds_tst.shape

(929615, 24)

In [None]:
zip((1,),(10,))

In [58]:
# ファイル圧縮用途
import gzip    
import pickle    
import zlib    

# products = prods.copy()

make_submission(gzip.open("20230226.xgboost-lightgbm.csv.gz", "wb"), ncodpers_tst, preds_tst)

['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']


TypeError: zip argument #1 must support iteration