In [1]:
import pandas as pd
import numpy as np
import datetime
import gc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm
import xgboost as xgb

In [2]:
TRAIN_MONTH = '2015_05_28'
TEST_MONTH = '2016_05_28'

TRAIN_FILE = 'data/train_' + TRAIN_MONTH + '.csv'
ADDED_PRODUCTS_FILE = 'data/added_product_' + TRAIN_MONTH + '.csv'

TEST_FILE = 'data/train_' + TEST_MONTH + '.csv'

HEADER = ["fecha_dato", "ncodpers", "ind_empleado",
          "pais_residencia", "sexo", "age", "fecha_alta",
          "ind_nuevo", "antiguedad", "indrel", "ult_fec_cli_1t",
          "indrel_1mes", "tiprel_1mes", "indresi", "indext",
          "conyuemp", "canal_entrada", "indfall", "tipodom",
          "cod_prov", "nomprov", "ind_actividad_cliente",
          "renta", "segmento", "ind_ahor_fin_ult1",
          "ind_aval_fin_ult1", "ind_cco_fin_ult1",
          "ind_cder_fin_ult1", "ind_cno_fin_ult1",
          "ind_ctju_fin_ult1", "ind_ctma_fin_ult1",
          "ind_ctop_fin_ult1", "ind_ctpp_fin_ult1",
          "ind_deco_fin_ult1", "ind_deme_fin_ult1",
          "ind_dela_fin_ult1", "ind_ecue_fin_ult1",
          "ind_fond_fin_ult1", "ind_hip_fin_ult1",
          "ind_plan_fin_ult1", "ind_pres_fin_ult1",
          "ind_reca_fin_ult1", "ind_tjcr_fin_ult1",
          "ind_valo_fin_ult1", "ind_viv_fin_ult1",
          "ind_nomina_ult1", "ind_nom_pens_ult1",
          "ind_recibo_ult1"]

In [4]:
train = pd.read_csv(TRAIN_FILE, header=None, names=HEADER)
test = pd.read_csv(TEST_FILE, header=None, names=HEADER)
added_products = pd.read_csv(ADDED_PRODUCTS_FILE)

combined = pd.concat((train, test)).reset_index(drop=True)

In [5]:
# fixing age
combined['age'] = pd.to_numeric(combined['age'], errors='coerce')
# test['age'] = pd.to_numeric(test['age'], errors='coerce')

combined.loc[combined.age < 18, "age"] = combined.loc[(combined.age > 18) & (combined.age <= 30), "age"].mean(skipna=True)
combined.loc[combined.age > 100, "age"] = combined.loc[(combined.age > 30) & (combined.age <=100), "age"].mean(skipna=True)
combined['age'].fillna(combined['age'].mean(), inplace=True)
combined['age'] = combined['age'].astype(int)

# fix ind_nuevo.. 
combined.loc[combined.ind_nuevo.isnull(), 'ind_nuevo'] = 1

# fix antiguedad
combined['antiguedad'] = pd.to_numeric(combined['antiguedad'], errors='coerce')
combined.loc[combined.antiguedad.isnull(),'antiguedad'] = combined.antiguedad.min()
combined.loc[combined.antiguedad < 0, 'antiguedad'] = 0

# fix indrel
combined.loc[combined.indrel.isnull(), 'indrel'] = 1

# drop useless cols
combined.drop(['tipodom', 'cod_prov'], axis=1, inplace=True)

# fix ind_actividad_cliente
# combined.ind_actividad_cliente = pd.to_numeric(combined.ind_actividad_cliente, errors='coerce')
combined.loc[combined.ind_actividad_cliente.isnull(), "ind_actividad_cliente"] =\
    combined.ind_actividad_cliente.median()

# fix city name
combined.loc[combined.nomprov=="CORU\xc3\x91A, A","nomprov"] = "CORUNA, A"
combined.loc[combined.nomprov.isnull(), 'nomprov'] = 'UNKNOWN'

#fix incomes
# combined.renta = pd.to_numeric(combined.renta, errors='coerce')
grouped = combined.groupby('nomprov').agg({'renta': lambda x: x.median(skipna=True)}).reset_index()
new_incomes = pd.merge(combined, grouped, how='inner', on='nomprov').loc[:,['nomprov', 'renta_y']]

new_incomes = new_incomes.rename(columns={"renta_y":"renta"}).sort_values("renta").sort_values("nomprov")

combined.sort_values("nomprov", inplace=True)
combined = combined.reset_index()
new_incomes = new_incomes.reset_index()
combined.loc[combined.renta.isnull(), "renta"] = new_incomes.loc[combined.renta.isnull(), "renta"].median()
combined.sort_values(by='fecha_dato', inplace = True)

# rest of the columns
string_data = combined.select_dtypes(include=["object"])
missing_columns = [col for col in string_data if string_data[col].isnull().any()]
del string_data

combined.loc[combined.indfall.isnull(), 'indfall'] = 'N'
combined.loc[combined.tiprel_1mes.isnull(), 'tiprel_1mes'] = 'A'
combined.tiprel_1mes = combined.tiprel_1mes.astype('category')

map_dict = {
    '1.0': '1',
    '1': '1',
    '3.0': '3',
    'P': 'P',
    3.0: '3',
    2.0: '2',
    '3': '3',
    '2.0': '2',
    '4.0': '4',
    '4': '4',
    '2': '2',
    1.0: '1',
    4.0: '4'
}

combined.indrel_1mes.fillna('P', inplace=True)
combined.indrel_1mes = combined.indrel_1mes.apply(lambda x: map_dict[x])
combined.indrel_1mes = combined.indrel_1mes.astype('category')

unknown_cols = [col for col in missing_columns if col not in ['indfall', 'tiprel_1mes', 'indrel_1mes']]
for col in unknown_cols:
    combined.loc[combined[col].isnull(), col] = "UNKNOWN"

# feature cols
feature_cols = combined.iloc[:1,].filter(regex="ind_+.*ult.*").columns.values
for col in feature_cols:
    combined.loc[combined[col].isnull(), col] = 0
    combined[col] = combined[col].astype(int)

del combined['ult_fec_cli_1t'], combined['fecha_alta']

encoders = []
for col in ['sexo', 'indrel_1mes','pais_residencia', 'ind_empleado', 'segmento', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada','indfall', 'nomprov']:
    temp_enc = LabelEncoder()
    temp_enc.fit(combined[col])
    combined[col] = temp_enc.transform(combined[col])
    encoders.append(temp_enc)

In [17]:
train = combined.loc[combined.fecha_dato == '2015-05-28', :].reset_index(drop=True)
test = combined.loc[combined.fecha_dato == '2016-05-28', :].reset_index(drop=True)

del train['index'], test['index']
del train['fecha_dato'], test['fecha_dato']

added_products.set_index('ncodpers', inplace=True)

label_encoder = LabelEncoder()
label_encoder.fit(added_products)
added_products['encoded_products'] = label_encoder.transform(added_products['added_product'])

In [21]:
train.set_index('ncodpers', inplace=True)

xTrain = train.loc[added_products.index, :]

print train.shape
print xTrain.shape
print added_products.shape

param = {}
param['objective'] = 'multi:softprob'
param['eta'] = 0.05
param['max_depth'] = 6
param['silent'] = 0
param['num_class'] = 22
param['eval_metric'] = "mlogloss"
param['min_child_weight'] = 2
param['subsample'] = 0.9
param['colsample_bytree'] = 0.9
param['seed'] = 1428
num_rounds = 100
plist = param.items()

xg_train = xgb.DMatrix(xTrain, label=added_products.loc[:, 'encoded_products'])
evallist  = [(xg_train,'train')]

In [26]:
xgb_model = xgb.train(plist, xg_train, num_rounds, evallist)

[0]	train-mlogloss:2.90573
[1]	train-mlogloss:2.77231
[2]	train-mlogloss:2.66007
[3]	train-mlogloss:2.56051
[4]	train-mlogloss:2.47907
[5]	train-mlogloss:2.4025
[6]	train-mlogloss:2.3346
[7]	train-mlogloss:2.27324
[8]	train-mlogloss:2.21839
[9]	train-mlogloss:2.17039
[10]	train-mlogloss:2.1228
[11]	train-mlogloss:2.07973
[12]	train-mlogloss:2.03906
[13]	train-mlogloss:2.00169
[14]	train-mlogloss:1.9667
[15]	train-mlogloss:1.93406
[16]	train-mlogloss:1.90349
[17]	train-mlogloss:1.87471
[18]	train-mlogloss:1.84758
[19]	train-mlogloss:1.82212
[20]	train-mlogloss:1.79804
[21]	train-mlogloss:1.7752
[22]	train-mlogloss:1.75334
[23]	train-mlogloss:1.73284
[24]	train-mlogloss:1.71348
[25]	train-mlogloss:1.69481
[26]	train-mlogloss:1.67756
[27]	train-mlogloss:1.66104
[28]	train-mlogloss:1.64494
[29]	train-mlogloss:1.62931
[30]	train-mlogloss:1.61459
[31]	train-mlogloss:1.60047
[32]	train-mlogloss:1.58696
[33]	train-mlogloss:1.57406
[34]	train-mlogloss:1.56158
[35]	train-mlogloss:1.54969
[36]	tr

In [27]:
test.set_index('ncodpers', inplace=True)
xg_test = xgb.DMatrix(test)

preds = xgb_model.predict(xg_test)

top_t_products = label_encoder.inverse_transform(np.argsort(preds, axis = 1)[:, ::-1][:, :])

In [None]:
t_head = test.head()

In [None]:
t_p = [' '.join(x) for x in tt]

In [30]:
test['xgb_preds'] = [' '.join(x) for x in top_t_products]
test['added_products'] = ['ind_recibo_ult1']*test.shape[0]

In [31]:
for i in tqdm(test.index):
    products = map(lambda x: x[0], filter(lambda x: x[1]==1, zip(HEADER[24:], test.loc[i, 'ind_ahor_fin_ult1':'ind_recibo_ult1'])))
    pred_products = test.loc[i, 'xgb_preds'].split()
    prod_string = ' '.join(filter(lambda x: x not in products, pred_products))
    test.set_value(i, 'added_products', prod_string)

100%|██████████| 931453/931453 [19:17<00:00, 804.63it/s]


In [33]:
submission = pd.read_csv('data/test_ver2.csv', usecols=[1])
submission['added_products'] = ['ind_recibo_ult1']*submission.shape[0]
submission.set_index('ncodpers', inplace=True)
submission.added_products = test.loc[submission.index, 'added_products']

In [None]:
for i in t_head.index:
    products = map(lambda x: x[0], filter(lambda x: x[1]==1, zip(HEADER[24:], t_head.loc[i, 'ind_ahor_fin_ult1':'ind_recibo_ult1'])))
    pred_products = t_head.loc[i, 'xgb_preds'].split()
    prod_string = ' '.join(filter(lambda x: x not in products, pred_products))
    t_head.set_value(i, 'added_products', prod_string)
    

In [36]:
from scripts.kaggle.helpers import make_submission
filename = 'data/submissions/my_first_xgb_sub_trained_on_jun_2015_only_added_users.csv'
description = 'my first xgboost submission on jun 2015 only added products data'
submission.to_csv(filename, columns=['added_products'])
make_submission(filename, description=description, submit=True, compress=True)
# make_submission('data/submissions/2_bayesian_prob_products_counters_with_old_1_ignored.csv', description="2. based on bayesian probabilities, where probs are calculated only for prods when there is a transition from 0 to 1", submit=True, compress=True)


Compressing file
Compression done
Uploading submission data/submissions/my_first_xgb_sub_trained_on_jun_2015_only_added_users.csv.zip
Upload done


In [35]:
submission.head()

Unnamed: 0_level_0,added_products
ncodpers,Unnamed: 1_level_1
15889,ind_recibo_ult1 ind_reca_fin_ult1 ind_ctop_fin...
1170544,ind_recibo_ult1 ind_reca_fin_ult1 ind_nom_pens...
1170545,ind_recibo_ult1 ind_nom_pens_ult1 ind_nomina_u...
1170547,ind_recibo_ult1 ind_nomina_ult1 ind_nom_pens_u...
1170548,ind_recibo_ult1 ind_nomina_ult1 ind_reca_fin_u...
