In [1]:
import numpy as np
import pandas as pd 
from collections import defaultdict
from pprint import pprint
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
demographic_cols = ['ncodpers','fecha_alta','ind_empleado','pais_residencia','sexo','age','ind_nuevo','antiguedad','indrel',
 'indrel_1mes','tiprel_1mes','indresi','indext','conyuemp','canal_entrada','indfall',
 'tipodom','cod_prov','ind_actividad_cliente','renta','segmento']

notuse = ["ult_fec_cli_1t","nomprov",'fecha_dato']

product_col = [
 'ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1',
 'ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1','ind_plan_fin_ult1',
 'ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1','ind_nomina_ult1',
 'ind_nom_pens_ult1','ind_recibo_ult1']

# Import Data

In [3]:
df_train = pd.read_csv('cleaned_data/DataMulticlass_6_withpast2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_test = pd.read_csv('cleaned_data/TestSet_withpast3.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
pd.set_option('display.max_columns', None)

# Clean Data

### Filter Data

In [6]:
def filter_data(df):
    df = df[df['ind_nuevo'] == 0]
    df = df[df['antiguedad'] != -999999]
    df = df[df['indrel'] == 1]
    df = df[df['indresi'] == 'S']
    df = df[df['indfall'] == 'N']
    df = df[df['tipodom'] == 1]
    df = df[df['ind_empleado'] == 'N']
    df = df[df['pais_residencia'] == 'ES']
    df = df[df['indrel_1mes'] == 1]
    df = df[df['tiprel_1mes'] == ('A' or 'I')]
    df = df[df['indext'] == 'N']

In [7]:
filter_data(df_train)

### Drop unneccessary column

In [8]:
drop_column = ['ind_nuevo','indrel','indresi','indfall','tipodom','ind_empleado','pais_residencia','indrel_1mes','indext','conyuemp','fecha_alta','tiprel_1mes']

df_train.drop(drop_column, axis=1, inplace = True)
df_test.drop(drop_column, axis=1, inplace = True)

### Add missing income

In [9]:
df_test["renta"]   = pd.to_numeric(df_test["renta"], errors="coerce")
unique_prov = df_test[df_test.cod_prov.notnull()].cod_prov.unique()
grouped = df_test.groupby("cod_prov")["renta"].median()

def impute_renta(df):
    df["renta"]   = pd.to_numeric(df["renta"], errors="coerce")       
    for cod in unique_prov:
        df.loc[df['cod_prov']==cod,['renta']] = df.loc[df['cod_prov']==cod,['renta']].fillna({'renta':grouped[cod]}).values
    df.renta.fillna(df_test["renta"].median(), inplace=True)
    
impute_renta(df_train)
impute_renta(df_test)

In [10]:
def drop_na(df):
    df.dropna(axis = 0, subset = ['ind_actividad_cliente'], inplace = True)
    
drop_na(df_train)

### Convert and make dummy

In [11]:
# These column are categories feature, I'll transform them using get_dummy
dummy_col = ['sexo','canal_entrada','cod_prov','segmento']
dummy_col_select = ['canal_entrada','cod_prov']

In [12]:
limit = int(0.05 * len(df_train.index))
use_dummy_col = {}

for col in dummy_col_select:
    trainlist = df_train[col].value_counts()
    use_dummy_col[col] = []
    for i,item in enumerate(trainlist):
        if item > limit:
            use_dummy_col[col].append(df_train[col].value_counts().index[i])   

In [13]:
def get_dummy(df):
    for col in dummy_col_select:
        for item in df[col].unique(): 
            if item not in use_dummy_col[col]:
                row_index = df[col] == item
                df.loc[row_index,col] = np.nan
    return pd.get_dummies(df, prefix=dummy_col, columns = dummy_col)
    
df_train = get_dummy(df_train)
df_test = get_dummy(df_test)

In [14]:
def clean_age(df):
    df["age"]   = pd.to_numeric(df["age"], errors="coerce")
    max_age = 80 
    log_max_age = np.log(max_age) 
    square_max_age  = np.square(max_age)
    df["age"]   = df['age'].apply(lambda x: min(x ,max_age))
    df["log_age"]   = df['age'].apply(lambda x: round(np.log10(x+1)/log_max_age, 6))
    df["square_age"]   = df['age'].apply(lambda x: round(np.square(x)/square_max_age, 6))
    df["age"]   = df['age'].apply(lambda x: round( x/max_age, 6))

def clean_renta(df):
    max_renta = 1.0e6
    log_max_renta = np.log(max_renta) 
    square_max_renta  = np.square(max_renta)
    df["renta"]   = df['renta'].apply(lambda x: min(x ,max_renta))
    df["log_renta"]   = df['renta'].apply(lambda x: round(np.log10(x+1)/log_max_renta, 6))
    df["square_renta"]   = df['renta'].apply(lambda x: round(np.square(x)/square_max_renta, 6))
    df["renta"]   = df['renta'].apply(lambda x: round( x/max_renta, 6))
    
def clean_antigue(df):
    df["antiguedad"]   = pd.to_numeric(df["antiguedad"], errors="coerce")
    df["antiguedad"] = df["antiguedad"].replace(-999999, df['antiguedad'].median())
    max_antigue = 256
    log_max_antigue = np.log(max_antigue) 
    square_max_antigue  = np.square(max_antigue)
    df["antiguedad"]   = df['antiguedad'].apply(lambda x: min(x ,max_antigue))
    df["log_antiguedad"]   = df['antiguedad'].apply(lambda x: round(np.log10(x+1)/log_max_antigue, 6))
    df["square_antiguedad"]   = df['antiguedad'].apply(lambda x: round(np.square(x)/square_max_antigue, 6))
    df["antiguedad"]   = df['antiguedad'].apply(lambda x: round( x/max_antigue, 6))  

In [15]:
clean_age(df_train)
clean_age(df_test)

clean_renta(df_train)
clean_renta(df_test)

clean_antigue(df_train)
clean_antigue(df_test)



In [16]:
product_col_5 = [col for col in df_train.columns if '_ult1_5' in col]
product_col_4 = [col for col in df_train.columns if '_ult1_4' in col]
product_col_3 = [col for col in df_train.columns if '_ult1_3' in col]
product_col_2 = [col for col in df_train.columns if '_ult1_2' in col]
product_col_1 = [col for col in df_train.columns if '_ult1_1' in col]

df_train['tot'] = df_train[product_col_5].sum(axis=1)
df_test['tot'] = df_test[product_col_5].sum(axis=1)

In [17]:
for col in product_col[2:]:
    df_train[col+'_past'] = (df_train[col+'_5']+df_train[col+'_4']+df_train[col+'_3']+df_train[col+'_2']+df_train[col+'_1'])/5
    df_test[col+'_past'] = (df_test[col+'_5']+df_test[col+'_4']+df_test[col+'_3']+df_test[col+'_2']+df_test[col+'_1'])/5

In [18]:
for pro in product_col[2:]:
    df_train[pro+'_past'] = df_train[pro+'_past']*(1-df_train[pro+'_5'])
    df_test[pro+'_past'] = df_test[pro+'_past']*(1-df_test[pro+'_5'])

# Model

In [None]:
def runXGB(train_X, train_y, colsample_bytree=0.9, max_depth= 6, eta=0.1, min_child_weight=2, subsample=0.9, num_rounds=150):
    param = {}
    param['objective'] = 'multi:softprob'
    param['seed'] = 0
    param['silent'] = 0
    param['eval_metric'] = "mlogloss"
    param['booster'] = 'gbtree'
    param['num_class'] = 22
    param['reg_lambda'] =100
    param['colsample_bytree'] = colsample_bytree
    param['max_depth'] = max_depth 
    param['eta'] = eta
    param['min_child_weight'] = min_child_weight
    param['subsample'] = subsample
    num_round = num_rounds

    progress = dict()
    plst = list(param.items())
    
    #xgtrain = xgb.DMatrix(train_X.loc[df_train.index.values%10!=0], label=train_y.loc[df_train.index.values%10!=0])
    #xgtest = xgb.DMatrix(train_X.loc[df_train.index.values%10==0], label=train_y.loc[df_train.index.values%10==0])
    #watchlist  = [(xgtrain,'train'),(xgtest,'test')]
    #model = xgb.train(plst, xgtrain, num_rounds,watchlist,evals_result=progress)
    
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    watchlist  = [(xgtrain,'train')]
    model = xgb.train(plst, xgtrain, int(num_rounds/0.9), watchlist, evals_result=progress)
    return (model, progress)

In [None]:
cols = list(df_train.drop(['target','ncodpers'], 1).columns.values)

id_preds = defaultdict(list)
ids = df_test['ncodpers'].values

# predict model 
y_train = df_train['target']
x_train = df_train[cols]
    
(clf, progress) = runXGB(x_train, y_train)
          
x_test = df_test[cols]
x_test = x_test.fillna(0) # check this
        
d_test = xgb.DMatrix(x_test)
p_test = clf.predict(d_test)
        
for id, p in zip(ids, p_test):
    #id_preds[id] = list(p)
    id_preds[id] = [0,0] + list(p)

In [None]:
plt.plot(progress['test']['mlogloss'])
plt.plot(progress['train']['mlogloss'])

# Model 2: Product Ranking 

# Make submission

In [None]:
df_recent =  pd.read_csv('cleaned_data/df_recent.csv')

In [None]:
sample = pd.read_csv('input/sample_submission.csv')

In [None]:
# check if customer already have each product or not. 
already_active = {}
for row in df_recent.values:
    row = list(row)
    id = row.pop(0)
    active = [c[0] for c in zip(tuple(product_col), row) if c[1] > 0]
    already_active[id] = active

# add 7 products(that user don't have yet), higher probability first -> train_pred   
train_preds = {}
for id, p in id_preds.items():
    preds = [i[0] for i in sorted([i for i in zip(tuple(product_col), p) if i[0] not in already_active[id]],
                                  key=lambda i:i [1], 
                                  reverse=True)[:7]]
    train_preds[id] = preds
    
test_preds = []
for row in sample.values:
    id = row[0]
    p = train_preds[id]
    test_preds.append(' '.join(p))


In [None]:
sample.shape

In [None]:
sample['added_products'] = test_preds
sample.to_csv('output/XGBmulticlass_withpast5.csv', index=False)

# Validation part

In [20]:
cols = list(df_train.drop(['target','ncodpers'], 1).columns.values)

id_preds = defaultdict(list)
ids = df_test['ncodpers'].values
 
# predict model 
y_train = df_train['target']
x_train = df_train[cols]

print("Validating...")

param = {}
param['objective'] = 'multi:softprob'
param['seed'] = 0
param['silent'] = 0
param['eval_metric'] = "mlogloss"
param['booster'] = 'gbtree'
param['eta'] = 0.1
param['num_class'] = 22
param['colsample_bytree'] = 0.9
param['subsample'] = 0.9
param['max_depth'] = 6
param['min_child_weight'] = 2
param['reg_lambda'] =100
num_round = 150

plst = list(param.items())
  
xgtrain = xgb.DMatrix(x_train, label=y_train)

xgb.cv(param, xgtrain, num_round, nfold=3,
       metrics={'mlogloss'}, seed = 0,
       callbacks=[xgb.callback.print_evaluation(show_stdv=True)])

Validating...
[0]	train-mlogloss:2.72088+0.00153514	test-mlogloss:2.7226+0.000930488
[1]	train-mlogloss:2.4869+0.00621075	test-mlogloss:2.48924+0.0100276
[2]	train-mlogloss:2.30551+0.00410903	test-mlogloss:2.30883+0.00869148
[3]	train-mlogloss:2.16584+0.0029511	test-mlogloss:2.16984+0.00812009
[4]	train-mlogloss:2.04793+0.00155158	test-mlogloss:2.05291+0.00714633
[5]	train-mlogloss:1.94851+0.00157262	test-mlogloss:1.95431+0.00726233
[6]	train-mlogloss:1.86207+0.00152311	test-mlogloss:1.86866+0.00674262
[7]	train-mlogloss:1.78633+0.00137277	test-mlogloss:1.79355+0.00669338
[8]	train-mlogloss:1.71954+0.00139916	test-mlogloss:1.72742+0.00621573
[9]	train-mlogloss:1.6604+0.00150112	test-mlogloss:1.66905+0.0061865
[10]	train-mlogloss:1.60793+0.00171367	test-mlogloss:1.61722+0.00598067
[11]	train-mlogloss:1.56051+0.00183211	test-mlogloss:1.57038+0.00615759
[12]	train-mlogloss:1.51787+0.00195822	test-mlogloss:1.52839+0.00625863
[13]	train-mlogloss:1.47907+0.00209964	test-mlogloss:1.49024+0.00

Unnamed: 0,test-mlogloss-mean,test-mlogloss-std,train-mlogloss-mean,train-mlogloss-std
0,2.722604,0.000930,2.720884,0.001535
1,2.489237,0.010028,2.486899,0.006211
2,2.308827,0.008691,2.305514,0.004109
3,2.169842,0.008120,2.165842,0.002951
4,2.052907,0.007146,2.047926,0.001552
5,1.954306,0.007262,1.948510,0.001573
6,1.868662,0.006743,1.862075,0.001523
7,1.793553,0.006693,1.786333,0.001373
8,1.727420,0.006216,1.719537,0.001399
9,1.669053,0.006186,1.660403,0.001501


cv-mlogloss: 0.988865+std0.00640186