In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from scripts.kaggle.helpers import make_submission
import datetime
from tqdm import tqdm
import gc
import itertools
from multiprocessing import Pool
import pickle
import collections
import math

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# %pylab inline
# pylab.rcParams['figure.figsize'] = (12, 16)
plt.figure(figsize=(12,16))

%load_ext autotime

<matplotlib.figure.Figure at 0x7fbf70412510>

In [2]:
HEADER = ["fecha_dato", "ncodpers", "ind_empleado",
          "pais_residencia", "sexo", "age", "fecha_alta",
          "ind_nuevo", "antiguedad", "indrel", "ult_fec_cli_1t",
          "indrel_1mes", "tiprel_1mes", "indresi", "indext",
          "conyuemp", "canal_entrada", "indfall", "tipodom",
          "cod_prov", "nomprov", "ind_actividad_cliente",
          "renta", "segmento", "ind_ahor_fin_ult1",
          "ind_aval_fin_ult1", "ind_cco_fin_ult1",
          "ind_cder_fin_ult1", "ind_cno_fin_ult1",
          "ind_ctju_fin_ult1", "ind_ctma_fin_ult1",
          "ind_ctop_fin_ult1", "ind_ctpp_fin_ult1",
          "ind_deco_fin_ult1", "ind_deme_fin_ult1",
          "ind_dela_fin_ult1", "ind_ecue_fin_ult1",
          "ind_fond_fin_ult1", "ind_hip_fin_ult1",
          "ind_plan_fin_ult1", "ind_pres_fin_ult1",
          "ind_reca_fin_ult1", "ind_tjcr_fin_ult1",
          "ind_valo_fin_ult1", "ind_viv_fin_ult1",
          "ind_nomina_ult1", "ind_nom_pens_ult1",
          "ind_recibo_ult1"]

time: 7.44 ms


In [3]:
def get_persons_of_feature(train):
    persons_of_feature = dict()
    for feat in HEADER[24:]:
        persons_of_feature[feat] = set(train.loc[train[feat] == 1]["ncodpers"])
    return persons_of_feature
# for feat in header[24:]:
#     print feat, len(persons_of_feature[feat])

time: 2.97 ms


In [4]:
def jacobian_similarity(persons_of_feature, featur1, featur2):
    feat1 = persons_of_feature[featur1]
    feat2 = persons_of_feature[featur2]
    len1 = len(feat1)
    len2 = len(feat2)
    intersection = len(feat1.intersection(feat2))
    union = len1 + len2 - intersection
    if len1 == 0 or len1 == 0:
        return 0.0
    return intersection * 1.0 / union

def cosine_similarity(persons_of_feature, featur1, featur2):
    feat1 = persons_of_feature[featur1]
    feat2 = persons_of_feature[featur2]
    len1 = len(feat1)
    len2 = len(feat2)
    intersection = len(feat1.intersection(feat2))
    # union = len1 + len2 - intersection
    if len1 == 0 or len2 == 0:
        return 0.0
    return intersection * 1.0 / math.sqrt(len1*len2)

time: 9.51 ms


In [5]:
def get_similarity_matrices(train_df):
    persons_of_feature = get_persons_of_feature(train_df)
    jacob_matrix = collections.defaultdict(dict)
    cosine_matrix = collections.defaultdict(dict)
    for i in range(24):
        jacob_matrix[i][i] = 1.0
        cosine_matrix[i][i] = 1.0
        for j in range(i + 1, 24):
            temp = jacobian_similarity(persons_of_feature, HEADER[i + 24], HEADER[j+24])
            jacob_matrix[i][j] = temp
            jacob_matrix[j][i] = temp

            temp1 = cosine_similarity(persons_of_feature, HEADER[i+24], HEADER[j+24])
            cosine_matrix[i][j] = temp1
            cosine_matrix[j][i] = temp1
    cosine_pd = pd.DataFrame(cosine_matrix)
    jacob_pd = pd.DataFrame(jacob_matrix)
    return jacob_pd, cosine_pd


time: 9.14 ms


In [6]:
def get_wts_from_mat(df, row):
    brow = map(bool, row)
    wts = df.loc[brow, :].apply(sum)
    wts[brow] = 0
    return wts

time: 2.16 ms


In [7]:
def get_similarity_features(file_names): #train_file, added_file):
    train_file = file_names[0]
    added_file = file_names[1]
    print added_file
    train = pd.read_csv(train_file, header=None, names=HEADER[1:2] + HEADER[24:], usecols=[1]+range(24,48))
    train.fillna(0, inplace=True)
    for col in HEADER[24:]:
        train[col] = train[col].astype(int)
    if added_file == 'data/test_ver2.csv':
        added_products = pd.read_csv(added_file, usecols=[1])
    else:
        added_products = pd.read_csv(added_file, usecols=[0])
    jacob_pd, cosine_pd = get_similarity_matrices(train)
    train.set_index('ncodpers', inplace=True)
    train_final = pd.DataFrame()
    train_final['ncodpers'] = added_products['ncodpers']
    train_final.set_index('ncodpers', inplace=True)
    for col in HEADER[24:]:
        train_final['jacob_' + col] = [0.0]*train_final.shape[0]
        train_final['cosine_' + col] = [0.0]*train_final.shape[0]

    for i in tqdm(train_final.index):
        row = list(train.loc[i, HEADER[24:]])
        jwts = get_wts_from_mat(jacob_pd, row)
        cwts = get_wts_from_mat(cosine_pd, row)
        train_final.loc[i, ['jacob_' + h for h in HEADER[24:]]] = list(jwts)
        train_final.loc[i, ['cosine_' + h for h in HEADER[24:]]] = list(cwts)
    return train_final, file_names[2]

time: 18.4 ms


In [8]:
file_names_list = [('data/train_2015_05_28.csv', 'data/added_product_2015_05_28.csv', 1),
                   ('data/train_2015_06_28.csv', 'data/added_product_2015_06_28.csv', 2),
                   ('data/train_2015_07_28.csv', 'data/added_product_2015_07_28.csv', 3),
                   ('data/train_2015_08_28.csv', 'data/added_product_2015_08_28.csv', 4),
                   ('data/train_2015_09_28.csv', 'data/added_product_2015_09_28.csv', 5),
                   ('data/train_2015_10_28.csv', 'data/added_product_2015_10_28.csv', 6),
                   ('data/train_2015_11_28.csv', 'data/added_product_2015_11_28.csv', 7),
                   ('data/train_2015_12_28.csv', 'data/added_product_2015_12_28.csv', 8),
                   ('data/train_2016_01_28.csv', 'data/added_product_2016_01_28.csv', 9),
                   ('data/train_2016_02_28.csv', 'data/added_product_2016_02_28.csv', 10),
                   ('data/train_2016_03_28.csv', 'data/added_product_2016_03_28.csv', 11),
                   ('data/train_2016_04_28.csv', 'data/added_product_2016_04_28.csv', 12)
                  ]

test_file_data = ('data/train_2016_05_28.csv', 'data/test_ver2.csv', 13)

time: 5.86 ms


In [9]:
p = Pool(5)
results = p.map(get_similarity_features, file_names_list)
p.close()
p.join()

data/added_product_2015_07_28.csv
data/added_product_2015_09_28.csv
data/added_product_2015_05_28.csv
data/added_product_2015_06_28.csv
data/added_product_2015_08_28.csv


100%|██████████| 29490/29490 [01:45<00:00, 278.22it/s]


data/added_product_2015_10_28.csv


100%|██████████| 33362/33362 [02:04<00:00, 268.59it/s]
 85%|████████▌ | 33994/39803 [02:03<00:21, 265.03it/s]

data/added_product_2015_11_28.csv


100%|██████████| 35696/35696 [02:09<00:00, 274.61it/s]


data/added_product_2015_12_28.csv


100%|██████████| 39803/39803 [02:22<00:00, 278.71it/s]
 25%|██▌       | 8969/35215 [00:33<01:38, 267.31it/s]

data/added_product_2016_01_28.csv


100%|██████████| 41745/41745 [02:34<00:00, 271.02it/s]
 16%|█▋        | 6983/42353 [00:26<02:14, 262.76it/s]

data/added_product_2016_02_28.csv


100%|██████████| 35215/35215 [02:13<00:00, 264.72it/s]
 66%|██████▋   | 23380/35203 [01:25<00:43, 269.77it/s]

data/added_product_2016_03_28.csv


100%|██████████| 30956/30956 [01:52<00:00, 276.05it/s]
 69%|██████▊   | 24145/35203 [01:28<00:41, 265.75it/s]

data/added_product_2016_04_28.csv


100%|██████████| 35203/35203 [02:07<00:00, 275.08it/s]
100%|██████████| 42353/42353 [02:41<00:00, 261.47it/s]
100%|██████████| 49024/49024 [03:01<00:00, 269.46it/s]
100%|██████████| 32962/32962 [01:58<00:00, 277.61it/s]
100%|██████████| 35843/35843 [02:11<00:00, 273.07it/s]


time: 6min 24s


In [10]:
[x[1] for x in results]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

time: 3.21 ms


In [11]:
jacob_train = pd.concat(tuple([x[0] for x in results]))

time: 78.7 ms


In [12]:
jacob_train.shape

(441652, 48)

time: 2.17 ms


In [14]:
pickle.dump(jacob_train, open('data/15.Jacobian_similarity_features.train_data', 'wb'))
make_submission('data/15.Jacobian_similarity_features.train_data', upload_to_s3=True)

Compressing file
Compression done
Uploading file to s3 data/15.Jacobian_similarity_features.train_data.zip
uploading to s3 done
time: 28.7 s


In [15]:
jacob_test, ind = get_similarity_features(test_file_data)

data/test_ver2.csv


100%|██████████| 929615/929615 [1:01:53<00:00, 250.32it/s]

time: 1h 1min 59s





In [16]:
jacob_test.shape

(929615, 48)

time: 2.53 ms


In [18]:
pickle.dump(jacob_test, open('data/15.Jacobian_similarity_features.test_data', 'wb'))
make_submission('data/15.Jacobian_similarity_features.test_data', upload_to_s3=True)

Compressing file
Compression done
Uploading file to s3 data/15.Jacobian_similarity_features.test_data.zip
uploading to s3 done
time: 4min 17s


In [20]:
desc, test_ncodpers, preds, label_encoder = pickle.load(open('data/11.xgb_trained_on_all_months_only_added_users_product_features_generated_from_last_4_months.csv.pickle', 'rb'))

time: 9.52 s


In [21]:
len(label_encoder.classes_)

24

time: 2.72 ms


In [24]:
jacobs = pd.DataFrame()
jacobs['ncodpers'] = jacob_test.index
jacobs.set_index('ncodpers', inplace=True)

time: 71.4 ms


In [25]:
for col in HEADER[24:]:
    jacobs[col] = jacob_test['jacob_'+col]

time: 85.1 ms


In [27]:
preds.shape

(929615, 24)

time: 2.57 ms


In [30]:
jacobs.columns = label_encoder.transform(jacobs.columns)

time: 1.64 ms


In [31]:
jacobs.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,17,18,19,21,22,23,16,15,20
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15889,0.000884,0.000769,0.0,0.006847,0.415941,5e-06,0.028505,0.275887,0.0,0.005102,...,0.094846,0.148436,0.012952,0.40085,0.0,0.0,0.043409,0.414067,0.434003,0.497496
1170544,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.006558,0.001393,0.037283,0.02669,0.026649,0.003207,0.011827,0.012634,0.10595
1170545,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.006558,0.001393,0.037283,0.02669,0.026649,0.003207,0.011827,0.012634,0.10595
1170547,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.006558,0.001393,0.037283,0.02669,0.026649,0.003207,0.011827,0.012634,0.10595
1170548,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.006558,0.001393,0.037283,0.02669,0.026649,0.003207,0.011827,0.012634,0.10595


time: 41 ms


In [32]:
jacobs_1 = jacobs[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23]]

time: 168 ms


In [33]:
jacobs_1.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15889,0.000884,0.000769,0.0,0.006847,0.415941,5e-06,0.028505,0.275887,0.0,0.005102,...,0.094846,0.434003,0.414067,0.148436,0.012952,0.40085,0.497496,0.0,0.0,0.043409
1170544,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.012634,0.011827,0.006558,0.001393,0.037283,0.10595,0.02669,0.026649,0.003207
1170545,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.012634,0.011827,0.006558,0.001393,0.037283,0.10595,0.02669,0.026649,0.003207
1170547,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.012634,0.011827,0.006558,0.001393,0.037283,0.10595,0.02669,0.026649,0.003207
1170548,8.2e-05,1.4e-05,0.0,0.000409,0.011386,5e-06,0.00771,0.110624,0.031354,0.000463,...,0.002969,0.012634,0.011827,0.006558,0.001393,0.037283,0.10595,0.02669,0.026649,0.003207


time: 42.7 ms


In [34]:
mat = jacobs_1.as_matrix()

time: 1.28 ms


In [35]:
mat.shape

(929615, 24)

time: 2.92 ms


In [36]:
mat

array([[  8.83835818e-04,   7.68736655e-04,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   4.34094628e-02],
       [  8.19019776e-05,   1.42444309e-05,   0.00000000e+00, ...,
          2.66898708e-02,   2.66487295e-02,   3.20735506e-03],
       [  8.19019776e-05,   1.42444309e-05,   0.00000000e+00, ...,
          2.66898708e-02,   2.66487295e-02,   3.20735506e-03],
       ..., 
       [  4.29703870e-04,   5.40054670e-05,   0.00000000e+00, ...,
          8.61278040e-02,   8.35131729e-02,   1.66351659e-02],
       [  8.19019776e-05,   1.42444309e-05,   0.00000000e+00, ...,
          2.66898708e-02,   2.66487295e-02,   3.20735506e-03],
       [  3.47801892e-04,   3.97610362e-05,   1.10623988e-01, ...,
          5.94379332e-02,   5.68644435e-02,   1.34278108e-02]])

time: 4.2 ms


In [38]:
preds.shape

(929615, 24)

time: 2.51 ms


In [39]:
new_preds = np.multiply(mat, preds)

time: 159 ms


In [65]:
top_t_products = label_encoder.inverse_transform(np.argsort(new_preds, axis=1)
                                                 [:, ::-1][:, :])

time: 1.3 s


In [60]:
submission = pd.DataFrame()
submission['ncodpers'] = test.index
submission['added_products'] = ['ind_recibo_ult1']*submission.shape[0]
submission.set_index('ncodpers', inplace=True)

time: 57 ms


In [42]:
test = pd.read_csv('data/test_ver2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


time: 3 s


In [43]:
test.fillna(0, inplace=True)

time: 620 ms


In [44]:
test.set_index('ncodpers', inplace=True)

time: 22.2 ms


In [54]:
test_copy = test.loc[test_ncodpers, :]

time: 1.77 s


In [57]:
test_old = pd.read_csv('data/train_2016_05_28.csv', header=None, names=HEADER)

time: 3.12 s


  interactivity=interactivity, compiler=compiler, result=result)


In [58]:
test_old.fillna(0, inplace=True)
test_old.set_index('ncodpers', inplace=True)

time: 585 ms


In [55]:
test_copy.head()

Unnamed: 0_level_0,fecha_dato,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,...,indext,conyuemp,canal_entrada,indfall,tipodom,cod_prov,nomprov,ind_actividad_cliente,renta,segmento
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
869505,2016-06-28,N,ES,H,26,2010-04-13,0,75,1,0,...,N,0,KAT,N,1,4.0,ALMERIA,1,112771.23,02 - PARTICULARES
1477437,2016-06-28,N,ES,H,22,2015-10-13,0,8,1,0,...,N,0,KHQ,N,1,28.0,MADRID,1,,03 - UNIVERSITARIO
1533309,2016-06-28,N,ES,H,52,2016-02-19,1,4,1,0,...,N,0,KHN,N,1,12.0,CASTELLON,1,,02 - PARTICULARES
1484304,2016-06-28,N,ES,H,25,2015-10-20,0,8,1,0,...,N,0,KHQ,N,1,28.0,MADRID,0,,03 - UNIVERSITARIO
1484306,2016-06-28,N,ES,H,22,2015-10-20,0,8,1,0,...,N,0,KHQ,N,1,28.0,MADRID,0,,03 - UNIVERSITARIO


time: 51.6 ms


In [66]:
test_copy['xgb_preds'] = [' '.join(x) for x in top_t_products]
test_copy['added_products'] = ['ind_recibo_ult1']*test_copy.shape[0]
for i in tqdm(test_copy.index):
    zipped = zip(HEADER[24:],
                 test_old.loc[i, 'ind_ahor_fin_ult1':'ind_recibo_ult1'])
    products = [x[0] for x in
                [y for y in zipped if y[1] == 1]]
    pred_products = test_copy.loc[i, 'xgb_preds'].split()
    prod_string = ' '.join(filter(lambda x: x not in products, pred_products))
    test_copy.set_value(i, 'added_products', prod_string)

100%|██████████| 929615/929615 [08:45<00:00, 1769.25it/s]

time: 8min 48s





In [61]:
submission.head()

Unnamed: 0_level_0,added_products
ncodpers,Unnamed: 1_level_1
15889,ind_recibo_ult1
1170544,ind_recibo_ult1
1170545,ind_recibo_ult1
1170547,ind_recibo_ult1
1170548,ind_recibo_ult1


time: 5.63 ms


In [67]:
submission.added_products = test_copy.loc[submission.index, 'added_products']

time: 289 ms


In [63]:
submission.head()

Unnamed: 0_level_0,added_products
ncodpers,Unnamed: 1_level_1
15889,ind_recibo_ult1 ind_reca_fin_ult1 ind_dela_fin...
1170544,ind_recibo_ult1 ind_reca_fin_ult1 ind_cno_fin_...
1170545,ind_recibo_ult1 ind_nomina_ult1 ind_nom_pens_u...
1170547,ind_recibo_ult1 ind_reca_fin_ult1 ind_cno_fin_...
1170548,ind_recibo_ult1 ind_reca_fin_ult1 ind_cno_fin_...


time: 5.75 ms


In [68]:
filename = 'data/15.11_submission_weighted_with_jacobian_wts.csv'
description = '15. 11. submission weighted with jacobian weights'
submission.to_csv(filename, columns=['added_products'])
make_submission(filename, description=description, submit=True, compress=True, upload_to_s3=True)

Compressing file
Compression done
Uploading file to s3 data/15.11_submission_weighted_with_jacobian_wts.csv.zip
uploading to s3 done
Uploading submission data/15.11_submission_weighted_with_jacobian_wts.csv.zip
Upload done
time: 1min 25s
