In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from scripts.kaggle.helpers import make_submission
import datetime
from tqdm import tqdm
import gc
import itertools
from multiprocessing import Pool
import pickle
import collections
import math

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

%pylab inline
pylab.rcParams['figure.figsize'] = (12, 16)

%load_ext autotime

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Column names applied to the headerless train CSV files read below.
# Positions 0-23 are the non-product columns; positions 24-47
# (``HEADER[24:]``) are the 24 product indicator columns that the rest of
# the notebook iterates over.
HEADER = ["fecha_dato", "ncodpers", "ind_empleado",
          "pais_residencia", "sexo", "age", "fecha_alta",
          "ind_nuevo", "antiguedad", "indrel", "ult_fec_cli_1t",
          "indrel_1mes", "tiprel_1mes", "indresi", "indext",
          "conyuemp", "canal_entrada", "indfall", "tipodom",
          "cod_prov", "nomprov", "ind_actividad_cliente",
          "renta", "segmento", "ind_ahor_fin_ult1",
          "ind_aval_fin_ult1", "ind_cco_fin_ult1",
          "ind_cder_fin_ult1", "ind_cno_fin_ult1",
          "ind_ctju_fin_ult1", "ind_ctma_fin_ult1",
          "ind_ctop_fin_ult1", "ind_ctpp_fin_ult1",
          "ind_deco_fin_ult1", "ind_deme_fin_ult1",
          "ind_dela_fin_ult1", "ind_ecue_fin_ult1",
          "ind_fond_fin_ult1", "ind_hip_fin_ult1",
          "ind_plan_fin_ult1", "ind_pres_fin_ult1",
          "ind_reca_fin_ult1", "ind_tjcr_fin_ult1",
          "ind_valo_fin_ult1", "ind_viv_fin_ult1",
          "ind_nomina_ult1", "ind_nom_pens_ult1",
          "ind_recibo_ult1"]

time: 7.33 ms


In [5]:
def get_persons_of_feature(train):
    """Map every product column to the set of persons that hold it.

    For each column name in ``HEADER[24:]`` the ``ncodpers`` values of the
    rows of ``train`` where that column equals 1 are gathered into a set.

    Returns a dict: product column name -> set of ``ncodpers`` values.
    """
    return {
        product: set(train.loc[train[product] == 1]["ncodpers"])
        for product in HEADER[24:]
    }
# for feat in header[24:]:
#     print feat, len(persons_of_feature[feat])

time: 2.74 ms


In [6]:
def jacobian_similarity(persons_of_feature, featur1, featur2):
    """Jaccard similarity between the person sets of two features.

    The function keeps its historical name; the value computed is the size
    of the intersection divided by the size of the union of the two sets.

    Parameters:
        persons_of_feature: mapping of feature name -> set of person ids.
        featur1, featur2: keys of ``persons_of_feature`` to compare.

    Returns:
        A float in [0.0, 1.0]; 0.0 when either set is empty.

    Raises:
        KeyError: if either feature name is missing from the mapping.
    """
    feat1 = persons_of_feature[featur1]
    feat2 = persons_of_feature[featur2]
    len1 = len(feat1)
    len2 = len(feat2)
    # Guard both sides (the second operand used to re-test len1), which
    # also covers the only case where the union below would be zero.
    if len1 == 0 or len2 == 0:
        return 0.0
    intersection = len(feat1.intersection(feat2))
    union = len1 + len2 - intersection
    # "* 1.0" forces float division on Python 2 as well.
    return intersection * 1.0 / union

def cosine_similarity(persons_of_feature, featur1, featur2):
    """Cosine similarity between the person sets of two features.

    Computed as the size of the intersection divided by the square root of
    the product of the two set sizes. Returns 0.0 when either set is empty.
    """
    members_a = persons_of_feature[featur1]
    members_b = persons_of_feature[featur2]
    size_a, size_b = len(members_a), len(members_b)
    overlap = len(members_a.intersection(members_b))
    if not (size_a and size_b):
        return 0.0
    return overlap * 1.0 / math.sqrt(size_a * size_b)

time: 10.5 ms


In [7]:
def get_similarity_matrices(train_df):
    """Build the 24x24 product-to-product similarity matrices.

    Person sets come from ``get_persons_of_feature(train_df)``. Matrix
    position ``k`` corresponds to ``HEADER[24 + k]``. Both matrices are
    symmetric with 1.0 on the diagonal.

    Returns ``(jacob_pd, cosine_pd)`` as DataFrames.
    """
    holders = get_persons_of_feature(train_df)
    jaccard = collections.defaultdict(dict)
    cosine = collections.defaultdict(dict)
    for a in range(24):
        name_a = HEADER[24 + a]
        jaccard[a][a] = 1.0
        cosine[a][a] = 1.0
        # Only the upper triangle is computed; each score is mirrored.
        for b in range(a + 1, 24):
            name_b = HEADER[24 + b]
            j_score = jacobian_similarity(holders, name_a, name_b)
            jaccard[a][b] = jaccard[b][a] = j_score

            c_score = cosine_similarity(holders, name_a, name_b)
            cosine[a][b] = cosine[b][a] = c_score
    cosine_pd = pd.DataFrame(cosine)
    jacob_pd = pd.DataFrame(jaccard)
    return jacob_pd, cosine_pd


time: 10.2 ms


In [16]:
def get_similarity_matrices_from_added(added_df):
    """Build the 24x24 similarity matrices from an "added products" frame.

    Same construction as ``get_similarity_matrices`` except that the person
    sets come from ``get_persons_of_feature_from_added(added_df)``.
    Matrix position ``k`` corresponds to ``HEADER[24 + k]``.

    Returns ``(jacob_pd, cosine_pd)`` as DataFrames.
    """
    adopters = get_persons_of_feature_from_added(added_df)
    jaccard_scores = collections.defaultdict(dict)
    cosine_scores = collections.defaultdict(dict)
    for first in range(24):
        first_name = HEADER[24 + first]
        jaccard_scores[first][first] = 1.0
        cosine_scores[first][first] = 1.0
        # Fill the upper triangle and mirror it into the lower one.
        for second in range(first + 1, 24):
            second_name = HEADER[24 + second]
            jaccard_value = jacobian_similarity(adopters, first_name, second_name)
            jaccard_scores[first][second] = jaccard_scores[second][first] = jaccard_value

            cosine_value = cosine_similarity(adopters, first_name, second_name)
            cosine_scores[first][second] = cosine_scores[second][first] = cosine_value
    cosine_pd = pd.DataFrame(cosine_scores)
    jacob_pd = pd.DataFrame(jaccard_scores)
    return jacob_pd, cosine_pd

def get_persons_of_feature_from_added(added):
    """Map every product column to the set of persons listed as adding it.

    ``added`` needs an ``added_product`` column (compared against the names
    in ``HEADER[24:]``) and an ``ncodpers`` column.

    Returns a dict: product column name -> set of ``ncodpers`` values.
    """
    added_product = added['added_product']
    return {
        product: set(added.loc[added_product == product]["ncodpers"])
        for product in HEADER[24:]
    }
# for feat in header[24:]:
#     print feat, len(persons_of_feature[feat])

time: 22.6 ms


In [9]:
def get_wts_from_mat(df, row):
    """Sum the similarity rows selected by ``row`` and blank the selected slots.

    Parameters:
        df: square similarity DataFrame (rows and columns in the same order).
        row: iterable of flags, one per row of ``df``; truthy means selected.

    Returns:
        A Series indexed like ``df.columns`` holding, for every column, the
        sum of the selected rows, with the selected positions set to 0.
    """
    # Materialise the mask: it is used twice, and on Python 3 ``map`` would
    # return a one-shot iterator that is exhausted after the first use.
    selected = [bool(flag) for flag in row]
    wts = df.loc[selected, :].sum(axis=0)
    wts[selected] = 0
    return wts

time: 2.5 ms


In [14]:
def get_similarity_features(file_names):
    """Compute per-person similarity features from a monthly train snapshot.

    Parameters:
        file_names: a sequence with
            [0] path of the headerless train CSV (column 1 and columns
                24-47 are read and named from ``HEADER``),
            [1] path of the added-products CSV; only its first column is
                read and it is accessed as ``ncodpers``,
            [2] a value handed back unchanged as the second return item.

    Returns:
        ``(train_final, file_names[2])`` where ``train_final`` is indexed by
        ``ncodpers`` and has ``jacob_<product>`` / ``cosine_<product>``
        columns for every product in ``HEADER[24:]``.
    """
    train_file = file_names[0]
    added_file = file_names[1]
    products = HEADER[24:]
    # list(range(...)) keeps the concatenation valid on Python 3 as well.
    train = pd.read_csv(train_file, header=None, names=HEADER[1:2] + products,
                        usecols=[1] + list(range(24, 48)))
    train.fillna(0, inplace=True)
    for col in products:
        train[col] = train[col].astype(int)
    added_products = pd.read_csv(added_file, usecols=[0])
    jacob_pd, cosine_pd = get_similarity_matrices(train)
    train.set_index('ncodpers', inplace=True)
    train_final = pd.DataFrame()
    train_final['ncodpers'] = added_products['ncodpers']
    train_final.set_index('ncodpers', inplace=True)

    # Built once instead of on every loop iteration.
    jacob_cols = ['jacob_' + h for h in products]
    cosine_cols = ['cosine_' + h for h in products]
    for jacob_col, cosine_col in zip(jacob_cols, cosine_cols):
        train_final[jacob_col] = [0.0]*train_final.shape[0]
        train_final[cosine_col] = [0.0]*train_final.shape[0]

    for i in tqdm(train_final.index):
        # Product flags currently held by person i.
        row = list(train.loc[i, products])
        jwts = get_wts_from_mat(jacob_pd, row)
        cwts = get_wts_from_mat(cosine_pd, row)
        train_final.loc[i, jacob_cols] = list(jwts)
        train_final.loc[i, cosine_cols] = list(cwts)
    return train_final, file_names[2]

time: 17.9 ms


In [8]:
# get_similarity_features takes ONE sequence (train_file, added_file, tag)
# and returns (features, tag). The tag (0 here) is an arbitrary placeholder
# that is simply handed back.
simi_feat, _ = get_similarity_features(('data/train_2015_05_28.csv', 'data/added_product_2015_05_28.csv', 0))

100%|██████████| 41745/41745 [04:41<00:00, 148.17it/s]

time: 4min 45s





In [13]:
def get_similarity_features_from_added(file_names):
    """Compute per-person features from "added product" similarity matrices.

    Parameters:
        file_names: a sequence with
            [0] path of the headerless train CSV (column 1 and columns
                24-47 are read and named from ``HEADER``),
            [1] path of the CSV used to build the similarity matrices via
                ``get_similarity_matrices_from_added``,
            [2] path of the added-products CSV; only its first column is
                read and it is accessed as ``ncodpers``,
            [3] a value handed back unchanged as the second return item.

    Returns:
        ``(train_final, file_names[3], jacob_pd, cosine_pd)`` where
        ``train_final`` is indexed by ``ncodpers`` and has
        ``jacob_added_<product>`` / ``cosine_added_<product>`` columns for
        every product in ``HEADER[24:]``.
    """
    train_file = file_names[0]
    similarity_file = file_names[1]
    added_file = file_names[2]
    products = HEADER[24:]
    # list(range(...)) keeps the concatenation valid on Python 3 as well.
    train = pd.read_csv(train_file, header=None, names=HEADER[1:2] + products,
                        usecols=[1] + list(range(24, 48)))
    train.fillna(0, inplace=True)
    for col in products:
        train[col] = train[col].astype(int)
    added_products = pd.read_csv(added_file, usecols=[0])
    similarity_df = pd.read_csv(similarity_file)
    jacob_pd, cosine_pd = get_similarity_matrices_from_added(similarity_df)
    train.set_index('ncodpers', inplace=True)
    train_final = pd.DataFrame()
    train_final['ncodpers'] = added_products['ncodpers']
    train_final.set_index('ncodpers', inplace=True)

    # Built once instead of on every loop iteration.
    jacob_cols = ['jacob_added_' + h for h in products]
    cosine_cols = ['cosine_added_' + h for h in products]
    for jacob_col, cosine_col in zip(jacob_cols, cosine_cols):
        train_final[jacob_col] = [0.0]*train_final.shape[0]
        train_final[cosine_col] = [0.0]*train_final.shape[0]

    for i in tqdm(train_final.index):
        # Product flags currently held by person i.
        row = list(train.loc[i, products])
        jwts = get_wts_from_mat(jacob_pd, row)
        cwts = get_wts_from_mat(cosine_pd, row)
        train_final.loc[i, jacob_cols] = list(jwts)
        train_final.loc[i, cosine_cols] = list(cwts)
    return train_final, file_names[3], jacob_pd, cosine_pd

time: 21.3 ms


In [17]:
# Argument order: train snapshot, file the similarity matrices are built
# from, file listing the persons to score, and a tag that is returned as-is.
tdf, ind, j_pd, c_pd = get_similarity_features_from_added((
    'data/train_2015_05_28.csv',
    'data/added_product_2015_04_28.csv',
    'data/added_product_2015_05_28.csv',
    1,
))

100%|██████████| 41745/41745 [05:25<00:00, 128.37it/s]

time: 5min 27s





In [18]:
# Preview the first rows of the "added"-based similarity features.
tdf.head()

Unnamed: 0_level_0,jacob_added_ind_ahor_fin_ult1,cosine_added_ind_ahor_fin_ult1,jacob_added_ind_aval_fin_ult1,cosine_added_ind_aval_fin_ult1,jacob_added_ind_cco_fin_ult1,cosine_added_ind_cco_fin_ult1,jacob_added_ind_cder_fin_ult1,cosine_added_ind_cder_fin_ult1,jacob_added_ind_cno_fin_ult1,cosine_added_ind_cno_fin_ult1,...,jacob_added_ind_valo_fin_ult1,cosine_added_ind_valo_fin_ult1,jacob_added_ind_viv_fin_ult1,cosine_added_ind_viv_fin_ult1,jacob_added_ind_nomina_ult1,cosine_added_ind_nomina_ult1,jacob_added_ind_nom_pens_ult1,cosine_added_ind_nom_pens_ult1,jacob_added_ind_recibo_ult1,cosine_added_ind_recibo_ult1
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15892,0.0,0.0,0.0,0.0,0.036062,0.082235,0.000141,0.00449,0.0,0.0,...,0.0,0.0,0.0,0.0,0.190078,0.357068,0.194755,0.364195,0.0,0.0
15897,0.0,0.0,0.0,0.0,0.0,0.0,0.000141,0.00449,0.0,0.0,...,0.0,0.0,0.0,0.0,0.193767,0.370162,0.198642,0.377699,0.0,0.0
15906,0.0,0.0,0.0,0.0,0.025829,0.064539,0.000141,0.00449,0.0,0.0,...,0.005036,0.019046,0.0,0.0,0.185333,0.347796,0.190196,0.355342,0.0,0.0
15925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009018,0.019958,...,0.00297,0.011834,0.0,0.0,0.013948,0.03021,0.014455,0.031224,0.023277,0.055703
15927,0.0,0.0,0.0,0.0,0.01158,0.032723,0.000141,0.00449,0.016002,0.047308,...,0.000824,0.005028,0.0,0.0,0.018451,0.044834,0.0186,0.0452,0.0,0.0


time: 61.8 ms


---
# Computing the "added in the last two months" data set
---

In [55]:
def get_file_string_from_train_data(train_data):
    """Return ``'YYYY_MM_28'`` for a ``(year, month, ...)`` sequence.

    The day is always fixed to 28.
    """
    snapshot_day = datetime.date(train_data[0], train_data[1], 28)
    return snapshot_day.isoformat().replace('-', '_')

def get_date_string_from_train_data(train_data):
    """Return ``'YYYY-MM-28'`` for a ``(year, month, ...)`` sequence.

    The day is always fixed to 28.
    """
    snapshot_day = datetime.date(train_data[0], train_data[1], 28)
    return snapshot_day.isoformat()

def subtract_one_month(train_data):
    """Return the ``(year, month)`` one calendar month before ``train_data``.

    Parameters:
        train_data: sequence whose first two items are ``(year, month)``.

    Returns:
        A ``(year, month)`` tuple. January rolls back to December of the
        previous year (this used to be hard-coded to 2015).
    """
    year, month = train_data[0], train_data[1]
    if month == 1:
        return year - 1, 12
    return year, month - 1

# Short alias used by the cells below.
som = subtract_one_month



def added_in_the_last_2_months(date):
    """
    Work-in-progress: build similarity features for products added over
    the last two months.

    date = (2015, 5)

    NOTE(review): as written this function cannot run on a fresh kernel;
    see the inline notes for the names it uses but never defines.
    """
    
    
    current_train = get_date_string_from_train_data(date)
    last_month = get_date_string_from_train_data(som(date))
    # NOTE(review): last_last_month is computed but never used below.
    last_last_month = get_date_string_from_train_data(som(som(date)))
    

    # NOTE(review): these paths end in a dash-separated date with no
    # extension (e.g. 'data/added_product_2015-05-28'), while other cells
    # read 'data/added_product_2015_05_28.csv' -- confirm which is intended
    # (get_file_string_from_train_data yields the underscore form).
    added = pd.read_csv('data/added_product_' + current_train)
    
    similarity_df = pd.read_csv('data/added_product_' + last_month)
    jacob_pd, cosine_pd = get_similarity_matrices_from_added(similarity_df)
    
    train_final = pd.DataFrame()
    # NOTE(review): `added_products` is not defined in this function (the
    # frame read above is bound to `added`, which is otherwise unused).
    train_final['ncodpers'] = added_products['ncodpers']
    train_final.set_index('ncodpers', inplace=True)
    for col in HEADER[24:]:
        train_final['jacob_added_2_old_' + col] = [0.0]*train_final.shape[0]
        train_final['cosine_added_2_old_' + col] = [0.0]*train_final.shape[0]

    for i in tqdm(train_final.index):
        # NOTE(review): `added_df` is not defined in this function.
        row = get_added_row(date, i, added_df)
        jwts = get_wts_from_mat(jacob_pd, row)
        cwts = get_wts_from_mat(cosine_pd, row)
        # NOTE(review): the columns created above use the prefixes
        # 'jacob_added_2_old_' / 'cosine_added_2_old_', but the writes
        # below target 'jacob_added_' / 'cosine_added_'.
        train_final.loc[i, ['jacob_added_' + h for h in HEADER[24:]]] = list(jwts)
        train_final.loc[i, ['cosine_added_' + h for h in HEADER[24:]]] = list(cwts)
    # NOTE(review): `file_names` is not defined in this function.
    return train_final, file_names[3], jacob_pd, cosine_pd
        
        
def get_added_row(date, ncod, added_df):
    """Select one person's product columns for the month given by ``date``.

    Parameters:
        date: ``(year, month)`` sequence, converted to ``'YYYY-MM-28'``.
        ncod: value matched against the ``ncodpers`` column.
        added_df: DataFrame with ``fecha_dato``, ``ncodpers`` and the
            product columns ``HEADER[24:]``.

    Returns:
        The matching slice of ``added_df`` restricted to ``HEADER[24:]``.
        This is a DataFrame (zero or more rows), not a flat list of flags.

    NOTE(review): ``get_wts_from_mat`` iterates its ``row`` argument
    element by element, so a caller probably needs the values of a single
    row rather than this DataFrame -- confirm intended usage.
    """
    current_month = get_date_string_from_train_data(date)
    # An earlier draft derived this row from ``main_train`` by comparing the
    # current month with two months earlier (the same bit test that
    # ``is_added`` performs); it was disabled in favour of this lookup.
    is_current = added_df['fecha_dato'] == current_month
    is_person = added_df['ncodpers'] == ncod
    added_row = added_df.loc[is_current & is_person, HEADER[24:]]
    return added_row

    
    

time: 83.5 ms


In [59]:
# Load only fecha_dato, ncodpers (columns 0-1) and the 24 product columns
# (24-47). list(range(...)) keeps the concatenation valid on Python 3 too.
main_train = pd.read_csv('data/train_ver2.csv', usecols=list(range(2)) + list(range(24, 48)))

time: 58.7 s


In [68]:
# Replace missing values with 0 across the whole frame (in place), then
# cast every product indicator column (HEADER[24:]) to int.
main_train.fillna(0, inplace=True)
for col in HEADER[24:]:
    main_train[col] = main_train[col].astype(int)

time: 5.8 s


In [56]:
# Read only the first three columns of the headerless snapshot, named from
# HEADER[:3] (fecha_dato, ncodpers, ind_empleado). This binds the global
# name `train`.
train = pd.read_csv('data/train_2015_05_28.csv', header=None, names=HEADER[:3], usecols=range(3))

time: 1.02 s


In [118]:
def f(x):
    """Flag, per month, the products one person newly holds vs. two months before.

    ``x`` is a DataFrame with a ``fecha_dato`` column (``'YYYY-MM-DD'``
    strings) and the product columns ``HEADER[24:]``; the notebook applies
    this function to each ``ncodpers`` group. For every date from the
    third row onwards, the flags on that date are compared with the flags
    on the 28th two calendar months earlier using ``is_added``.

    Returns:
        A new DataFrame with a ``fecha_dato`` column and one column per
        product in ``HEADER[24:]``.

    NOTE(review): this presumably expects exactly one row per month, sorted
    by date, with no gaps -- the ``.loc`` lookups below assume the earlier
    date is present. Confirm against the input file.
    """
    # Work on a re-indexed copy: mutating the object handed in by
    # groupby.apply (the previous inplace set_index) is not supported by
    # pandas and makes the function fail if called twice on the same group.
    by_date = x.set_index('fecha_dato')
    temp = pd.DataFrame()
    # The first two dates have no "two months earlier" row to compare with.
    temp['fecha_dato'] = by_date.index[2:]
    for col in HEADER[24:]:
        temp[col] = [0]*temp.shape[0]
    for i in temp.index:
        cur_date = temp.loc[i, 'fecha_dato']
        year_month = tuple(map(int, cur_date.split('-')[:2]))
        old_date = get_date_string_from_train_data(som(som(year_month)))
        cur = list(by_date.loc[cur_date, HEADER[24:]])
        old = list(by_date.loc[old_date, HEADER[24:]])
        temp.loc[i, HEADER[24:]] = is_added(old, cur)
    return temp

def is_added(old, cur):
    """Return per-position flags: 1 where ``cur`` is set and ``old`` is not.

    Both inputs are converted element-wise with ``int``; the result has as
    many items as the shorter input.
    """
    before = [int(flag) for flag in old]
    after = [int(flag) for flag in cur]
    # (b ^ a) marks positions that changed; "& a" keeps only those now set.
    return [a & (b ^ a) for b, a in zip(before, after)]

time: 10.2 ms


In [None]:
# Load fecha_dato, ncodpers and the 24 product columns of the sorted sample.
# list(range(...)) keeps the concatenation valid on Python 3 as well.
sorted_tr = pd.read_csv('data/sample_sorted_train.csv', usecols=list(range(2)) + list(range(24, 48)))
sorted_tr.fillna(0, inplace=True)
for col in HEADER[24:]:
    sorted_tr[col] = sorted_tr[col].astype(int)

# Compute the "newly added" flags per person.
g = sorted_tr.groupby('ncodpers')
x = g.apply(f)

# Turn the (ncodpers, row position) index back into columns and drop the
# positional level, which reset_index names 'level_1'.
x.reset_index(inplace=True)

del x['level_1']