In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from scripts.kaggle.helpers import make_submission
import datetime
from tqdm import tqdm
import gc
import itertools
from multiprocessing import Pool
import pickle
import collections
import math

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

%pylab inline
pylab.rcParams['figure.figsize'] = (12, 16)

%load_ext autotime

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Column names used for the header-less training CSV read in
# get_similarity_features (which passes header=None and picks names from here).
# Index 1 ("ncodpers") is the column used as the row key; indices 24..47 are
# the 24 "ind_*_ult1" flag columns that the similarity code slices out with
# HEADER[24:].
HEADER = ["fecha_dato", "ncodpers", "ind_empleado",
          "pais_residencia", "sexo", "age", "fecha_alta",
          "ind_nuevo", "antiguedad", "indrel", "ult_fec_cli_1t",
          "indrel_1mes", "tiprel_1mes", "indresi", "indext",
          "conyuemp", "canal_entrada", "indfall", "tipodom",
          "cod_prov", "nomprov", "ind_actividad_cliente",
          "renta", "segmento", "ind_ahor_fin_ult1",
          "ind_aval_fin_ult1", "ind_cco_fin_ult1",
          "ind_cder_fin_ult1", "ind_cno_fin_ult1",
          "ind_ctju_fin_ult1", "ind_ctma_fin_ult1",
          "ind_ctop_fin_ult1", "ind_ctpp_fin_ult1",
          "ind_deco_fin_ult1", "ind_deme_fin_ult1",
          "ind_dela_fin_ult1", "ind_ecue_fin_ult1",
          "ind_fond_fin_ult1", "ind_hip_fin_ult1",
          "ind_plan_fin_ult1", "ind_pres_fin_ult1",
          "ind_reca_fin_ult1", "ind_tjcr_fin_ult1",
          "ind_valo_fin_ult1", "ind_viv_fin_ult1",
          "ind_nomina_ult1", "ind_nom_pens_ult1",
          "ind_recibo_ult1"]

time: 10.2 ms


In [3]:
def get_persons_of_feature(train):
    """Map every flag column in HEADER[24:] to the set of its holders.

    For each such column, collect the "ncodpers" values of the rows of
    ``train`` where that column equals 1.

    Returns a dict: column name -> set of "ncodpers" values.
    """
    return {
        product: set(train.loc[train[product] == 1, "ncodpers"])
        for product in HEADER[24:]
    }
# for feat in header[24:]:
#     print feat, len(persons_of_feature[feat])

time: 2.87 ms


In [4]:
def jacobian_similarity(persons_of_feature, featur1, featur2):
    """Set-overlap similarity of two features: |A & B| / |A | B|.

    (This ratio is usually called the Jaccard index; the function name is
    kept as-is because other code calls it.)

    Parameters
    ----------
    persons_of_feature : dict
        Maps a feature name to the set of persons having that feature.
    featur1, featur2 : hashable
        Keys of ``persons_of_feature`` to compare.

    Returns
    -------
    float
        Value in [0.0, 1.0]; 0.0 when either set is empty.

    Raises
    ------
    KeyError
        If either key is missing from ``persons_of_feature``.
    """
    feat1 = persons_of_feature[featur1]
    feat2 = persons_of_feature[featur2]
    len1 = len(feat1)
    len2 = len(feat2)
    # The original guard tested ``len1`` twice and never looked at ``len2``;
    # check both sides, exactly like cosine_similarity does.
    if len1 == 0 or len2 == 0:
        return 0.0
    intersection = len(feat1.intersection(feat2))
    # |A | B| by inclusion-exclusion, avoids building the union set.
    union = len1 + len2 - intersection
    return float(intersection) / union

def cosine_similarity(persons_of_feature, featur1, featur2):
    """Cosine-style overlap of two features: |A & B| / sqrt(|A| * |B|).

    ``persons_of_feature`` maps a feature name to a set of persons;
    ``featur1`` and ``featur2`` are the two keys to compare.
    Returns 0.0 if either set is empty, otherwise a float.
    """
    holders_a = persons_of_feature[featur1]
    holders_b = persons_of_feature[featur2]
    size_a, size_b = len(holders_a), len(holders_b)
    shared = len(holders_a.intersection(holders_b))
    if not (size_a != 0 and size_b != 0):
        return 0.0
    denominator = math.sqrt(size_a * size_b)
    return shared * 1.0 / denominator

time: 12.4 ms


In [5]:
def get_similarity_matrices(train_df):
    """Build the two pairwise similarity tables for the 24 flag columns.

    Positions 0..23 stand for HEADER[24]..HEADER[47]. The diagonal is set
    to 1.0 and every off-diagonal pair is computed once and mirrored.

    Returns ``(jacob_pd, cosine_pd)`` as DataFrames, in that order.
    """
    holders = get_persons_of_feature(train_df)
    jacob_matrix = collections.defaultdict(dict)
    cosine_matrix = collections.defaultdict(dict)

    for first in range(24):
        # A feature compared with itself.
        jacob_matrix[first][first] = 1.0
        cosine_matrix[first][first] = 1.0

        for second in range(first + 1, 24):
            name_a = HEADER[first + 24]
            name_b = HEADER[second + 24]

            jacob_value = jacobian_similarity(holders, name_a, name_b)
            jacob_matrix[first][second] = jacob_matrix[second][first] = jacob_value

            cosine_value = cosine_similarity(holders, name_a, name_b)
            cosine_matrix[first][second] = cosine_matrix[second][first] = cosine_value

    return pd.DataFrame(jacob_matrix), pd.DataFrame(cosine_matrix)


time: 14.8 ms


In [6]:
def get_wts_from_mat(df, row):
    """Score every column by summing the matrix rows selected by ``row``.

    Parameters
    ----------
    df : pandas.DataFrame
        Similarity matrix. It is expected to be square, with row k and
        column k describing the same item, because the same mask is applied
        to the rows of ``df`` and to the resulting per-column totals.
    row : iterable
        One value per row of ``df``; truthy entries select that row.

    Returns
    -------
    pandas.Series
        Indexed by ``df.columns``: the column-wise sum over the selected
        rows, with the selected positions themselves forced to 0.
    """
    # Materialise the mask: it is used twice, and on Python 3 ``map`` would
    # hand back a one-shot iterator that is exhausted after the first use.
    brow = [bool(flag) for flag in row]
    # Explicit column-wise sum. ``.apply(sum)`` depended on whichever ``sum``
    # happened to be bound (the builtin, or numpy's after ``%pylab``).
    wts = df.loc[brow, :].sum(axis=0)
    wts[brow] = 0
    return wts

time: 1.92 ms


In [7]:
def get_similarity_features(file_names):
    """Compute per-customer similarity scores for every flag column.

    Parameters
    ----------
    file_names : sequence
        ``(train_file, added_file)`` or ``(train_file, added_file, tag)``.
        ``train_file`` is a header-less CSV whose columns follow HEADER;
        ``added_file`` is a CSV with a header whose first column must be
        named "ncodpers". ``tag`` is not interpreted here; it is returned
        unchanged.

    Returns
    -------
    (pandas.DataFrame, object)
        The frame is indexed by the "ncodpers" values of ``added_file`` (in
        file order) and has, for every name in HEADER[24:], a
        ``'jacob_' + name`` and a ``'cosine_' + name`` float column.
        The second item is ``file_names[2]`` when present, otherwise None.

    Raises
    ------
    KeyError
        If an id from ``added_file`` does not occur in ``train_file``.
    """
    train_file = file_names[0]
    added_file = file_names[1]
    # The third element used to be mandatory although it is only echoed back;
    # accept two-element input as well.
    tag = file_names[2] if len(file_names) > 2 else None

    products = HEADER[24:]
    # list(...) keeps the concatenation valid on Python 3, where range() is
    # not a list.
    train = pd.read_csv(train_file, header=None,
                        names=HEADER[1:2] + products,
                        usecols=[1] + list(range(24, 48)))
    train = train.fillna(0)
    for col in products:
        train[col] = train[col].astype(int)
    added_products = pd.read_csv(added_file, usecols=[0])

    jacob_pd, cosine_pd = get_similarity_matrices(train)
    # NOTE(review): assumes every ncodpers occurs once in train_file; with
    # repeated ids ``train.loc[i, products]`` yields a frame instead of one
    # row of flags -- confirm against the data.
    train = train.set_index('ncodpers')

    person_ids = added_products['ncodpers'].values
    jacob_cols = ['jacob_' + name for name in products]
    cosine_cols = ['cosine_' + name for name in products]

    # Collect plain lists and build the frame once instead of assigning into
    # a pre-filled frame row by row with .loc.
    records = []
    for i in tqdm(person_ids):
        row = list(train.loc[i, products])
        jwts = get_wts_from_mat(jacob_pd, row)
        cwts = get_wts_from_mat(cosine_pd, row)
        records.append(list(jwts) + list(cwts))

    train_final = pd.DataFrame(records,
                               index=pd.Index(person_ids, name='ncodpers'),
                               columns=jacob_cols + cosine_cols,
                               dtype=float)
    # Restore the original column layout: jacob_X, cosine_X, jacob_Y, ...
    interleaved = [col for pair in zip(jacob_cols, cosine_cols) for col in pair]
    train_final = train_final[interleaved]
    return train_final, tag

time: 18.2 ms


In [8]:
# get_similarity_features takes ONE sequence (train_file, added_file, tag) and
# returns (features, tag); calling it with two separate strings raises TypeError.
# NOTE(review): the tag is only passed through by the function -- '2015_05_28'
# is a placeholder, confirm the value that was intended here.
simi_feat, simi_tag = get_similarity_features(
    ('data/train_2015_05_28.csv',
     'data/added_product_2015_05_28.csv',
     '2015_05_28'))

100%|██████████| 41745/41745 [04:41<00:00, 148.17it/s]

time: 4min 45s



