In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from scripts.kaggle.helpers import make_submission
import datetime
from tqdm import tqdm
import gc
import itertools
from multiprocessing import Pool
import pickle
import collections
import math
import random

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

%pylab inline
pylab.rcParams['figure.figsize'] = (12, 16)

%load_ext autotime

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [2]:
HEADER = ["fecha_dato", "ncodpers", "ind_empleado",
          "pais_residencia", "sexo", "age", "fecha_alta",
          "ind_nuevo", "antiguedad", "indrel", "ult_fec_cli_1t",
          "indrel_1mes", "tiprel_1mes", "indresi", "indext",
          "conyuemp", "canal_entrada", "indfall", "tipodom",
          "cod_prov", "nomprov", "ind_actividad_cliente",
          "renta", "segmento", "ind_ahor_fin_ult1",
          "ind_aval_fin_ult1", "ind_cco_fin_ult1",
          "ind_cder_fin_ult1", "ind_cno_fin_ult1",
          "ind_ctju_fin_ult1", "ind_ctma_fin_ult1",
          "ind_ctop_fin_ult1", "ind_ctpp_fin_ult1",
          "ind_deco_fin_ult1", "ind_deme_fin_ult1",
          "ind_dela_fin_ult1", "ind_ecue_fin_ult1",
          "ind_fond_fin_ult1", "ind_hip_fin_ult1",
          "ind_plan_fin_ult1", "ind_pres_fin_ult1",
          "ind_reca_fin_ult1", "ind_tjcr_fin_ult1",
          "ind_valo_fin_ult1", "ind_viv_fin_ult1",
          "ind_nomina_ult1", "ind_nom_pens_ult1",
          "ind_recibo_ult1"]

time: 6.86 ms


In [3]:
def get_persons_of_feature(train):
    persons_of_feature = dict()
    for feat in HEADER[24:]:
        persons_of_feature[feat] = set(train.loc[train[feat] == 1]["ncodpers"])
    return persons_of_feature
# for feat in header[24:]:
#     print feat, len(persons_of_feature[feat])

time: 2.58 ms


In [4]:
def jacobian_similarity(persons_of_feature, featur1, featur2):
    feat1 = persons_of_feature[featur1]
    feat2 = persons_of_feature[featur2]
    len1 = len(feat1)
    len2 = len(feat2)
    intersection = len(feat1.intersection(feat2))
    union = len1 + len2 - intersection
    if len1 == 0 or len1 == 0:
        return 0.0
    return intersection * 1.0 / union

def cosine_similarity(persons_of_feature, featur1, featur2):
    feat1 = persons_of_feature[featur1]
    feat2 = persons_of_feature[featur2]
    len1 = len(feat1)
    len2 = len(feat2)
    intersection = len(feat1.intersection(feat2))
    # union = len1 + len2 - intersection
    if len1 == 0 or len2 == 0:
        return 0.0
    return intersection * 1.0 / math.sqrt(len1*len2)

time: 9.33 ms


In [5]:
def get_similarity_matrices(train_df):
    persons_of_feature = get_persons_of_feature(train_df)
    jacob_matrix = collections.defaultdict(dict)
    cosine_matrix = collections.defaultdict(dict)
    for i in range(24):
        jacob_matrix[i][i] = 1.0
        cosine_matrix[i][i] = 1.0
        for j in range(i + 1, 24):
            temp = jacobian_similarity(persons_of_feature, HEADER[i + 24], HEADER[j+24])
            jacob_matrix[i][j] = temp
            jacob_matrix[j][i] = temp

            temp1 = cosine_similarity(persons_of_feature, HEADER[i+24], HEADER[j+24])
            cosine_matrix[i][j] = temp1
            cosine_matrix[j][i] = temp1
    cosine_pd = pd.DataFrame(cosine_matrix)
    jacob_pd = pd.DataFrame(jacob_matrix)
    return jacob_pd, cosine_pd


time: 9.17 ms


In [6]:
def get_similarity_matrices_from_added(added_df):
    persons_of_feature = get_persons_of_feature_from_added(added_df)
    jacob_matrix = collections.defaultdict(dict)
    cosine_matrix = collections.defaultdict(dict)
    for i in range(24):
        jacob_matrix[i][i] = 1.0
        cosine_matrix[i][i] = 1.0
        for j in range(i + 1, 24):
            temp = jacobian_similarity(persons_of_feature, HEADER[i + 24], HEADER[j+24])
            jacob_matrix[i][j] = temp
            jacob_matrix[j][i] = temp

            temp1 = cosine_similarity(persons_of_feature, HEADER[i+24], HEADER[j+24])
            cosine_matrix[i][j] = temp1
            cosine_matrix[j][i] = temp1
    cosine_pd = pd.DataFrame(cosine_matrix)
    jacob_pd = pd.DataFrame(jacob_matrix)
    return jacob_pd, cosine_pd

def get_persons_of_feature_from_added(added):
    persons_of_feature = dict()
    for feat in HEADER[24:]:
        persons_of_feature[feat] = set(added.loc[added['added_product'] == feat]["ncodpers"])
    return persons_of_feature
# for feat in header[24:]:
#     print feat, len(persons_of_feature[feat])

time: 15.6 ms


In [7]:
def get_wts_from_mat(df, row):
    brow = map(bool, row)
    wts = df.loc[brow, :].apply(sum)
    wts[brow] = 0
    return wts

time: 2.47 ms


In [8]:
def get_similarity_features(file_names): #train_file, added_file):
    train_file = file_names[0]
    added_file = file_names[1]
    train = pd.read_csv(train_file, header=None, names=HEADER[1:2] + HEADER[24:], usecols=[1]+range(24,48))
    train.fillna(0, inplace=True)
    for col in HEADER[24:]:
        train[col] = train[col].astype(int)
    added_products = pd.read_csv(added_file, usecols=[0])
    jacob_pd, cosine_pd = get_similarity_matrices(train)
    train.set_index('ncodpers', inplace=True)
    train_final = pd.DataFrame()
    train_final['ncodpers'] = added_products['ncodpers']
    train_final.set_index('ncodpers', inplace=True)
    for col in HEADER[24:]:
        train_final['jacob_' + col] = [0.0]*train_final.shape[0]
        train_final['cosine_' + col] = [0.0]*train_final.shape[0]

    for i in tqdm(train_final.index):
        row = list(train.loc[i, HEADER[24:]])
        jwts = get_wts_from_mat(jacob_pd, row)
        cwts = get_wts_from_mat(cosine_pd, row)
        train_final.loc[i, ['jacob_' + h for h in HEADER[24:]]] = list(jwts)
        train_final.loc[i, ['cosine_' + h for h in HEADER[24:]]] = list(cwts)
    return train_final, file_names[2]

time: 16.5 ms


In [8]:
simi_feat = get_similarity_features('data/train_2015_05_28.csv', 'data/added_product_2015_05_28.csv')

100%|██████████| 41745/41745 [04:41<00:00, 148.17it/s]

time: 4min 45s





In [13]:
# def get_similarity_features_from_added(file_names): #train_file, added_file):
#     train_file = file_names[0]
#     similarity_file = file_names[1]
#     added_file = file_names[2]
#     train = pd.read_csv(train_file, header=None, names=HEADER[1:2] + HEADER[24:], usecols=[1]+range(24,48))
#     train.fillna(0, inplace=True)
#     for col in HEADER[24:]:
#         train[col] = train[col].astype(int)
#     added_products = pd.read_csv(added_file, usecols=[0])
#     similarity_df = pd.read_csv(similarity_file)
#     jacob_pd, cosine_pd = get_similarity_matrices_from_added(similarity_df)
#     train.set_index('ncodpers', inplace=True)
#     train_final = pd.DataFrame()
#     train_final['ncodpers'] = added_products['ncodpers']
#     train_final.set_index('ncodpers', inplace=True)
#     for col in HEADER[24:]:
#         train_final['jacob_added_' + col] = [0.0]*train_final.shape[0]
#         train_final['cosine_added_' + col] = [0.0]*train_final.shape[0]

#     for i in tqdm(train_final.index):
#         row = list(train.loc[i, HEADER[24:]])
#         jwts = get_wts_from_mat(jacob_pd, row)
#         cwts = get_wts_from_mat(cosine_pd, row)
#         train_final.loc[i, ['jacob_added_' + h for h in HEADER[24:]]] = list(jwts)
#         train_final.loc[i, ['cosine_added_' + h for h in HEADER[24:]]] = list(cwts)
#     return train_final, file_names[3], jacob_pd, cosine_pd

time: 21.3 ms


In [17]:
tdf, ind, j_pd, c_pd = get_similarity_features_from_added(('data/train_2015_05_28.csv', 'data/added_product_2015_04_28.csv', 'data/added_product_2015_05_28.csv', 1))

100%|██████████| 41745/41745 [05:25<00:00, 128.37it/s]

time: 5min 27s





---
# added in the last two months data set computation
---

In [11]:
def get_file_string_from_train_data(train_data):
    date_time = datetime.datetime(train_data[0], train_data[1], 28)
    return '_'.join(str(date_time.date()).split('-'))

def get_date_string_from_train_data(train_data):
    date_time = datetime.datetime(train_data[0], train_data[1], 28)
    return '-'.join(str(date_time.date()).split('-'))

def subtract_one_month(train_data):
    if train_data[1] == 1:
        return 2015, 12
    else:
        return train_data[0], train_data[1] - 1
    
som = subtract_one_month



def added_in_the_last_2_months(date, added_df):
    """
    date = (2015, 5)
    """
    
    print date
#     added_df = pd.read_csv('data/added_in_last_2_months.csv')
    current_month = get_file_string_from_train_data(date)
    last_month = get_file_string_from_train_data(som(date))
    last_last_month = get_date_string_from_train_data(som(som(date)))
    
    current_date = get_date_string_from_train_data(date)
    
    if date[:2] == (2016, 5):
        added = pd.read_csv('data/test_ver2.csv', usecols=[1])
    else:
        added = pd.read_csv('data/added_product_' + current_month + '.csv', usecols=[0])
    
    similarity_df = pd.read_csv('data/added_product_' + last_month + '.csv')
    jacob_pd, cosine_pd = get_similarity_matrices_from_added(similarity_df)
    
    train_final = pd.DataFrame()
    train_final['ncodpers'] = added['ncodpers']
    train_final.set_index('ncodpers', inplace=True)
    for col in HEADER[24:]:
        train_final['jacob_added_2_old_' + col] = [0.0]*train_final.shape[0]
        train_final['cosine_added_2_old_' + col] = [0.0]*train_final.shape[0]
    temp_added_df = added_df.loc[added_df.fecha_dato == current_date, :]
    temp_added_df.set_index('ncodpers', inplace=True)
    for i in tqdm(train_final.index):
#         row = get_added_row(date, i, added_df)
        try:
            row = temp_added_df.loc[i, HEADER[24:]].values
        except KeyError:
            row = [0]*24
        
        jwts = get_wts_from_mat(jacob_pd, row)
        cwts = get_wts_from_mat(cosine_pd, row)
        train_final.loc[i, ['jacob_added_2_old_' + h for h in HEADER[24:]]] = list(jwts)
        train_final.loc[i, ['cosine_added_2_old_' + h for h in HEADER[24:]]] = list(cwts)
#     train_final.fillna()
    del added_df
    gc.collect()
    return train_final, date[2]
        
def get_added_row(date, ncod, added_df):
    current_month = get_date_string_from_train_data(date)
    last_month = get_date_string_from_train_data(som(date))
    last_last_month = get_date_string_from_train_data(som(som(date)))
    
#     return [random.randint(0,1) for x in range(24)]

    added_row = added_df.loc[(added_df['fecha_dato'] == current_month) & (added_df['ncodpers']==ncod), HEADER[24:]].values
    if len(added_row) == 0:
        return [0]*24
    else:
        return added_row[0]
    
#     current_subs = main_train.loc[(main_train['fecha_dato'] == current_month) & (main_train['ncodpers'] == ncod), HEADER[24:]]
    
#     two_months_old = main_train.loc[(main_train['fecha_dato'] == last_last_month) & (main_train['ncodpers'] == ncod), HEADER[24:]]
#     if len(two_months_old) == 0:
#         two_months_old = main_train.loc[(main_train['fecha_dato'] == last_month) & (main_train['ncodpers'] == ncod), HEADER[24:]]
#         if len(two_months_old) == 0:
#             return [0]*24
#     added_row = [(t^r) & r for t, r in zip(two_months_old.values[0], current_subs.values[0])]
#     return added_row

    
    

time: 86.3 ms


In [34]:
def f(x):
    x.set_index('fecha_dato', inplace=True)
    ncod =  x['ncodpers'][0]
    temp = pd.DataFrame()
    temp['fecha_dato'] = x.index[2:]
    temp['ncodpers'] = [int(x.ncodpers[0])]*temp.shape[0]
    for col in HEADER[24:]:
        temp[col] = [0]*temp.shape[0]
#     print temp
    for i in temp.index:
        cur_date = temp.loc[i, 'fecha_dato']
#         print cur_date, 'cur date'
        old_date = get_date_string_from_train_data(som(som(tuple(map(int, cur_date.split('-')[:2])))))
#         print old_date, 'old date'
        cur = list(x.loc[cur_date, HEADER[24:]])
        try:
            old = list(x.loc[old_date, HEADER[24:]])
        except KeyError:
            # print ncod, 'keyerror'
            old = cur
        temp.loc[i, HEADER[24:]] = is_added(old, cur)
#         print cur, 'cur'
#         print is_added(old,cur), 'isadded'
    return temp

def is_added(old, cur):
    old = map(int, old)
    cur = map(int, cur)
    return [(o^c)& c for o, c in zip(old,cur)]

time: 20.9 ms


In [18]:
sorted_tr = pd.read_csv('data/sorted_train.csv', usecols=range(2)+range(24,48))
sorted_tr.fillna(0, inplace=True)
for col in HEADER[24:]:
    sorted_tr[col] = sorted_tr[col].astype(int)

g = sorted_tr.groupby('ncodpers')

gh = sorted_tr.head(200).groupby('ncodpers')

from joblib import Parallel, delayed
import multiprocessing


def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)




time: 30.1 s


In [40]:
x = applyParallel(g, f)

time: 2h 45min 44s


In [42]:
for col in ['ncodpers']+HEADER[24:]:
    x[col] = x[col].astype(int)

x.to_csv('data/added_in_last_2_months.csv', index=False)

make_submission('data/added_in_last_2_months.csv', upload_to_s3=True)

time: 12.6 s


In [10]:
added_df = pd.read_csv('data/added_in_last_2_months.csv')

time: 20.6 s


In [79]:
tr_df, ind = added_in_the_last_2_months((2015, 5, 1), added_df)

100%|██████████| 41745/41745 [02:25<00:00, 287.75it/s]

time: 2min 26s





In [102]:
class StateLoader(object):
    def __init__(self, added_df):
        self.added_df = added_df
    def __call__(self, date):
        return added_in_the_last_2_months(date, self.added_df)

time: 4.23 ms


In [13]:
dates = [(2015, 5, 1), (2015, 6, 2), (2015, 7, 3), (2015, 8, 4), (2015, 9, 5), (2015, 10, 6), (2015, 11, 7), (2015, 12, 8),
         (2016, 1, 9), (2016, 2, 10), (2016, 3, 11), (2016, 4, 12)]

time: 2.04 ms


In [14]:
# p = Pool(3)
results = map(lambda x: added_in_the_last_2_months(x, added_df), dates)
# p.close()
# p.join()

(2015, 5, 1)


100%|██████████| 41745/41745 [02:24<00:00, 289.63it/s]


(2015, 6, 2)


100%|██████████| 33362/33362 [01:59<00:00, 279.75it/s]


(2015, 7, 3)


100%|██████████| 29490/29490 [01:43<00:00, 284.14it/s]


(2015, 8, 4)


100%|██████████| 35696/35696 [02:01<00:00, 293.68it/s]


(2015, 9, 5)


100%|██████████| 39803/39803 [02:15<00:00, 294.00it/s]


(2015, 10, 6)


100%|██████████| 35215/35215 [02:01<00:00, 290.03it/s]


(2015, 11, 7)


100%|██████████| 42353/42353 [02:25<00:00, 290.42it/s]


(2015, 12, 8)


100%|██████████| 30956/30956 [01:48<00:00, 284.25it/s]


(2016, 1, 9)


100%|██████████| 49024/49024 [02:53<00:00, 282.06it/s]


(2016, 2, 10)


100%|██████████| 35203/35203 [02:00<00:00, 292.34it/s]


(2016, 3, 11)


100%|██████████| 32962/32962 [01:56<00:00, 283.20it/s]


(2016, 4, 12)


100%|██████████| 35843/35843 [02:04<00:00, 288.47it/s]

time: 25min 45s





In [117]:
gc.collect()

161

time: 198 ms


In [16]:
jacob_added_in_the_last_2_train = pd.concat(tuple([x[0] for x in results]))
jacob_added_in_the_last_2_train.shape

(441652, 48)

time: 55.1 ms


In [17]:
pickle.dump(jacob_added_in_the_last_2_train, open('data/15.Jacobian_similarity_features_added_in_last_2_months.train_data', 'wb'))
make_submission('data/15.Jacobian_similarity_features_added_in_last_2_months.train_data', upload_to_s3=True)

Compressing file
Compression done
Uploading file to s3 data/15.Jacobian_similarity_features_added_in_last_2_months.train_data.zip
uploading to s3 done
time: 23.1 s


In [18]:
jacob_added_in_the_last_2_test, ind = added_in_the_last_2_months((2016, 5, 1), added_df)

(2016, 5, 1)


100%|██████████| 929615/929615 [48:16<00:00, 320.89it/s]

time: 48min 20s





In [19]:
pickle.dump(jacob_added_in_the_last_2_test, open('data/15.Jacobian_similarity_features_added_in_last_2_months.test_data', 'wb'))
make_submission('data/15.Jacobian_similarity_features_added_in_last_2_months.test_data', upload_to_s3=True)

Compressing file
Compression done
Uploading file to s3 data/15.Jacobian_similarity_features_added_in_last_2_months.test_data.zip
uploading to s3 done
time: 56.1 s


In [20]:
description, train, added_products, label_encoder, encoders = pickle.load(open('../old_machine/home/ubuntu/santander/data/train_data.train_data', 'rb'))

time: 29.8 s


In [21]:
jacob_added_in_the_last_2_test.head()

Unnamed: 0_level_0,jacob_added_2_old_ind_ahor_fin_ult1,cosine_added_2_old_ind_ahor_fin_ult1,jacob_added_2_old_ind_aval_fin_ult1,cosine_added_2_old_ind_aval_fin_ult1,jacob_added_2_old_ind_cco_fin_ult1,cosine_added_2_old_ind_cco_fin_ult1,jacob_added_2_old_ind_cder_fin_ult1,cosine_added_2_old_ind_cder_fin_ult1,jacob_added_2_old_ind_cno_fin_ult1,cosine_added_2_old_ind_cno_fin_ult1,...,jacob_added_2_old_ind_valo_fin_ult1,cosine_added_2_old_ind_valo_fin_ult1,jacob_added_2_old_ind_viv_fin_ult1,cosine_added_2_old_ind_viv_fin_ult1,jacob_added_2_old_ind_nomina_ult1,cosine_added_2_old_ind_nomina_ult1,jacob_added_2_old_ind_nom_pens_ult1,cosine_added_2_old_ind_nom_pens_ult1,jacob_added_2_old_ind_recibo_ult1,cosine_added_2_old_ind_recibo_ult1
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15889,,,,,,,,,,,...,,,,,,,,,,
1170544,,,,,,,,,,,...,,,,,,,,,,
1170545,,,,,,,,,,,...,,,,,,,,,,
1170547,,,,,,,,,,,...,,,,,,,,,,
1170548,,,,,,,,,,,...,,,,,,,,,,


time: 34.4 ms
