In [1]:
# data manipulation
import numpy as np
import pandas as pd
import json
from scipy import sparse

In [2]:
# data pipeline
from sklearn.model_selection import StratifiedKFold, KFold

In [3]:
# metrics
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import roc_curve

In [4]:
# utils
from collections import Counter
import matplotlib.pyplot as plt




In [53]:
# preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import igraph

In [6]:
# models

from sklearn.neighbors import KNeighborsClassifier

In [7]:
# hardcore models
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

<hr>

## Data loading and other useful stuffs

In [8]:
data_train = pd.read_json("data/train_dataset.jl", lines=True)

In [9]:
%%time
for sess in data_train["user_history"].values:
    for event in sess:
        event["event_timestamp"] = np.datetime64(event["event_timestamp"])

  This is separate from the ipykernel package so we can avoid doing imports until


Wall time: 24 s


In [10]:
data_items = pd.read_json("data/item_data.jl", lines=True)

In [11]:
data_test = pd.read_json("data/test_dataset.jl", lines=True)

In [12]:
%%time
for sess in data_test["user_history"].values:
    for event in sess:
        event["event_timestamp"] = np.datetime64(event["event_timestamp"])

  This is separate from the ipykernel package so we can avoid doing imports until


Wall time: 11.4 s


In [13]:
# item features

In [14]:
data_items.set_index("item_id", inplace = True)

In [15]:
data_items["domain_id"].nunique()

7893

In [16]:
data_items["price"].isna().mean()

0.00016125372631675084

In [17]:
data_items_price = data_items["price"]
data_items_price.dropna(inplace = True)

In [19]:
itemPrice = data_items_price.to_dict()        

itemDomain = data_items["domain_id"].to_dict()

for i, d in itemDomain.items():
    if d is None:
        itemDomain[i] = "<UNKN>"

itemCondition = data_items["condition"].map({"new" : 1, "used" : 0, None : -1}).to_dict()

for i, d in itemCondition.items():
    if d is None:
        print(a)

In [20]:
uhs_train = data_train["user_history"].values
target = data_train["item_bought"].values
uhs_test= data_test["user_history"].values

In [21]:
domain = np.vectorize(itemDomain.get)(target)

In [22]:
domainCode = dict()
for i, dom in enumerate(set(domain)):
    domainCode[dom] = i

In [23]:
pd.Series(domain).value_counts(normalize=True)

MLB-CELLPHONES                       0.060678
MLB-SNEAKERS                         0.035357
MLB-SUPPLEMENTS                      0.023143
MLB-HEADPHONES                       0.021911
MLB-SMARTWATCHES                     0.019273
                                       ...   
MLM-TONER_REFILLS                    0.000002
MLM-MOTORCYCLE_TURN_SIGNAL_LIGHTS    0.000002
MLM-WASTE_BASKETS                    0.000002
MLB-GARMENT_COVERS                   0.000002
MLM-SOLDERING_MACHINES               0.000002
Length: 3214, dtype: float64

In [24]:
most_bought_domain = pd.Series(domain).value_counts().idxmax()

In [25]:
most_bought_domain

'MLB-CELLPHONES'

In [26]:
topDomainTop20itens = list(pd.Series(target[domain == most_bought_domain]).value_counts(ascending=False).index[:20])

In [27]:
domItemFreq = dict()
for i, d in zip(target, domain):
    domItemFreq[d] = domItemFreq.get(d, list()) + [i]

In [28]:
top20domItems = dict()
for d, ilist in domItemFreq.items():
    top20domItems[d] = list(pd.Series(ilist).value_counts().index[:20])

In [29]:
itemAsLabelCount = pd.Series(target).value_counts().to_dict()
most_sold = pd.Series(itemAsLabelCount).idxmax()

In [30]:
itemDomain[most_sold]

'MLB-HEADPHONES'

In [31]:
topDomainTop10itens = list(pd.Series(target[domain == most_bought_domain]).value_counts(ascending=False).index[:10])

In [32]:
topDomainTop10itens

[859574,
 1371799,
 119703,
 1332849,
 882697,
 1098739,
 98853,
 790888,
 967194,
 1595373]

In [33]:
cumsum = pd.Series(target).value_counts(normalize = True).cumsum()

In [34]:
%%time

queries = []
viewed_items = []
viewed_domains = []
num_queries = []
for session in uhs_train:
    session_viewed_items = dict()
    session_viewed_domains = dict()
    session_queries = []
    s = 0
    for event in session:
        if event["event_type"] == "view":
            item_seen = event["event_info"]
            session_viewed_items[item_seen] = session_viewed_items.get(item_seen, 0) + 1
            domain_seen = itemDomain[item_seen]
            domain_seen = "<UKNW>" if domain_seen is None else domain_seen
            session_viewed_domains[domain_seen] = session_viewed_domains.get(domain_seen, 0) + 1
        else:
            session_queries.append(event["event_info"])
            s += 1
    viewed_items.append(session_viewed_items)
    viewed_domains.append(session_viewed_domains)
    queries.append(" ".join(session_queries))
    num_queries.append(s)

Wall time: 7.57 s


In [35]:
sessions_length = np.vectorize(len)(uhs_train)

In [36]:
num_viewed = np.vectorize(len)(viewed_items)

In [37]:
num_queries = np.array(num_queries)

In [38]:
top1item = []
isTopItem = np.zeros(len(target), int)
containItemLabel = np.zeros(len(target), int)
containDomainLabel = np.zeros(len(target), int)
for i, (session, sessDoms, label, labelDom) in enumerate(zip(viewed_items, viewed_domains, target, domain)):
    if len(session) > 0:
        most = max(session, key = lambda k : session[k])
    else:
        most = -1
        
    labelDom
    top1item.append(most)
    isTopItem[i] = 1 if most == label else 0
    containItemLabel[i] = 1 if label in session else 0
    containDomainLabel[i] = 1 if labelDom in sessDoms else 0

In [39]:
containItemLabel.mean(), containDomainLabel.mean()

(0.29388401187908886, 0.49340575027289474)

In [40]:
accuracy_score(target, top1item)

0.1568267245614927

<hr>

## Loading generated candidates

In [41]:
# load knn results from files and filter rec ammount

In [42]:
NN = 20
inds_folds = []
recomms_folds = []
dists_folds = []
for j in range(5):
    temp_inds = pd.read_csv("data/knn/inds_%d.csv" % j, header = None).values
    temp_recomms = pd.read_csv("data/knn/recomms_%d.csv" % j, header = None).values
    temp_dists = pd.read_csv("data/knn/dists_%d.csv" % j, header = None).values
    inds_folds.append(temp_inds[:, :NN])
    recomms_folds.append(temp_recomms[:, :NN])
    dists_folds.append(temp_inds[:, :NN])

<hr>

## Functions to add features to the ranker

In [43]:
def get_vista_compra(uhs_train, target):
    vista_compra = dict()
    for session, label in zip(uhs_train, target):
        unique_items = set(event["event_info"] for event in session if event["event_type"] == "view")
        for item in unique_items:
            if item not in vista_compra:
                vista_compra[item] = dict()
            vista_compra[item][label] = vista_compra[item].get(label, 0) + 1
    return vista_compra

def get_vista_compra_power(uhs_train, target):
    vista_compra = dict()
    for session, label in zip(uhs_train, target):
        unique_items = set(event["event_info"] for event in session if event["event_type"] == "view")
        for item in unique_items:
            if item not in vista_compra:
                vista_compra[item] = dict()
            vista_compra[item][label] = vista_compra[item].get(label, 0) + np.log(len(unique_items))
    return vista_compra

def get_vista_compra_l10(uhs_train, target):
    vista_compra = dict()
    for session, label in zip(uhs_train, target):
        view_events = [event for event in session if event["event_type"] == "view"]
        unique_items = set()
        unique_order = []
        for event in view_events[::-1]:
            item_seen = event["event_info"]
            if item_seen not in unique_items:
                unique_items.add(item_seen)
                unique_order.append(item_seen)
            if len(unique_order) >= 10:
                break
        
        for item in unique_items:
            if item not in vista_compra:
                vista_compra[item] = dict()
            vista_compra[item][label] = vista_compra[item].get(label, 0) + 1
    return vista_compra

def get_vista_count(uhs_train):
    vista_count = dict()
    for session in uhs_train:
        unique_items = set(event["event_info"] for event in session if event["event_type"] == "view")
        for item in unique_items:
            vista_count[item] = vista_count.get(item, 0) + 1
    return vista_count

def get_compra_count_power(uhs_train, target):
    compra_count = dict()
    for session, label in zip(uhs_train, target):
        unique_items = set(event["event_info"] for event in session if event["event_type"] == "view")
        if label in unique_items:
            compra_count[label] = compra_count.get(label, 0) + 1 + np.log(len(unique_items))
    return compra_count

In [179]:
def get_graphdata(uhs):
    graph_data = []
    
    for session in uhs:
        
        graph = igraph.Graph(directed = True)
        
        item_seq = [event["event_info"] for event in session if event["event_type"] == "view"]
        # lets start with just items
        
        sess_code = 0 # counter to name itens in sess code
        visited = dict() # stores sess codes
        stml = dict()
        item_seq_sess = [] # the actual sequence in sess code
        for item in item_seq:
            if item not in visited:
                visited[item] = sess_code
                stml[sess_code] = item
                sess_code += 1
                
            item_seq_sess.append(visited[item])
            
        source_nodes = item_seq_sess[:-1]
        target_nodes = item_seq_sess[1: ]
        edges = [(ns, nt) for ns, nt in zip(source_nodes, target_nodes)]
        graph.add_vertices(len(visited))
        graph.add_edges(edges)
        
        last = item_seq_sess[-1] if len(item_seq_sess) > 0 else None
        
        graph_data.append((graph, stml, visited, last))
        
    return  graph_data

In [180]:
%%time
graph_data_full = get_graphdata(uhs_train)

Wall time: 10.5 s


In [309]:
def get_sess_node_features(graph_data):
    sess_node_features = []

    for sess_id, (graph, stml, mlts, last) in enumerate(graph_data):

        graph_und = graph.copy()
        graph_und.to_undirected()

        features_list = [
            graph.indegree(),
            graph.outdegree(),
            graph.betweenness(),
            graph.pagerank(),
            graph_und.eigenvector_centrality(),
            list(np.array(graph_und.shortest_paths_dijkstra(target = last)).ravel()),
            graph_und.closeness(),
            
        ]

        features = np.column_stack(features_list)
        if features.shape != (len(stml), len(features_list)):
            print(features)

        node_features = dict()
        for i, f in enumerate(features):
            node_features[stml[i]] = f

        sess_node_features.append(node_features)
    
    return sess_node_features

In [310]:
%%time
sess_node_features_full = np.array(get_sess_node_features(graph_data_full))

Wall time: 1min 5s


In [311]:
num_gfeatures = len(next(iter(sess_node_features_full[0].values()))) # might not always work
num_gfeatures

7

In [312]:
def get_item_ds(uhs_train, vista_compra, vista_count, compra_count, domain_count,
                knnrecs, knndists, session_id, node_features_list):
    item_ds = []
    for sess_id, session, sess_knnrecs, sess_knndists, node_features in zip(session_id, X_test,
                                                                            knnrecs, knndists, node_features_list):

        unique_recs = []
        unique_recsdist = dict()
        unique_recsCount = dict()
        negOnes = 0
        for r, d in zip(sess_knnrecs, sess_knndists):
            if r != -1:
                if r not in unique_recsdist:
                    unique_recs.append(r)
                    unique_recsdist[r] = d
                    unique_recsCount[r]  = 1
                else:
                    unique_recsCount[r]  += 1
            else:
                negOnes += 1
        negOnetop = sess_knnrecs[0] == -1
        
        assert len(unique_recs) == len(unique_recsdist), "repeated knn recomms!"
                
        view_events = [event for event in session if event["event_type"] == "view"]
        
        num_queries = len(session) - len(view_events)
        unique_set = set()
        count = dict()
        unique_order = []
        insess_seq = dict()
        visited_twice = set()
        visited_twice_list = []
        special_selected = []
        for i, event in enumerate(view_events[::-1]):
            item_seen = event["event_info"]
            item = item_seen
            if item_seen not in unique_set:
                unique_set.add(item_seen)
                unique_order.append(item_seen)
            elif item not in visited_twice:
                visited_twice.add(item)
                visited_twice_list.append(item)
            count[item_seen] = count.get(item_seen, 0) + 1
            insess_seq[item_seen] = i

        sel_items = unique_order[:10]
        
        last_repeated = None
        if len(visited_twice_list) > 0:
            last_repeated = visited_twice_list[0]
        
        final_score = dict()
        final_scoreDom = dict()
        for vista in unique_order[:20]:
            for compra, qtd in vista_compra.get(vista, dict()).items():
                dom = itemDomain.get(compra, "<UNKN>")
                final_score[compra] = final_score.get(compra, 0) + qtd
                final_scoreDom[dom] = final_scoreDom.get(dom, 0) + qtd
        
        best = None
        best_dom = None
        most_seen = None
        last_seen = None # last_seen if best was not last
        pred_set = {}
        pred = []
        if len(sel_items) > 0:        
            best = max(sel_items, key = lambda k : (compra_count.get(k, 0), itemCondition.get(k, 0)))
            best_dom = itemDomain[best] # Careful with this line -> might need to change in the future
            pred = [best]
            pred_set = {best}

            count2 = count.copy()
            for p in pred:
                if p in count2:
                    count2.pop(p)    
            if len(count2) > 0:
                most_seen = max(count2, key = lambda k : (count2.get(k, 0), compra_count.get(k, 0)))
                pred = pred + [most_seen]
                pred_set.add(most_seen)


            for item in sel_items:
                if item not in pred_set and itemDomain[item] == best_dom:
                    last_seen = item
                    pred = pred + [last_seen]
                    pred_set.add(last_seen)
                break
                
        # get all suggestions:
        unique_recs_filter = [item for item in unique_recs if item not in pred_set]
        candids = pred + unique_recs_filter
        
        assert len(candids) == len(set(candids)), "repeated candidates!"

        global num_gfeatures
        # for each suggested item, get its features
        block = []
        for item in candids:
#             vcomp = vista_compra.get(item, dict()).get(item, 0)
            icdtion = itemCondition.get(item, -1)
            if icdtion not in {0, 1, -1}:
                icdtion = -1
            ifeats = [
                sess_id,
                item,
                # 2
                item == best,       # is best
                item == most_seen,  # is most viewed
                item == last_seen,  # is last
                insess_seq.get(item, len(session)),  # sess seq
                count.get(item, 0), # view count
                unique_recsdist.get(item, -1),       # dist
                itemPrice.get(item, -1),             # price
                compra_count.get(item, 0),           # compra count
                vista_count.get(item, 0),            # vista count
                unique_recsCount.get(item, 0),       # knn rec count
                # 12
                num_queries,
                itemDomain.get(item, -1) == best_dom, #
                itemDomain.get(item, -1) == itemDomain.get(most_seen, -2), #
                final_score.get(item, 0),
                icdtion,
                negOnes,
                negOnetop,
                item == last_repeated,
             ]
            ifeats.extend(list(node_features.get(item, -np.ones(num_gfeatures))))
            block.append(ifeats)


        # get session features

        # get item-session features

        item_ds.extend(block)

    return np.array(item_ds)

In [313]:
def get_extra_ids_features(itemDataSet):
    ids_df = pd.DataFrame(itemDataSet)
    
    # skip col 12
    item_cols = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,   13, 14, 15, 16, 17, 18,   20, 21, 22, 23, 
                 24, 25, 26
                ]
    id_sess_max = ids_df.groupby(0)[item_cols].max()
    id_sess_min = ids_df.groupby(0)[item_cols].min()
    id_sess_mean = ids_df.groupby(0)[item_cols].mean()
    # id_sess_nu = ids_df.groupby(0)[item_cols].nunique()
    id_sess_len = ids_df.groupby(0).size()

    id_sess_max.columns = [i + 50 for i in item_cols]
    id_sess_min.columns = [i + 100 for i in item_cols]
    id_sess_mean.columns = [i + 150 for i in item_cols]
    # id_sess_nu.columns = [i + 200 for i in item_cols]
    id_sess_len.rename(49, inplace=True);

    temp = pd.concat([
        id_sess_len,
        id_sess_max,
        id_sess_mean,
    #     id_sess_nu,
        id_sess_min
    ], axis = 1)
    ids_df = pd.merge(ids_df, temp, on=0)

    # for i in item_cols:
    #     # just do it for max -> min might have too many zeros
    #     ids_df[i + 50 + 100] = ids_df[i] / ( ids_df[i + 50] + 1)
    #     ids_df[i + 50 + 100].fillna(-1, inplace = True)

    # for i in item_cols:
    #     # this helps AUC, but in practice no better ranking
    #     tempdict = pd.Series(ids_df[i]).value_counts().to_dict()
    #     ids_df[i + 500] = np.vectorize(tempdict.get)(ids_df[i])
    return ids_df.values

<hr>

## Running the ranker on 10% of data and validating on the other 10%
(this is usefull for simplifying the tunning process)

In [45]:
splitter = KFold(n_splits = 5, shuffle = False)
splits = list(splitter.split(uhs_train, target, containItemLabel))

In [314]:
session_id = np.arange(len(uhs_train))

In [315]:
%%time
temp_yfolds = []
temp_Xfolds = []
for j, (train_index, test_index) in enumerate(splits):
    print(j)
    
    recomms_test = recomms_folds[j]
    knndists_test = dists_folds[j]
    
    
    X_train, y_train, ydom_train = uhs_train[train_index], target[train_index], domain[train_index]
    X_test, y_test, ydom_test = uhs_train[test_index], target[test_index], domain[test_index]
    session_id_test = session_id[test_index]
    sess_node_features_test = sess_node_features_full[test_index]
    
    print("splitted")

    vista_compra_train = get_vista_compra_l10(X_train, y_train) #l10 here improves dom pos2
    vista_count_train = get_vista_count(X_train)
    compra_count_train = pd.Series(y_train).value_counts()
    #     compra_count_train = get_compra_count_power(X_train, y_train)
    domain_count_train = pd.Series(ydom_train).value_counts()
    
    print("X_train processed")
    
    item_ds_test = get_item_ds(X_test,
        vista_compra_train, vista_count_train, compra_count_train, domain_count_train,
                               recomms_test, knndists_test, session_id_test, sess_node_features_test)
    
    print("Got ids")
    
    ids_real_targets = target[item_ds_test[:, 0].astype(int)]

    fold_y = 1 * (ids_real_targets == item_ds_test[:, 1].astype(int))
    
    fold_X = get_extra_ids_features(item_ds_test)
    
    temp_yfolds.append(fold_y)
    temp_Xfolds.append(fold_X)
    
    break

0
splitted
X_train processed
Got ids
Wall time: 48.4 s


In [316]:
X = np.concatenate(temp_Xfolds, axis = 0)
y = np.concatenate(temp_yfolds, axis = 0)

In [317]:
zp = int(len(X_test) * .5)
zp

41316

In [318]:
wp = -1
initial_sess_id = X[0, 0]
for i, sess_id in enumerate(X[:, 0]):
    if (sess_id - initial_sess_id) == zp:
        wp = i

In [319]:
ids_X_train, ids_y_train, ids_item_train, ids_sess_train = X[:wp,2:], y[:wp], X[:wp, 1], X[:wp, 0]
ids_X_test, ids_y_test, ids_item_test, ids_sess_test = X[wp:,2:], y[wp:], X[wp:, 1], X[wp:, 0]

In [320]:
clf_xgb = xgb.XGBClassifier(
    n_estimators = 50, # 30 wo extra features
    max_depth = 6,
    missing = -1,
    verbosity = 1,
    min_child_weight = 15,
#     objective='binary:logistic'
#     reg_alpha = 2,
#     reg_lambda = 8,
    
)
clf_xgb.fit(
    ids_X_train, ids_y_train,
    eval_set=[(ids_X_train, ids_y_train), (ids_X_test, ids_y_test)],
    eval_metric="auc", verbose=True,
#     sample_weight = ids_sw_train,
)

[0]	validation_0-auc:0.93227	validation_1-auc:0.92755
[1]	validation_0-auc:0.93350	validation_1-auc:0.92830
[2]	validation_0-auc:0.93474	validation_1-auc:0.92875
[3]	validation_0-auc:0.93741	validation_1-auc:0.93177
[4]	validation_0-auc:0.93996	validation_1-auc:0.93463
[5]	validation_0-auc:0.94082	validation_1-auc:0.93554
[6]	validation_0-auc:0.94167	validation_1-auc:0.93638
[7]	validation_0-auc:0.94207	validation_1-auc:0.93672
[8]	validation_0-auc:0.94364	validation_1-auc:0.93785
[9]	validation_0-auc:0.94425	validation_1-auc:0.93821
[10]	validation_0-auc:0.94473	validation_1-auc:0.93856
[11]	validation_0-auc:0.94531	validation_1-auc:0.93880
[12]	validation_0-auc:0.94578	validation_1-auc:0.93926
[13]	validation_0-auc:0.94623	validation_1-auc:0.93945
[14]	validation_0-auc:0.94678	validation_1-auc:0.93980
[15]	validation_0-auc:0.94784	validation_1-auc:0.94024
[16]	validation_0-auc:0.94834	validation_1-auc:0.94044
[17]	validation_0-auc:0.94887	validation_1-auc:0.94057
[18]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=15, missing=-1, monotone_constraints='()',
              n_estimators=50, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=1)

In [353]:
clf_cat = cat.CatBoostClassifier(
    eval_metric='AUC',
    metric_period = 10,
)
clf_cat.fit(
    ids_X_train, ids_y_train,
    eval_set=[(ids_X_train, ids_y_train), (ids_X_test, ids_y_test)],
#     eval_metric="auc", verbose=True,
)

Learning rate set to 0.152175
0:	test: 0.8608368	test1: 0.8547883	best: 0.8547883 (0)	total: 146ms	remaining: 2m 25s
10:	test: 0.9242070	test1: 0.9194293	best: 0.9194293 (10)	total: 1.09s	remaining: 1m 37s
20:	test: 0.9301891	test1: 0.9253986	best: 0.9253986 (20)	total: 2.03s	remaining: 1m 34s
30:	test: 0.9369064	test1: 0.9324935	best: 0.9324935 (30)	total: 2.99s	remaining: 1m 33s
40:	test: 0.9390662	test1: 0.9340278	best: 0.9340278 (40)	total: 3.94s	remaining: 1m 32s
50:	test: 0.9404391	test1: 0.9351637	best: 0.9351637 (50)	total: 4.89s	remaining: 1m 30s
60:	test: 0.9417875	test1: 0.9363305	best: 0.9363305 (60)	total: 5.85s	remaining: 1m 30s
70:	test: 0.9431764	test1: 0.9375978	best: 0.9375978 (70)	total: 6.82s	remaining: 1m 29s
80:	test: 0.9438618	test1: 0.9381276	best: 0.9381276 (80)	total: 7.79s	remaining: 1m 28s
90:	test: 0.9445166	test1: 0.9386119	best: 0.9386119 (90)	total: 8.76s	remaining: 1m 27s
100:	test: 0.9448540	test1: 0.9387626	best: 0.9387626 (100)	total: 9.7s	remaining:

910:	test: 0.9620550	test1: 0.9425947	best: 0.9426856 (850)	total: 1m 27s	remaining: 8.55s
920:	test: 0.9622165	test1: 0.9425736	best: 0.9426856 (850)	total: 1m 28s	remaining: 7.59s
930:	test: 0.9623265	test1: 0.9425558	best: 0.9426856 (850)	total: 1m 29s	remaining: 6.63s
940:	test: 0.9624604	test1: 0.9425596	best: 0.9426856 (850)	total: 1m 30s	remaining: 5.67s
950:	test: 0.9625502	test1: 0.9425503	best: 0.9426856 (850)	total: 1m 31s	remaining: 4.71s
960:	test: 0.9627184	test1: 0.9425623	best: 0.9426856 (850)	total: 1m 32s	remaining: 3.75s
970:	test: 0.9629260	test1: 0.9425836	best: 0.9426856 (850)	total: 1m 33s	remaining: 2.79s
980:	test: 0.9630524	test1: 0.9425749	best: 0.9426856 (850)	total: 1m 34s	remaining: 1.82s
990:	test: 0.9631656	test1: 0.9425669	best: 0.9426856 (850)	total: 1m 35s	remaining: 865ms
999:	test: 0.9632802	test1: 0.9425673	best: 0.9426856 (850)	total: 1m 36s	remaining: 0us

bestTest = 0.9426856065
bestIteration = 850

Shrink model to first 851 iterations.


<catboost.core.CatBoostClassifier at 0x2a73d567548>

In [335]:
clf_lgb = lgb.LGBMClassifier(
    n_estimators=200,
    min_child_weight=10
)
clf_lgb.fit(
    ids_X_train, ids_y_train,
    eval_set=[(ids_X_train, ids_y_train), (ids_X_test, ids_y_test)],
    eval_metric="auc", verbose=True,
#     early_stopping_rounds=20,
)

[1]	training's auc: 0.902065	training's binary_logloss: 0.0871024	valid_1's auc: 0.89739	valid_1's binary_logloss: 0.0876391
[2]	training's auc: 0.924077	training's binary_logloss: 0.0823455	valid_1's auc: 0.919233	valid_1's binary_logloss: 0.0830117
[3]	training's auc: 0.933659	training's binary_logloss: 0.0789373	valid_1's auc: 0.928893	valid_1's binary_logloss: 0.0796883
[4]	training's auc: 0.935012	training's binary_logloss: 0.0762547	valid_1's auc: 0.930263	valid_1's binary_logloss: 0.0771027
[5]	training's auc: 0.936434	training's binary_logloss: 0.0740833	valid_1's auc: 0.93198	valid_1's binary_logloss: 0.0749888
[6]	training's auc: 0.936936	training's binary_logloss: 0.0722753	valid_1's auc: 0.932489	valid_1's binary_logloss: 0.0732376
[7]	training's auc: 0.937493	training's binary_logloss: 0.0707986	valid_1's auc: 0.933117	valid_1's binary_logloss: 0.0718025
[8]	training's auc: 0.938006	training's binary_logloss: 0.0695214	valid_1's auc: 0.93357	valid_1's binary_logloss: 0.070

[66]	training's auc: 0.952837	training's binary_logloss: 0.0582602	valid_1's auc: 0.942781	valid_1's binary_logloss: 0.0619296
[67]	training's auc: 0.952923	training's binary_logloss: 0.0582125	valid_1's auc: 0.942828	valid_1's binary_logloss: 0.0619192
[68]	training's auc: 0.953036	training's binary_logloss: 0.0581583	valid_1's auc: 0.942854	valid_1's binary_logloss: 0.0619079
[69]	training's auc: 0.953178	training's binary_logloss: 0.0581115	valid_1's auc: 0.942856	valid_1's binary_logloss: 0.0619104
[70]	training's auc: 0.953257	training's binary_logloss: 0.0580644	valid_1's auc: 0.942859	valid_1's binary_logloss: 0.0619038
[71]	training's auc: 0.953332	training's binary_logloss: 0.0580266	valid_1's auc: 0.94289	valid_1's binary_logloss: 0.0618952
[72]	training's auc: 0.95342	training's binary_logloss: 0.0579871	valid_1's auc: 0.942877	valid_1's binary_logloss: 0.0618942
[73]	training's auc: 0.95351	training's binary_logloss: 0.0579368	valid_1's auc: 0.942909	valid_1's binary_loglos

[195]	training's auc: 0.961741	training's binary_logloss: 0.0533771	valid_1's auc: 0.943638	valid_1's binary_logloss: 0.0617744
[196]	training's auc: 0.961755	training's binary_logloss: 0.0533416	valid_1's auc: 0.94364	valid_1's binary_logloss: 0.0617762
[197]	training's auc: 0.961896	training's binary_logloss: 0.0533083	valid_1's auc: 0.943653	valid_1's binary_logloss: 0.0617753
[198]	training's auc: 0.961926	training's binary_logloss: 0.0532726	valid_1's auc: 0.943648	valid_1's binary_logloss: 0.0617766
[199]	training's auc: 0.961967	training's binary_logloss: 0.0532411	valid_1's auc: 0.943647	valid_1's binary_logloss: 0.0617814
[200]	training's auc: 0.962023	training's binary_logloss: 0.0532053	valid_1's auc: 0.943641	valid_1's binary_logloss: 0.061789


LGBMClassifier(min_child_weight=10, n_estimators=200)

In [324]:
prediction1 = clf_xgb.predict_proba(ids_X_test)[:, -1]
roc_auc_score(ids_y_test, prediction1)

0.9433878511198031

In [354]:
prediction2 = clf_cat.predict_proba(ids_X_test)[:, -1]
roc_auc_score(ids_y_test, prediction2)

0.9426856064599045

In [336]:
prediction3 = clf_lgb.predict_proba(ids_X_test)[:, -1]
roc_auc_score(ids_y_test, prediction3)

0.9436410264682491

In [355]:
prediction = (prediction1 + prediction2 + prediction3)
roc_auc_score(ids_y_test, prediction)

0.9447508839363452

In [356]:
predDetails = pd.DataFrame({"sess_id" : ids_sess_test.astype(int),
                            "item" : ids_item_test.astype(int), "score" : prediction})

In [357]:
ix_bests = predDetails.groupby("sess_id")["score"].idxmax()

In [358]:
accuracy_score(y_test[zp:], predDetails.loc[ix_bests, "item"]) # 7 feats + agg (bad lgbm?)

0.22097441731006606

In [359]:
sess_scores = dict()
for (sess_id, item), score in zip(predDetails[["sess_id", "item"]].values, predDetails["score"].values):
    if sess_id in sess_scores:
        sess_scores[sess_id].append((score, item))
    else:
        sess_scores[sess_id] = [(score, item)]

In [360]:
sess_ids = sorted(predDetails["sess_id"].unique())

In [361]:
top10prediction = []
for sess_id in sess_ids:
    x = sess_scores[sess_id]
    x = sorted(x, reverse = True)
    pred = [item for score, item in x]
    pred = pred + [-1] * 10
    pred = pred[:10]
    top10prediction.append(pred)

In [362]:
score = 0,
scoreDom = 0
score_tot = 0,
scoreDom_tot = 0
score_cum = 0
scoreDom_cum = 0
for i in range(10):
    score = accuracy_score(y_test[zp:], np.array(top10prediction)[:, i])
    score_cum += score
    score_tot += score / np.log(2 + i)
    scoreDom = accuracy_score(ydom_test[zp:], np.vectorize(lambda x : itemDomain.get(x, "<UNKN>"))(np.array(top10prediction)[:, i]))
    scoreDom_tot += scoreDom / np.log(2 + i)
    print("%.5f - %.5f - %.5f \t\t %.5f - %.5f" % (score, score_cum, score_tot,
                                                        scoreDom, scoreDom_tot))

print()

0.22097 - 0.22097 - 0.31880 		 0.41702 - 0.60163
0.04722 - 0.26819 - 0.36178 		 0.36280 - 0.93187
0.02052 - 0.28872 - 0.37659 		 0.34949 - 1.18398
0.01176 - 0.30048 - 0.38389 		 0.33843 - 1.39426
0.00784 - 0.30832 - 0.38827 		 0.31941 - 1.57252
0.00569 - 0.31401 - 0.39119 		 0.28848 - 1.72077
0.00387 - 0.31788 - 0.39306 		 0.25658 - 1.84416
0.00315 - 0.32103 - 0.39449 		 0.21730 - 1.94305
0.00220 - 0.32323 - 0.39544 		 0.17286 - 2.01812
0.00148 - 0.32471 - 0.39606 		 0.13595 - 2.07482



In [363]:
def predict_vc4(uhs_train, vista_compra, vista_count, compra_count, domain_count, knnrecs, knndists, ranked_recomms):
    N_SEARCH = 1000
    prediction = []
    dom_preds = []
    
    for session, recs, kds, sess_post in zip(uhs_train, knnrecs, knndists, ranked_recomms):
        
        sess_post = [item for item in sess_post if item != -1]
        
        assert len(sess_post) == len(set(sess_post)), "vish"
        
        unique_recs = []
        unique_recsdist = dict()
        for r, d in zip(recs, kds):
            if r != -1 and r not in unique_recsdist:
                unique_recs.append(r)
                unique_recsdist[r] = d
        
        view_events = [event for event in session if event["event_type"] == "view"]

            
        unique_set = set()
        count = dict()
        unique_order = []
        special_selected = []
        for i, event in enumerate(view_events[::-1]):
            item_seen = event["event_info"]
            if item_seen not in unique_set:
                unique_set.add(item_seen)
                unique_order.append(item_seen)
            count[item_seen] = count.get(item_seen, 0) + 1
#             if i == n_recent:
#                 special_selected = unique_order.copy()
            if len(unique_order) >= N_SEARCH:
                break
        sel_items = unique_order[:10]
        
        if len(sel_items) > 0:
            sel_dom_count = {}
            for item in sel_items:
                dom = itemDomain.get(item, "<UNKN>")
                sel_dom_count[dom] = sel_dom_count.get(dom, 0) + 1
            best_dom = max(sel_dom_count, key = sel_dom_count.get)
        else:
            best_dom = "<UNKN>"
        
        final_score = dict()
        final_scoreDom = dict()
        for vista in unique_order[:20]:
            for compra, qtd in vista_compra.get(vista, dict()).items():
                dom = itemDomain.get(compra, "<UNKN>")
                final_score[compra] = final_score.get(compra, 0) + qtd
                final_scoreDom[dom] = final_scoreDom.get(dom, 0) + qtd
        if len(final_scoreDom) > 0:
            best_dom = max(final_scoreDom, key = lambda x : (final_scoreDom.get(x, 0), sel_dom_count.get(x, 0)))
        
        pred = []
        pred_set = set()
        if len(sel_items) > 0:            
            extras = sorted(final_score, key = final_score.get, reverse = True)[:10]
            extras_filtered = {item for item in extras if item not in unique_set}
            
#             best = max(sel_items, key = lambda k : (compra_count.get(k, 0)))
            best_list = sorted(sel_items, key = lambda k : (compra_count.get(k, 0), itemCondition.get(k, 0)),
                               reverse = True)
            best = best_list[0]
            best_dom = itemDomain[best] # Careful with this line -> might need to change in the future
            pred = pred + [best]
            pred_set.add(best)
            
                
            top_posts = sess_post[:2]
            if best not in top_posts:
                pred = top_posts + pred
            else:
                pred = top_posts
                
            pred_set = set(pred)    
            best_dom = itemDomain[pred[0]]             



            leftings = unique_set | extras_filtered
            leftings = {item for item in leftings if (itemDomain[item] == best_dom and item not in pred_set)}
            tail = sorted(leftings, key = lambda k : (final_score.get(k, 0), compra_count.get(k, 0)), reverse = True)
            pred = pred + tail
            pred_set = set(pred)
            
            replacement = top20domItems.get(best_dom, [])
            replacement = [item for item in replacement if (item not in pred_set)]
            pred = pred + replacement
            pred = pred[:10]
        else:
            pred = []
        
        n_missing = 10 - len(pred)
        replacement = [item for item in sess_post if item not in pred]
        pred = pred + replacement[:n_missing]
        
        n_missing = 10 - len(pred)
        replacement = [item for item in unique_recs if item not in pred]
        pred = pred + replacement[:n_missing]
        
        n_missing = 10 - len(pred)
        replacement = [item for item in topDomainTop20itens if item not in pred]
        pred = pred + replacement[:n_missing]
        
        assert len(set(pred)) == 10, "item repetido"
        
        prediction.append(pred)
        dom_preds.append(best_dom)
        
    return prediction, dom_preds

In [364]:
prediction, dom_preds = predict_vc4(X_test[zp:],
        vista_compra_train, vista_count_train,
        compra_count_train, domain_count_train,
        recomms_test[zp:], knndists_test[zp:], top10prediction)

print("DOM:", accuracy_score(ydom_test[zp:], dom_preds))
score = 0,
scoreDom = 0
score_tot = 0,
scoreDom_tot = 0
score_cum = 0
scoreDom_cum = 0
for i in range(10):
    score = accuracy_score(y_test[zp:], np.array(prediction)[:, i])
    score_cum += score
    score_tot += score / np.log(2 + i)
    scoreDom = accuracy_score(ydom_test[zp:], np.vectorize(itemDomain.get)(np.array(prediction)[:, i]))
    scoreDom_tot += scoreDom / np.log(2 + i)
    print("%.5f - %.5f - %.5f \t\t %.5f - %.5f" % (score, score_cum, score_tot,
                                                        scoreDom, scoreDom_tot))

print()

DOM: 0.40973449185565264
0.22097 - 0.22097 - 0.31880 		 0.41702 - 0.60163
0.04722 - 0.26819 - 0.36178 		 0.36343 - 0.93244
0.01956 - 0.28775 - 0.37589 		 0.39964 - 1.22072
0.01297 - 0.30072 - 0.38395 		 0.41332 - 1.47753
0.00922 - 0.30995 - 0.38909 		 0.41087 - 1.70684
0.00733 - 0.31728 - 0.39286 		 0.40990 - 1.91749
0.00578 - 0.32306 - 0.39564 		 0.40782 - 2.11361
0.00571 - 0.32878 - 0.39824 		 0.40562 - 2.29822
0.00411 - 0.33289 - 0.40003 		 0.40310 - 2.47329
0.00392 - 0.33681 - 0.40167 		 0.40049 - 2.64030



<hr>

## Ranker training with more data

In [367]:
%%time
temp_yfolds = []
temp_Xfolds = []
for j, (train_index, test_index) in enumerate(splits):
    print(j)
    
    recomms_test = recomms_folds[j]
    knndists_test = dists_folds[j]
    
    
    X_train, y_train, ydom_train = uhs_train[train_index], target[train_index], domain[train_index]
    X_test, y_test, ydom_test = uhs_train[test_index], target[test_index], domain[test_index]
    session_id_test = session_id[test_index]
    sess_node_features_test = sess_node_features_full[test_index]
    
    print("splitted")

    vista_compra_train = get_vista_compra_l10(X_train, y_train) #l10 here improves dom pos2
    vista_count_train = get_vista_count(X_train)
    compra_count_train = pd.Series(y_train).value_counts()
    #     compra_count_train = get_compra_count_power(X_train, y_train)
    domain_count_train = pd.Series(ydom_train).value_counts()
    
    print("X_train processed")
    
    item_ds_test = get_item_ds(X_test,
        vista_compra_train, vista_count_train, compra_count_train, domain_count_train,
                               recomms_test, knndists_test, session_id_test, sess_node_features_test)
    
    print("Got ids")
    
    ids_real_targets = target[item_ds_test[:, 0].astype(int)]

    fold_y = 1 * (ids_real_targets == item_ds_test[:, 1].astype(int))
    
    fold_X = get_extra_ids_features(item_ds_test)
    
    temp_yfolds.append(fold_y)
    temp_Xfolds.append(fold_X)
    
    print("finished fold")

0
splitted
X_train processed
Got ids
finished fold
1
splitted
X_train processed
Got ids
finished fold
2
splitted
X_train processed
Got ids
finished fold
3
splitted
X_train processed
Got ids
finished fold
4
splitted
X_train processed
Got ids
finished fold
Wall time: 3min 58s


In [368]:
X = np.concatenate(temp_Xfolds, axis = 0)
y = np.concatenate(temp_yfolds, axis = 0)

In [369]:
zp = int(len(uhs_train) * .5)
zp

206581

In [370]:
wp = -1
initial_sess_id = X[0, 0]
for i, sess_id in enumerate(X[:, 0]):
    if (sess_id - initial_sess_id) == zp:
        wp = i

In [371]:
ids_X_train, ids_y_train, ids_item_train, ids_sess_train = X[:wp,2:], y[:wp], X[:wp, 1], X[:wp, 0]
ids_X_test, ids_y_test, ids_item_test, ids_sess_test = X[wp:,2:], y[wp:], X[wp:, 1], X[wp:, 0]

In [372]:
len(ids_X_train), len(ids_X_test)

(2892827, 2897337)

In [373]:
clf_xgb.fit(
    ids_X_train, ids_y_train,
    eval_set=[(ids_X_train, ids_y_train), (ids_X_test, ids_y_test)],
    eval_metric="auc", verbose=True,
#     sample_weight = ids_sw_train,
)

[0]	validation_0-auc:0.92968	validation_1-auc:0.92996
[1]	validation_0-auc:0.93144	validation_1-auc:0.93175
[2]	validation_0-auc:0.93197	validation_1-auc:0.93220
[3]	validation_0-auc:0.93627	validation_1-auc:0.93632
[4]	validation_0-auc:0.93735	validation_1-auc:0.93732
[5]	validation_0-auc:0.93904	validation_1-auc:0.93911
[6]	validation_0-auc:0.93935	validation_1-auc:0.93940
[7]	validation_0-auc:0.93967	validation_1-auc:0.93971
[8]	validation_0-auc:0.93991	validation_1-auc:0.93995
[9]	validation_0-auc:0.94017	validation_1-auc:0.94020
[10]	validation_0-auc:0.94040	validation_1-auc:0.94042
[11]	validation_0-auc:0.94083	validation_1-auc:0.94075
[12]	validation_0-auc:0.94104	validation_1-auc:0.94098
[13]	validation_0-auc:0.94162	validation_1-auc:0.94142
[14]	validation_0-auc:0.94191	validation_1-auc:0.94161
[15]	validation_0-auc:0.94239	validation_1-auc:0.94193
[16]	validation_0-auc:0.94256	validation_1-auc:0.94205
[17]	validation_0-auc:0.94329	validation_1-auc:0.94254
[18]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=15, missing=-1, monotone_constraints='()',
              n_estimators=50, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=1)

In [374]:
clf_cat.fit(
    ids_X_train, ids_y_train,
    eval_set=[(ids_X_train, ids_y_train), (ids_X_test, ids_y_test)],
#     eval_metric="auc", verbose=True,
)

Learning rate set to 0.226911
0:	test: 0.8750884	test1: 0.8758519	best: 0.8758519 (0)	total: 523ms	remaining: 8m 42s
10:	test: 0.9277928	test1: 0.9283274	best: 0.9283274 (10)	total: 3.6s	remaining: 5m 23s
20:	test: 0.9361684	test1: 0.9367793	best: 0.9367793 (20)	total: 6.8s	remaining: 5m 16s
30:	test: 0.9381610	test1: 0.9386225	best: 0.9386225 (30)	total: 10.2s	remaining: 5m 17s
40:	test: 0.9396588	test1: 0.9399586	best: 0.9399586 (40)	total: 13.6s	remaining: 5m 17s
50:	test: 0.9407918	test1: 0.9409705	best: 0.9409705 (50)	total: 16.9s	remaining: 5m 15s
60:	test: 0.9415648	test1: 0.9416751	best: 0.9416751 (60)	total: 20.2s	remaining: 5m 11s
70:	test: 0.9421728	test1: 0.9421572	best: 0.9421572 (70)	total: 23.6s	remaining: 5m 8s
80:	test: 0.9429292	test1: 0.9428928	best: 0.9428928 (80)	total: 26.9s	remaining: 5m 5s
90:	test: 0.9435390	test1: 0.9434040	best: 0.9434040 (90)	total: 30.2s	remaining: 5m 1s
100:	test: 0.9438036	test1: 0.9435677	best: 0.9435677 (100)	total: 33.5s	remaining: 4m 

900:	test: 0.9515485	test1: 0.9466324	best: 0.9466417 (860)	total: 4m 54s	remaining: 32.3s
910:	test: 0.9515987	test1: 0.9466355	best: 0.9466417 (860)	total: 4m 57s	remaining: 29.1s
920:	test: 0.9516472	test1: 0.9466313	best: 0.9466417 (860)	total: 5m	remaining: 25.8s
930:	test: 0.9516921	test1: 0.9466280	best: 0.9466417 (860)	total: 5m 4s	remaining: 22.5s
940:	test: 0.9517437	test1: 0.9466467	best: 0.9466467 (940)	total: 5m 7s	remaining: 19.3s
950:	test: 0.9517956	test1: 0.9466479	best: 0.9466479 (950)	total: 5m 10s	remaining: 16s
960:	test: 0.9518631	test1: 0.9466435	best: 0.9466479 (950)	total: 5m 14s	remaining: 12.7s
970:	test: 0.9519123	test1: 0.9466346	best: 0.9466479 (950)	total: 5m 17s	remaining: 9.48s
980:	test: 0.9519766	test1: 0.9466276	best: 0.9466479 (950)	total: 5m 20s	remaining: 6.21s
990:	test: 0.9520226	test1: 0.9466189	best: 0.9466479 (950)	total: 5m 24s	remaining: 2.94s
999:	test: 0.9520639	test1: 0.9466109	best: 0.9466479 (950)	total: 5m 27s	remaining: 0us

bestTest

<catboost.core.CatBoostClassifier at 0x2a73d567548>

In [375]:
clf_lgb.fit(
    ids_X_train, ids_y_train,
    eval_set=[(ids_X_train, ids_y_train), (ids_X_test, ids_y_test)],
    eval_metric="auc", verbose=True,
)

[1]	training's auc: 0.898601	training's binary_logloss: 0.087164	valid_1's auc: 0.898962	valid_1's binary_logloss: 0.0873183
[2]	training's auc: 0.924608	training's binary_logloss: 0.0825167	valid_1's auc: 0.925679	valid_1's binary_logloss: 0.0826777
[3]	training's auc: 0.932236	training's binary_logloss: 0.0791883	valid_1's auc: 0.932851	valid_1's binary_logloss: 0.0793483
[4]	training's auc: 0.932621	training's binary_logloss: 0.0765705	valid_1's auc: 0.933232	valid_1's binary_logloss: 0.0767274
[5]	training's auc: 0.932821	training's binary_logloss: 0.074487	valid_1's auc: 0.93344	valid_1's binary_logloss: 0.0746382
[6]	training's auc: 0.934713	training's binary_logloss: 0.0727754	valid_1's auc: 0.935234	valid_1's binary_logloss: 0.0729238
[7]	training's auc: 0.935201	training's binary_logloss: 0.0713361	valid_1's auc: 0.93573	valid_1's binary_logloss: 0.0714797
[8]	training's auc: 0.935391	training's binary_logloss: 0.0700973	valid_1's auc: 0.93591	valid_1's binary_logloss: 0.07023

[66]	training's auc: 0.946223	training's binary_logloss: 0.0605646	valid_1's auc: 0.945322	valid_1's binary_logloss: 0.0609943
[67]	training's auc: 0.946273	training's binary_logloss: 0.060542	valid_1's auc: 0.945361	valid_1's binary_logloss: 0.0609778
[68]	training's auc: 0.946322	training's binary_logloss: 0.0605211	valid_1's auc: 0.945373	valid_1's binary_logloss: 0.0609663
[69]	training's auc: 0.94642	training's binary_logloss: 0.0604957	valid_1's auc: 0.945446	valid_1's binary_logloss: 0.0609473
[70]	training's auc: 0.946468	training's binary_logloss: 0.0604756	valid_1's auc: 0.945472	valid_1's binary_logloss: 0.0609375
[71]	training's auc: 0.94656	training's binary_logloss: 0.060454	valid_1's auc: 0.945525	valid_1's binary_logloss: 0.060924
[72]	training's auc: 0.946593	training's binary_logloss: 0.060435	valid_1's auc: 0.945542	valid_1's binary_logloss: 0.0609157
[73]	training's auc: 0.946662	training's binary_logloss: 0.060415	valid_1's auc: 0.945596	valid_1's binary_logloss: 0

[131]	training's auc: 0.948552	training's binary_logloss: 0.0595626	valid_1's auc: 0.946533	valid_1's binary_logloss: 0.0605492
[132]	training's auc: 0.948572	training's binary_logloss: 0.0595488	valid_1's auc: 0.946548	valid_1's binary_logloss: 0.0605421
[133]	training's auc: 0.948602	training's binary_logloss: 0.0595359	valid_1's auc: 0.946575	valid_1's binary_logloss: 0.0605349
[134]	training's auc: 0.948626	training's binary_logloss: 0.0595229	valid_1's auc: 0.946586	valid_1's binary_logloss: 0.0605304
[135]	training's auc: 0.94866	training's binary_logloss: 0.0595096	valid_1's auc: 0.946598	valid_1's binary_logloss: 0.0605265
[136]	training's auc: 0.94868	training's binary_logloss: 0.0594988	valid_1's auc: 0.946601	valid_1's binary_logloss: 0.0605255
[137]	training's auc: 0.94871	training's binary_logloss: 0.0594881	valid_1's auc: 0.946611	valid_1's binary_logloss: 0.0605243
[138]	training's auc: 0.948743	training's binary_logloss: 0.0594782	valid_1's auc: 0.946614	valid_1's binar

[196]	training's auc: 0.950031	training's binary_logloss: 0.0588706	valid_1's auc: 0.946876	valid_1's binary_logloss: 0.0604419
[197]	training's auc: 0.950051	training's binary_logloss: 0.0588596	valid_1's auc: 0.946882	valid_1's binary_logloss: 0.0604394
[198]	training's auc: 0.950097	training's binary_logloss: 0.0588488	valid_1's auc: 0.946886	valid_1's binary_logloss: 0.0604365
[199]	training's auc: 0.950119	training's binary_logloss: 0.0588384	valid_1's auc: 0.946885	valid_1's binary_logloss: 0.0604367
[200]	training's auc: 0.950134	training's binary_logloss: 0.0588306	valid_1's auc: 0.946884	valid_1's binary_logloss: 0.0604373


LGBMClassifier(min_child_weight=10, n_estimators=200)

In [376]:
prediction1 = clf_xgb.predict_proba(ids_X_test)[:, -1]
roc_auc_score(ids_y_test, prediction1)

0.9465064362106773

In [377]:
prediction2 = clf_cat.predict_proba(ids_X_test)[:, -1]
roc_auc_score(ids_y_test, prediction2)

0.946647892835239

In [378]:
prediction3 = clf_lgb.predict_proba(ids_X_test)[:, -1]
roc_auc_score(ids_y_test, prediction3)

0.9468840881729528

In [379]:
prediction = (prediction1 + prediction2 + prediction3)
roc_auc_score(ids_y_test, prediction)

0.9474812602253828

In [380]:
predDetails = pd.DataFrame({"sess_id" : ids_sess_test.astype(int),
                            "item" : ids_item_test.astype(int), "score" : prediction})

In [381]:
ix_bests = predDetails.groupby("sess_id")["score"].idxmax()

In [382]:
accuracy_score(target[zp:], predDetails.loc[ix_bests, "item"])

0.22390140476905054

<hr>

## Submission

In [383]:
ind_sub = pd.read_csv("data/knn/inds_sub.csv", header = None).values[:, :20]
recomms_sub =  pd.read_csv("data/knn/recomms_sub.csv", header = None).values[:, :20]
knndists_sub = pd.read_csv("data/knn/dists_sub.csv", header = None).values[:, :20]

In [384]:
X_train, y_train, ydom_train = uhs_train, target, domain
X_test = uhs_test

vista_compra_train = get_vista_compra_l10(X_train, y_train) #l10 here improves dom pos2
vista_count_train = get_vista_count(X_train)
compra_count_train = pd.Series(y_train).value_counts()
#     compra_count_train = get_compra_count_power(X_train, y_train)
domain_count_train = pd.Series(ydom_train).value_counts()

In [385]:
graph_data_sub = get_graphdata(uhs_test)
sess_node_features_sub = np.array(get_sess_node_features(graph_data_sub))

In [386]:
dummy_session_id = np.arange(len(X_test))

In [387]:
%%time
item_ds_sub = get_item_ds(X_test,
        vista_compra_train, vista_count_train, compra_count_train, domain_count_train,
                          recomms_sub, knndists_sub, dummy_session_id, sess_node_features_sub)

Wall time: 1min 26s


In [388]:
item_ds_sub_plus = get_extra_ids_features(item_ds_sub)

In [389]:
prediction1 = clf_xgb.predict_proba(item_ds_sub_plus[:,2:])[:, -1]

In [390]:
prediction2 = clf_cat.predict_proba(item_ds_sub_plus[:,2:])[:, -1]

In [391]:
prediction3 = clf_lgb.predict_proba(item_ds_sub_plus[:,2:])[:, -1]

In [392]:
prediction = (prediction1 + prediction2 + prediction3)

In [393]:
ids_item, ids_sess = item_ds_sub_plus[:, 1], item_ds_sub_plus[:, 0]

In [394]:
predDetails = pd.DataFrame({"sess_id" : ids_sess.astype(int),
                            "item" : ids_item.astype(int), "score" : prediction})

In [395]:
ix_bests = predDetails.groupby("sess_id")["score"].idxmax()

In [396]:
sess_scores = dict()
for (sess_id, item), score in zip(predDetails[["sess_id", "item"]].values, predDetails["score"].values):
    if sess_id in sess_scores:
        sess_scores[sess_id].append((score, item))
    else:
        sess_scores[sess_id] = [(score, item)]

In [397]:
sess_ids = sorted(predDetails["sess_id"].unique())

In [398]:
top10prediction = []
for sess_id in sess_ids:
    x = sess_scores[sess_id]
    x = sorted(x, reverse = True)
    pred = [item for score, item in x]
    pred = pred + [-1] * 10
    pred = pred[:10]
    top10prediction.append(pred)

In [399]:
prediction, dom_preds = predict_vc4(X_test,
        vista_compra_train, vista_count_train,
        compra_count_train, domain_count_train,
        recomms_sub, knndists_sub, top10prediction)

In [400]:
submission = pd.DataFrame(prediction)

In [401]:
submission.to_csv("submissions/RANKER04.csv", sep = ',', header = False, index = False)

In [402]:
with open("submissions/RANKER04.csv", "r") as f:
    for i in range(3):
        print(f.readline())

1572239,1717880,1439187,1864599,350187,1006988,457681,758382,1652445,762780

849692,943786,1506643,17614,1898817,1199728,997446,1155969,123190,1010719

1453414,2032477,1614538,1362449,167018,1219935,539599,659207,1425924,1260633



In [None]:
# score