In [1]:
# data manipulation
import numpy as np
import pandas as pd
import json
from scipy import sparse

In [2]:
# data pipeline
from sklearn.model_selection import StratifiedKFold, KFold

In [3]:
# metrics
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import roc_curve

In [4]:
# utils
from collections import Counter
import matplotlib.pyplot as plt

In [5]:
# preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.decomposition import PCA, TruncatedSVD

In [6]:
# models

from sklearn.neighbors import KNeighborsClassifier

<hr>

## Loading data and other useful stuffs

In [8]:
data_train = pd.read_json("data/train_dataset.jl", lines=True)

In [9]:
%%time
for sess in data_train["user_history"].values:
    for event in sess:
        event["event_timestamp"] = np.datetime64(event["event_timestamp"])

  This is separate from the ipykernel package so we can avoid doing imports until


Wall time: 23.5 s


In [10]:
data_items = pd.read_json("data/item_data.jl", lines=True)

In [11]:
data_test = pd.read_json("data/test_dataset.jl", lines=True)

In [12]:
%%time
for sess in data_test["user_history"].values:
    for event in sess:
        event["event_timestamp"] = np.datetime64(event["event_timestamp"])

  This is separate from the ipykernel package so we can avoid doing imports until


Wall time: 11.2 s


In [13]:
# item features

In [14]:
data_items.set_index("item_id", inplace = True)

In [15]:
data_items["domain_id"].nunique()

7893

In [16]:
data_items["price"].isna().mean()

0.00016125372631675084

In [17]:
data_items_price = data_items["price"]
data_items_price.dropna(inplace = True)

In [18]:
itemPrice = data_items_price.to_dict()        

itemDomain = data_items["domain_id"].to_dict()

for i, d in itemDomain.items():
    if d is None:
        itemDomain[i] = "<UNKN>"

itemCondition = data_items["condition"].map({"new" : 1, "used" : 0, None : -1}).to_dict()

for i, d in itemCondition.items():
    if d is None:
        print(a)

In [19]:
uhs_train = data_train["user_history"].values
target = data_train["item_bought"].values
uhs_test= data_test["user_history"].values

In [20]:
domain = np.vectorize(itemDomain.get)(target)

In [21]:
domainCode = dict()
for i, dom in enumerate(set(domain)):
    domainCode[dom] = i

In [22]:
pd.Series(domain).value_counts(normalize=True)

MLB-CELLPHONES            0.060678
MLB-SNEAKERS              0.035357
MLB-SUPPLEMENTS           0.023143
MLB-HEADPHONES            0.021911
MLB-SMARTWATCHES          0.019273
                            ...   
MLM-MEN_SPORT_SWIMWEAR    0.000002
MLM-LIP_GLOSSES           0.000002
MLM-FOOTBALL_SOCKS        0.000002
MLM-NAIL_SAMPLES          0.000002
MLM-SEAT_BELTS            0.000002
Length: 3214, dtype: float64

In [23]:
most_bought_domain = pd.Series(domain).value_counts().idxmax()

In [24]:
most_bought_domain

'MLB-CELLPHONES'

In [25]:
topDomainTop20itens = list(pd.Series(target[domain == most_bought_domain]).value_counts(ascending=False).index[:20])

In [26]:
domItemFreq = dict()
for i, d in zip(target, domain):
    domItemFreq[d] = domItemFreq.get(d, list()) + [i]

In [27]:
top20domItems = dict()
for d, ilist in domItemFreq.items():
    top20domItems[d] = list(pd.Series(ilist).value_counts().index[:20])

In [28]:
itemAsLabelCount = pd.Series(target).value_counts().to_dict()
most_sold = pd.Series(itemAsLabelCount).idxmax()

In [29]:
itemDomain[most_sold]

'MLB-HEADPHONES'

In [30]:
topDomainTop10itens = list(pd.Series(target[domain == most_bought_domain]).value_counts(ascending=False).index[:10])

In [31]:
topDomainTop10itens

[859574,
 1371799,
 119703,
 1332849,
 882697,
 1098739,
 98853,
 790888,
 967194,
 1595373]

In [32]:
cumsum = pd.Series(target).value_counts(normalize = True).cumsum()

In [33]:
# splitter = StratifiedKFold(n_splits = 2, shuffle = True, random_state=666)
# splits = list(splitter.split(uhs_train, target))

In [34]:
%%time

queries = []
viewed_items = []
viewed_domains = []
num_queries = []
for session in uhs_train:
    session_viewed_items = dict()
    session_viewed_domains = dict()
    session_queries = []
    s = 0
    for event in session:
        if event["event_type"] == "view":
            item_seen = event["event_info"]
            session_viewed_items[item_seen] = session_viewed_items.get(item_seen, 0) + 1
            domain_seen = itemDomain[item_seen]
            domain_seen = "<UKNW>" if domain_seen is None else domain_seen
            session_viewed_domains[domain_seen] = session_viewed_domains.get(domain_seen, 0) + 1
        else:
            session_queries.append(event["event_info"])
            s += 1
    viewed_items.append(session_viewed_items)
    viewed_domains.append(session_viewed_domains)
    queries.append(" ".join(session_queries))
    num_queries.append(s)

Wall time: 7.49 s


In [35]:
sessions_length = np.vectorize(len)(uhs_train)

In [36]:
num_viewed = np.vectorize(len)(viewed_items)

In [37]:
num_queries = np.array(num_queries)

In [38]:
top1item = []
isTopItem = np.zeros(len(target), int)
containItemLabel = np.zeros(len(target), int)
containDomainLabel = np.zeros(len(target), int)
for i, (session, sessDoms, label, labelDom) in enumerate(zip(viewed_items, viewed_domains, target, domain)):
    if len(session) > 0:
        most = max(session, key = lambda k : session[k])
    else:
        most = -1
        
    labelDom
    top1item.append(most)
    isTopItem[i] = 1 if most == label else 0
    containItemLabel[i] = 1 if label in session else 0
    containDomainLabel[i] = 1 if labelDom in sessDoms else 0

In [39]:
containItemLabel.mean(), containDomainLabel.mean()

(0.29388401187908886, 0.49340575027289474)

In [40]:
accuracy_score(target, top1item)

0.1568267245614927

<hr>

## Preprocessing data (Bags of Words: Session items, Session domains, Session queries words)

In [41]:
def normalize(curr):
    # remove accent
    curr = curr.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    # to lower case
    curr = curr.str.lower()
    # remove not alphanumerics or . ,
    curr = curr.str.replace('[^a-zA-Z0-9.,]', ' ')
    
    # let , and . be the same char
    curr = curr.str.replace('[.]', ',')
    
    # remove . , not between numbers
    curr = curr.str.replace('(?<=[0-9])[,]+(?=[0-9])', '.')
    curr = curr.str.replace('[,]', ' ')
    
    # set all digits to 0
#     curr = curr.str.replace('[0-9]', '0')
    
    # remove some Pt plurals
    curr = curr.str.replace('\\b([a-zA-Z]+[aeiouwy])(s)\\b', r'\1')
    
    # remove 4 consec (same) letters to just one
    curr = curr.str.replace(r'([a-zA-Z])\1{3,}', r'\1') # 3 is four? -> three of \1 after first \1... 
    
    return curr

In [42]:
sp1 = int(.8 * len(uhs_train))
sp2 = len(uhs_train)
uhs_all = np.concatenate([uhs_train, uhs_test], axis = 0)

In [43]:
# >> COLLECT LAST 20 ITEMS AND QUERIES
l20_items = []
l20_searches = []
l20_domains = []
num_queries = []
num_items = []
sess_len = []
for session in uhs_all:
    items = [event["event_info"] for event in session if event["event_type"] == "view"]
    searches = [event["event_info"] for event in session if event["event_type"] == "search"]
    l20_items.append({i : 1 for i in items[:-21:-1]})
    l20_domains.append({itemDomain.get(i, "<UNKN>") : 1 for i in items[:-21:-1]})
    l20_searches.append(" ".join(searches))
    num_queries.append(len(searches))
    num_items.append(len(items))
    sess_len.append(len(session))
print("Information collected from user histories")


# >> ITEMS OH
dv_items = DictVectorizer(dtype = int)
items_bow = dv_items.fit_transform(l20_items)
items_df = np.array(items_bow.sum(axis = 0))[0]
# (items_df == 1).sum() 
items_bow = items_bow[:, items_df > 1] # removing can improve top1 but worst the overall
# items_bow.shape
print("Items BOW created")

# >> DOMAIN OH
dv_domains = DictVectorizer()
domains_bow = dv_domains.fit_transform(l20_domains)
domains_df = np.array(domains_bow.sum(axis = 0))[0]
# print((domains_df == 1).sum()) 
# I think I am missing a line where I should remove these domains that only appear once
print("Domains BOW created")

# >> QUERIES OH
normalized = normalize(pd.Series(l20_searches))
cv_queries = CountVectorizer(binary = True, min_df = 5, max_df = .5)
queries_bow = cv_queries.fit_transform(normalized)
# docfreq = np.array(queries_bow.sum(axis = 0)).flatten() / queries_bow.shape[0]
# inv_vocab = {v : k for k,v in cv_queries.vocabulary_.items()}
# np.vectorize(inv_vocab.get)(np.argsort(docfreq)[-100:])
# np.sort(docfreq)[-10:]
print("Queries BOW created")




# queries_bowtf = TruncatedSVD(100).fit_transform(queries_bowtf)
# items_bowtf = TruncatedSVD(100).fit_transform(items_bowtf)
# domains_bowtf = TruncatedSVD(100).fit_transform(domains_bowtf)
# X = np.concatenate([queries_bowtf, items_bowtf, domains_bowtf], axis = 1)
# # works quite well



Information collected from user histories
Items BOW created
Domains BOW created
Queries BOW created


In [44]:
extras = np.column_stack([
    num_queries,
    num_items,
    sess_len
])
extras = extras / extras.sum(axis = 0)

# # these extras were wort -> top20 got dissipated

In [45]:
# BOW NORMALIZATION
# tft_queries = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)
# queries_bowtf = tft_queries.fit_transform(queries_bow)

# tft_items = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)
# items_bowtf = tft_queries.fit_transform(items_bow)

# tft_domains = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)
# domains_bowtf = tft_queries.fit_transform(domains_bow)
# print("BOWs normalized")

# CONCATENATING
X = sparse.hstack([queries_bow / 4, items_bow, domains_bow], format = 'csr')
tft_X = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
X = tft_X.fit_transform(X)
print("BOWs concatenated")

BOWs concatenated


In [46]:
# >> SIMPLIFY PROBLEM (1): FILTER CLASSES
# N_CLASSES = 1500
N_CLASSES = 40000
data_prop = cumsum.iloc[N_CLASSES]
print("Proportion of data used:", data_prop)
sel_classes = cumsum.index[:N_CLASSES].values
simple_target = target.copy()
simple_target[~pd.Series(simple_target).isin(sel_classes)] = -1
#pd.Series(simple_target).nunique()

X_train, y_train, yalt_train = X[:sp1], target[:sp1], simple_target[:sp1]
X_test, y_test, yalt_test = X[sp1:sp2], target[sp1:sp2], simple_target[sp1:sp2]
X_full, y_full, yalt_full = X[:sp2], target, simple_target
X_sub = X[sp2:]


mask_train = containItemLabel[:sp1] == 1
mask_full = containItemLabel == 1
# mask_train = containDomainLabel[:sp1] == 1
# mask_full = containDomainLabel == 1
# top20 gets better sorted with containItem mask

# SIMPLIFY PROBLEM (2): GETS EASIER SESSIONS 
X_train_simple = X_train[mask_train]
y_train_simple = y_train[mask_train]
yalt_train_simple = yalt_train[mask_train]

X_full_simple = X_full[mask_full]
y_full_simple = y_full[mask_full]
yalt_full_simple = yalt_full[mask_full]

print("Data splitted")



Proportion of data used: 0.9286092897955355
Data splitted


In [47]:
NN = 50

## Using KNN in a 5 fold split to generate candidates for all trainig data and avoid leakeage
(yeah... I know there is more effient way to do this without using folds)

In [261]:
%%time
splitter = KFold(n_splits = 5, shuffle = False)
splits = list(splitter.split(X_full, y_full, containItemLabel))

Wall time: 7 ms


In [262]:
%%time

for j, (train_index, test_index) in enumerate(splits):
    # test_index = test_index[:1000]
    
    X_train, y_train, yalt_train = X_full[train_index], y_full[train_index], yalt_full[train_index]
    X_test, y_test, yalt_test = X_full[test_index], y_full[test_index], yalt_full[test_index]
    
    mask_train = containItemLabel[train_index] == 1
    X_train_simple = X_train[mask_train]
    y_train_simple = y_train[mask_train]
    yalt_train_simple = yalt_train[mask_train]
    
    clf_knn_1 = KNeighborsClassifier(
        n_neighbors=10,
        n_jobs=-1,
        leaf_size=30,
        p=1,
        metric='cosine',
    ).fit(
        X_train_simple,
        yalt_train_simple,
    )
    
    dist_test, ind_test = clf_knn_1.kneighbors(X_test, NN)
    
    recomms_test = np.zeros((len(y_test), NN), int)
    for i in range(NN):
        recomms_test[:, i] = yalt_train_simple[ind_test[:,i]]

    knndists_test = dist_test[:, :NN]
    
    ind_df = pd.DataFrame(ind_test)
    ind_df.to_csv("data/knn/inds_%d.csv" % j, index = False, header = False)
    
    recomms_df = pd.DataFrame(recomms_test)
    recomms_df.to_csv("data/knn/recomms_%d.csv" % j, index = False, header = False)
    
    dists_df = pd.DataFrame(knndists_test)
    dists_df.to_csv("data/knn/dists_%d.csv" % j, index = False, header = False)
    
    # leave them splitted to create the good habit of generating the other features in the correct way
    
    print(accuracy_score(y_test, recomms_test[:, 0]))

0.12729781080197983
0.12729781080197983
0.12873791342441882
0.12658534224029433
0.12758979572078613
Wall time: 12min 16s


## Generating candidates for the submission

In [48]:
%%time
# TRAIN KNN
clf_knn_2 = KNeighborsClassifier(
    n_neighbors=10,
    n_jobs=-1,
    #leaf_size=30,
    p=1,
    metric='cosine',
).fit(
    X_full_simple,
    yalt_full_simple,
)

dist_sub, ind_sub = clf_knn_2.kneighbors(X_sub, NN)

Wall time: 6min 14s


In [51]:
recomms_sub = np.zeros((X_sub.shape[0], NN), int)
for i in range(NN):
    recomms_sub[:, i] = yalt_full_simple[ind_sub[:,i]]

knndists_sub = dist_sub[:, :NN]

In [52]:
ind_df = pd.DataFrame(ind_sub)
ind_df.to_csv("data/knn/inds_sub.csv", index = False, header = False)
    
recomms_df = pd.DataFrame(recomms_sub)
recomms_df.to_csv("data/knn/recomms_sub.csv", index = False, header = False)

dists_df = pd.DataFrame(knndists_sub)
dists_df.to_csv("data/knn/dists_sub.csv", index = False, header = False)