In [137]:
import h5py
import json
import dask
import warnings
import numpy as np
import pandas as pd
import multiprocessing
import dask.array as da
import tensorflow as tf
from dask import dataframe as dd
from scipy import sparse
from nltk.tokenize import word_tokenize
from dask_ml.wrappers import Incremental
from dask_ml.decomposition import TruncatedSVD
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

# Suppress every RuntimeWarning for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Lazily open the three raw tables with dask.
articles = dd.read_csv('data/articles.csv')
transactions = dd.read_csv('data/transactions_train.csv')
customers = dd.read_csv('data/customers.csv')

# Attach each customer's total number of transaction rows as `prod_number`.
customer_purchase_number = (
    transactions.groupby("customer_id")
    .size()
    .to_frame("prod_number")
    .reset_index()
)
transactions = transactions.merge(customer_purchase_number, on="customer_id", how="inner")

# Seeded 90/10 random split of the transaction rows.
train, test = transactions.random_split([0.9, 0.1], random_state=43)

# `_merge` tells, per row, whether the customer occurs in test only,
# train only, or both.
# NOTE(review): train ids are not de-duplicated before this merge, so matched
# test rows are repeated once per matching train row - confirm this is acceptable.
df = test.merge(train[["customer_id"]], on=["customer_id"], how="outer", indicator=True)

# Test rows whose customer is absent from train, or who has a single purchase,
# are appended to train (only the two id columns are carried over).
only_in_test = df._merge == 'left_only'
single_purchase = df.prod_number == 1
moved_rows = df[only_in_test | single_purchase][["customer_id", "article_id"]]

# Customers kept for evaluation: present on both sides with more than one purchase.
in_both = df._merge == 'both'
repeat_buyer = df.prod_number > 1
eval_customers = df[in_both & repeat_buyer][["customer_id"]].drop_duplicates()

train = dd.concat(
    [train, moved_rows],
    axis=0,
    ignore_index=True,
    interleave_partitions=True,
    ignore_order=True,
)
df = eval_customers
test = test.merge(df, how="inner", on="customer_id")

print("Len of data: ", len(transactions))
del transactions

## Feature Engineering

In [None]:
# Replace missing values in the customer and transaction tables.
def _normalise_news_frequency(value):
    """Map the literal strings "NONE" / "None" to "not_regular"; return anything else unchanged."""
    if value == "NONE" or value == "None":
        return "not_regular"
    return value


# Missing flags become 0; a missing age becomes the mean age.
customers["FN"] = customers["FN"].fillna(0)
customers["Active"] = customers["Active"].fillna(0)
customers["age"] = customers["age"].fillna(customers["age"].mean())

# Missing newsletter frequency and the "NONE"/"None" spellings collapse into one label.
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].fillna("not_regular")
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].apply(
    _normalise_news_frequency,
    meta=('fashion_news_frequency', 'object'),
)

# Rows without a price receive the mean price of train.
train["price"] = train["price"].fillna(train["price"].mean())

In [None]:
# Basic statistical information extraction
# Number of train rows per customer, exposed as column "bought_".
prod_count = (
    train.groupby("customer_id")
    .agg({"customer_id": "count"})
    .rename(columns={"customer_id": "bought_"})
    .reset_index()
)
customers = customers.merge(prod_count, on="customer_id", how="inner")

# Mean and standard deviation of the price paid per customer.
# NOTE(review): the rename below targets a "customer_id" column label; after this
# aggregation it may not match anything - confirm whether it is needed.
prod_price = (
    train.groupby("customer_id")
    .agg({"price": ["mean", "std"]})
    .rename(columns={"customer_id": "price_"})
    .reset_index()
)
# Collapse the two-level column labels into plain strings by concatenating the levels.
prod_price.columns = ["".join(levels) for levels in prod_price.columns.values]
customers = customers.merge(prod_price, on="customer_id", how="inner")
customers.head()

In [None]:
# Creating customer last 64 order history
def get_purchase_history(x, max_items=64):
    """Return the most recently bought article ids of one customer, newest first.

    Parameters
    ----------
    x : mapping-like
        Object whose "t_dat" and "article_id" entries are equally long
        iterables (for example the DataFrame of one customer group).
    max_items : int, default 64
        Maximum number of article ids to return.

    Returns
    -------
    list
        Article ids ordered by descending "t_dat". Rows with equal dates keep
        their input order because the sort is stable.
    """
    dated_purchases = zip(x["t_dat"], x["article_id"])
    newest_first = sorted(dated_purchases, key=lambda pair: pair[0], reverse=True)
    return [article_id for _, article_id in newest_first[:max_items]]


# Build the per-customer history lazily with dask, materialise it as pandas,
# then keep a single row per customer.
history_by_customer = train.groupby(["customer_id"]).apply(
    get_purchase_history,
    meta=("article_ids", object),
)
purchase_history = history_by_customer.reset_index().compute()
purchase_history = purchase_history.drop_duplicates("customer_id")

In [None]:
# Converting articles to document
# Lazily (re)open the article catalogue with dask; the same file is also read
# in the first data-loading cell.
articles = dd.read_csv('data/articles.csv')
def article2doc(x):
    """Build one text document from the descriptive fields of an article row.

    The name/type/colour/department fields are joined with ". ", followed by
    the free-text description when one is present. A trailing period is
    removed, and every digit and bracket character is stripped.

    Parameters
    ----------
    x : row-like
        Object exposing the article columns as attributes (prod_name,
        product_type_name, ..., garment_group_name, detail_desc).

    Returns
    -------
    str
        The cleaned document.
    """
    def clean_doc(text):
        # Remove all ten digits and the bracket characters in one pass.
        return text.translate(str.maketrans('', '', '0123456789()[]'))

    fields = [x.prod_name, x.product_type_name, x.product_group_name, x.graphical_appearance_name,
              x.colour_group_name, x.perceived_colour_value_name, x.perceived_colour_master_name,
              x.department_name, x.index_name, x.index_group_name, x.section_name,
              x.garment_group_name]

    # A missing description (NaN/None) is skipped instead of being rendered as text.
    detail_desc = x.detail_desc
    if not pd.isna(detail_desc):
        fields.append(str(detail_desc))

    doc = '. '.join(fields)
    # Only drop the final character when it really is the sentence-ending period.
    if doc.endswith('.'):
        doc = doc[:-1]
    return clean_doc(doc)

# Row-wise document construction; `meta` declares the resulting name and dtype to dask.
doc_column = articles.apply(article2doc, axis=1, meta=("doc", "object"))
articles["doc"] = doc_column

In [None]:
# Converting customer purchase history and product features to document
# article_id -> article document. Only the two needed columns are materialised
# and paired column-wise, instead of building one Series per row with iterrows().
article_docs = articles[["article_id", "doc"]].compute()
prod_dict = dict(zip(article_docs["article_id"], article_docs["doc"]))

# customer_id -> the documents of every article in that customer's history,
# joined with single spaces. An article id missing from prod_dict raises KeyError.
customers_dict = {
    customer_id: " ".join(prod_dict[article_id] for article_id in article_ids)
    for customer_id, article_ids in zip(purchase_history["customer_id"], purchase_history["article_ids"])
}

# Product documents come first in the corpus file, customer documents after them.
prod_doc_df = pd.DataFrame({"id": list(prod_dict.keys()), "doc": list(prod_dict.values()), "type": "product"})
customer_doc_df = pd.DataFrame({"id": list(customers_dict.keys()), "doc": list(customers_dict.values()), "type": "customer"})
doc_df = pd.concat([prod_doc_df, customer_doc_df])
doc_df.to_csv("data/corpus.csv", index=False)

del prod_dict, customers_dict, prod_doc_df, customer_doc_df, articles, article_docs

## Demographic Score

In [None]:
# Number of train rows per article, min-max rescaled into "sell_score".
prod_sell_counts = (
    train.groupby("article_id")
    .size()
    .reset_index()
    .rename(columns={0: "sell_score"})
)
raw_counts = prod_sell_counts["sell_score"]
lowest = raw_counts.min()
highest = raw_counts.max()
prod_sell_counts["sell_score"] = (raw_counts - lowest) / (highest - lowest)
prod_sell_counts.to_csv("model_data/demographic/scores.csv", index=False)

## Content Based Doc2Vec

In [None]:
# Persist, for products and for customers separately, a map from document id
# to the row position of that document in the corpus file.
corpus = pd.read_csv("data/corpus.csv")
for doc_type, out_path in (("product", 'model_data/prod_id_dict.json'),
                           ("customer", 'model_data/customer_id_dict.json')):
    subset = corpus[corpus.type == doc_type]
    id_dict = dict(zip(subset.id.tolist(), subset.index.tolist()))
    with open(out_path, 'w') as fp:
        json.dump(id_dict, fp)
del corpus, subset, id_dict, doc_type, out_path

In [None]:
# Load the documents, drop every period, then tokenise the lower-cased text.
# Each document is tagged with its row position (as a string).
raw_docs = pd.read_csv('data/corpus.csv', index_col=False)["doc"]
corpus = [text.replace('.', '') for text in raw_docs]
Documents = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(position)])
    for position, text in enumerate(corpus)
]

In [None]:
# doc2vec
epochs = 50
cores = multiprocessing.cpu_count()

# dm=0 selects the DBOW variant; negative sampling with 5 noise words,
# no hierarchical softmax, no down-sampling of frequent words.
model = Doc2Vec(dm=0,
                vector_size=512,
                negative=5,
                hs=0,
                min_count=2,
                sample=0,
                workers=cores)

model.build_vocab(Documents)

# Train with ONE call so gensim schedules the learning rate itself, decaying
# linearly from `alpha` to `min_alpha` across all epochs. Calling train()
# repeatedly in a loop with manual alpha arithmetic is discouraged by gensim:
# the first call already decays to `min_alpha` within one epoch and later calls
# run at a flat rate.
model.train(Documents,
            total_examples=model.corpus_count,
            epochs=epochs)

model.save("model_data/d2v.model")
print("Model Saved.")

## Preparing purchase history

In [None]:
# Rebuild the train split with the same statements and seed as the first
# data-loading cell (the test side is discarded at the end).
transactions = dd.read_csv('data/transactions_train.csv')
customer_purchase_number = (
    transactions.groupby("customer_id")
    .size()
    .to_frame("prod_number")
    .reset_index()
)
transactions = transactions.merge(customer_purchase_number, on="customer_id", how="inner")

train, test = transactions.random_split([0.9, 0.1], random_state=43)
df = test.merge(train[["customer_id"]], on=["customer_id"], how="outer", indicator=True)

# Test rows of customers missing from train, or with a single purchase, go to train.
only_in_test = df._merge == 'left_only'
single_purchase = df.prod_number == 1
moved_rows = df[only_in_test | single_purchase][["customer_id", "article_id"]]
train = dd.concat(
    [train, moved_rows],
    axis=0,
    ignore_index=True,
    interleave_partitions=True,
    ignore_order=True,
)

print("Len of data: ", len(transactions))
del transactions, test

In [None]:
# Document id -> corpus row maps, one for customers and one for products.
with open("model_data/customer_id_dict.json", "r") as f:
    customer_ids = json.loads(f.read())

with open("model_data/prod_id_dict.json", "r") as f:
    prod_ids = json.loads(f.read())

In [97]:
def _distinct_article_keys(article_ids):
    """Distinct article ids of one customer, each formatted as an integer string."""
    return [str(int(article_id)) for article_id in set(article_ids)]


def _customer_row(customer):
    """Corpus row of a customer id (KeyError if unknown)."""
    return customer_ids[customer]


def _product_rows(article_keys):
    """Corpus rows of the given article keys (KeyError if one is unknown)."""
    return [prod_ids[key] for key in article_keys]


# One row per customer with the distinct articles bought, computed with dask.
customer_hist = (
    train.groupby("customer_id")
    .article_id.apply(_distinct_article_keys, meta=('history', object))
    .reset_index()
    .compute()
)

# Translate customer and article ids into corpus row numbers.
customer_hist["customer_id"] = customer_hist["customer_id"].map(_customer_row)
customer_hist["history"] = customer_hist["history"].map(_product_rows)

# Order customers by their offset from the smallest customer row, then
# renumber the frame 0..n-1 and keep only the two working columns.
customer_hist["customer_map"] = customer_hist["customer_id"] - customer_hist["customer_id"].min()
customer_hist = customer_hist.sort_values(by="customer_map")
customer_hist = customer_hist.reset_index()[["customer_id", "history"]]

In [199]:
# Load the trained model and split its document vectors into product rows
# and customer rows.
model = Doc2Vec.load("model_data/d2v.model")
model_vectors = model.dv.get_normed_vectors()

# Only the file read needs the open handle; tensor work happens afterwards.
with open("model_data/prod_id_dict.json","r") as f:
    prod_ids = list(json.load(f).values())

# axis=1 normalises every product vector on its own. Without an axis the norm
# is taken over the whole matrix, which rescales all scores by one common factor.
prod_d2v_gpu = tf.nn.l2_normalize(tf.convert_to_tensor(model_vectors[prod_ids]), axis=1)
# Everything that is not a product row is a customer row, in corpus order.
customer_d2v = tf.convert_to_tensor(np.delete(model_vectors, prod_ids, axis=0))
del model, model_vectors

In [None]:
# Purchase mask: row r holds -1 in every product column found in the history
# of the r-th customer of customer_hist, and 0 elsewhere.
sparse_history = sparse.lil_matrix((customer_hist.shape[0], len(prod_ids)), dtype=int)
# Write each customer's history into that customer's OWN row, addressed by
# position (previously every history was written into row 0).
for row_pos, history in enumerate(customer_hist.history):
    sparse_history[row_pos, history] = -1

In [200]:
# Score every customer against every product and store the 150 best products
# per customer: dset[:, 0, :] holds product positions, dset[:, 1, :] the scores.
batch = 512
step = max(1, 15000 // batch)  # batches buffered in memory between two HDF5 writes
n_customers = customer_d2v.shape[0]

ptr = 0  # first dataset row that has not been written yet
score_temp = []
indices_temp = []
# The context manager closes the file even when scoring fails part-way.
with h5py.File('model_data/content/d2v.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("d2v", (n_customers, 2, 150), dtype=np.float32, compression='gzip')

    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.nn.l2_normalize(customer_d2v[batch_i:batch_i+batch], 1)
        batch_distances = tf.matmul(customer_batch, prod_d2v_gpu, transpose_b=True)
        # Adding the purchase mask lowers the score of flagged products by 1.
        batch_distances = tf.convert_to_tensor(sparse_history[batch_i:batch_i+batch].toarray(), dtype=tf.float32) + batch_distances

        # top_k works along the last axis, so the whole batch is ranked at once.
        values, indices = tf.math.top_k(batch_distances, k=150)
        score_temp.append(values.numpy())
        indices_temp.append(indices.numpy())

        # Flush every `step` batches and always after the final batch, so no
        # rows are lost when the customer count is a multiple of `batch` or
        # when there is only a single batch.
        is_last_batch = batch_i + batch >= n_customers
        if (i + 1) % step == 0 or is_last_batch:
            scores = np.concatenate(score_temp).astype(np.float32)
            top_indices = np.concatenate(indices_temp).astype(np.float32)
            dset[ptr:ptr+scores.shape[0],1,:] = scores
            dset[ptr:ptr+top_indices.shape[0],0,:] = top_indices
            ptr += scores.shape[0]
            score_temp = []
            indices_temp = []

        print('\r' + f'{i*batch}: %{round(100*i*batch/n_customers, 2)}', end='')
del customer_d2v, prod_d2v_gpu

1361920: %99.97

## Content based TfIdf results

In [None]:
# tf-idf word level implementation (original note: runs for about 11 minutes)
raw_docs = pd.read_csv('data/corpus.csv', index_col=False)["doc"]
corpus = [text.replace('.', '') for text in raw_docs]
del raw_docs

vectorizer = TfidfVectorizer(lowercase=True, max_features=8192, dtype=np.float32)
model_vectors = vectorizer.fit_transform(corpus)
del corpus

# Product rows of the tf-idf matrix, densified and handed to TensorFlow.
with open("model_data/prod_id_dict.json", "r") as f:
    prod_ids = list(json.load(f).values())
prod_tf_gpu = tf.convert_to_tensor(model_vectors[prod_ids].toarray())
del prod_ids

# Customer rows, densified as a NumPy array.
# NOTE(review): this is a dense float32 array with one row per customer and up
# to 8192 columns - check that it fits in memory.
with open("model_data/customer_id_dict.json", "r") as f:
    customer_ids = list(json.load(f).values())
customer_tf = model_vectors[customer_ids].toarray()
del customer_ids, model_vectors

In [None]:
# Score every customer against every product with tf-idf vectors and store the
# 150 best products per customer: dset[:, 0, :] holds product positions,
# dset[:, 1, :] the scores.
batch = 512
step = max(1, 15000 // batch)  # batches buffered in memory between two HDF5 writes
n_customers = customer_tf.shape[0]

ptr = 0  # first dataset row that has not been written yet
score_temp = []
indices_temp = []
# The file is opened here and closed by the context manager, so the cell no
# longer depends on a leftover `f` handle from another cell.
with h5py.File('model_data/content/tf_idf.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("tf_idf", (n_customers, 2, 150), dtype=np.float32, compression='gzip')

    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.convert_to_tensor(customer_tf[batch_i:batch_i+batch])
        batch_distances = tf.matmul(customer_batch, prod_tf_gpu, transpose_b=True)

        # top_k works along the last axis, so the whole batch is ranked at once.
        values, indices = tf.math.top_k(batch_distances, k=150)
        score_temp.append(values.numpy())
        indices_temp.append(indices.numpy())

        # Flush every `step` batches and always after the final batch, so no
        # rows are lost when the customer count is a multiple of `batch` or
        # when there is only a single batch.
        is_last_batch = batch_i + batch >= n_customers
        if (i + 1) % step == 0 or is_last_batch:
            scores = np.concatenate(score_temp).astype(np.float32)
            top_indices = np.concatenate(indices_temp).astype(np.float32)
            dset[ptr:ptr+scores.shape[0],1,:] = scores
            dset[ptr:ptr+top_indices.shape[0],0,:] = top_indices
            ptr += scores.shape[0]
            score_temp = []
            indices_temp = []

        print('\r' + f'{i*batch}: %{round(100*i*batch/n_customers, 2)}', end='')
del customer_tf, prod_tf_gpu

## Content Based Results

In [None]:
# Read the stored tf-idf results fully into memory so the HDF5 file can be
# closed right away instead of leaving the handle open.
with h5py.File('model_data/content/tf_idf.h5', 'r') as f:
    results = f["tf_idf"][...]
submission = pd.read_csv('data/sample_submission.csv', index_col=False)

with open("model_data/prod_id_dict.json","r") as f:
    prod_ids = json.load(f)
    n_products = len(prod_ids)
    # Invert to corpus row -> article id for decoding the stored positions.
    prod_ids = {v:k for k,v in prod_ids.items()}
with open("model_data/customer_id_dict.json","r") as f:
    customer_ids = json.load(f)
    # Product documents precede customer documents in the corpus, so removing
    # the product count turns a corpus row into a row of the customer-only
    # results (derived from the data instead of the hard-coded 105542).
    customer_ids = {k:v-n_products for k,v in customer_ids.items()}

In [None]:
# Build the prediction string for every submission row: the 12 best-ranked
# articles for known customers, the sample prediction otherwise.
predictions = []
results = np.asanyarray(results)
total_rows = submission.shape[0]
for i, customer, fallback in zip(submission.index, submission["customer_id"], submission["prediction"]):
    if customer in customer_ids:
        top_items = results[customer_ids[customer], 0, :12]
        predictions.append(' '.join(prod_ids[int(item)] for item in top_items))
    else:
        predictions.append(fallback)
    print('\r' + f'{i}: %{round(100*i/total_rows, 2)}', end='')

submission["prediction"] = predictions
submission.to_csv('submission.csv', index=False)

In [201]:
# Read the stored doc2vec results fully into memory so the HDF5 file can be
# closed right away instead of leaving the handle open.
with h5py.File('model_data/content/d2v.h5', 'r') as f:
    results = f["d2v"][...]
submission = pd.read_csv('data/sample_submission.csv', index_col=False)

with open("model_data/prod_id_dict.json","r") as f:
    prod_ids = json.load(f)
    n_products = len(prod_ids)
    # Invert to corpus row -> article id for decoding the stored positions.
    prod_ids = {v:k for k,v in prod_ids.items()}
with open("model_data/customer_id_dict.json","r") as f:
    customer_ids = json.load(f)
    # Product documents precede customer documents in the corpus, so removing
    # the product count turns a corpus row into a row of the customer-only
    # results (derived from the data instead of the hard-coded 105542).
    customer_ids = {k:v-n_products for k,v in customer_ids.items()}

In [202]:
# Build the prediction string for every submission row: the 12 best-ranked
# articles for known customers, the sample prediction otherwise.
predictions = []
results = np.asanyarray(results)
total_rows = submission.shape[0]
for i, customer, fallback in zip(submission.index, submission["customer_id"], submission["prediction"]):
    if customer in customer_ids:
        top_items = results[customer_ids[customer], 0, :12]
        predictions.append(' '.join(prod_ids[int(item)] for item in top_items))
    else:
        predictions.append(fallback)
    print('\r' + f'{i}: %{round(100*i/total_rows, 2)}', end='')

submission["prediction"] = predictions
submission.to_csv('submission.csv', index=False)

1371979: %100.0