In [1]:
import h5py
import json
import dask
import warnings
import numpy as np
import pandas as pd
import multiprocessing
import dask.array as da
import tensorflow as tf
from dask import dataframe as dd
from nltk.tokenize import word_tokenize
from dask_ml.wrappers import Incremental
from dask_ml.decomposition import TruncatedSVD
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

# Suppress every RuntimeWarning raised for the remainder of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
# Load the three raw tables lazily with Dask.
articles = dd.read_csv('data/articles.csv')
transactions = dd.read_csv('data/transactions_train.csv')
customers = dd.read_csv('data/customers.csv')

# 90/10 random split of the transaction rows.
train, test = transactions.random_split([0.9, 0.1], random_state=43)

# Tag each test row with whether its customer also occurs in train.
# The train customer ids are de-duplicated before merging: joining against the
# raw, repeated ids is a many-to-many merge that multiplies every test row by
# the customer's number of train purchases, only for the copies to be thrown
# away again below.
df = test.merge(train[["customer_id"]].drop_duplicates(), on=["customer_id"], how="outer", indicator=True)

# Test rows of customers that never occur in train are appended to train
# (only the two id columns are carried over).
train = dd.concat([train, df[df['_merge'] == 'left_only'][["customer_id", "article_id"]]], axis=0, ignore_index=True, interleave_partitions=True, ignore_order=True)

# Keep only test rows whose customer is also present in train.
df = df[df['_merge'] == 'both'][["customer_id"]].drop_duplicates()
test = test.merge(df, how="inner", on="customer_id")

# One row per test customer with the list of article ids bought in the test split.
map_test = test.groupby(["customer_id"])['article_id'].apply(lambda x: list(x), meta=("article_ids",object)).reset_index().compute().drop_duplicates(subset=["customer_id"])
print("Len of data: ", len(transactions))
del transactions

## Feature Engineering

In [None]:
# Impute missing values in the customer and transaction tables.

# Missing indicator flags are replaced by 0.
for flag_col in ("FN", "Active"):
    customers[flag_col] = customers[flag_col].fillna(0)

# Missing ages take the column mean.
customers["age"] = customers["age"].fillna(customers["age"].mean())

# Missing values and the literal markers "NONE" / "None" all collapse to "not_regular".
news_frequency = customers["fashion_news_frequency"].fillna("not_regular")
customers["fashion_news_frequency"] = news_frequency.apply(
    lambda value: "not_regular" if value in ("NONE", "None") else value,
    meta=('fashion_news_frequency', 'object'),
)

# Missing prices take the column mean.
train["price"] = train["price"].fillna(train["price"].mean())

In [None]:
# Basic statistical information per customer: purchase count and price mean/std.

# Number of train rows per customer, stored in the column "bought_".
prod_count = (
    train.groupby("customer_id")
    .agg({"customer_id": "count"})
    .rename(columns={"customer_id": "bought_"})
    .reset_index()
)
customers = customers.merge(prod_count, on="customer_id", how="inner")

# Mean and standard deviation of the price per customer.
# NOTE(review): the rename below targets "customer_id", which does not look like
# a column label at this point, so it appears to be a no-op and the flattened
# names end up as "pricemean" / "pricestd" - confirm before relying on them.
prod_price = (
    train.groupby("customer_id")
    .agg({"price": ["mean", "std"]})
    .rename(columns={"customer_id": "price_"})
    .reset_index()
)
# Flatten the two-level column labels by concatenating both levels.
prod_price.columns = ["".join(levels) for levels in prod_price.columns.values]
customers = customers.merge(prod_price, on="customer_id", how="inner")
customers.head()

In [None]:
# Creating customer last 64 order history
def get_purchase_history(x):
    """Return the article ids of the 64 most recent purchases in ``x``, newest first.

    Args:
        x: Mapping-like object (e.g. a DataFrame group) with parallel
            ``"t_dat"`` and ``"article_id"`` sequences.

    Returns:
        list: At most 64 article ids ordered by descending ``t_dat``; rows with
        equal ``t_dat`` keep their original relative order.
    """
    dated_articles = list(zip(x["t_dat"], x["article_id"]))
    # A stable descending sort on the date keeps ties in input order.
    dated_articles.sort(key=lambda pair: pair[0], reverse=True)
    return [article for _, article in dated_articles[:64]]


# Build every customer's recent purchase list with Dask, materialise the result
# in pandas and keep a single row per customer.
purchase_history = (
    train.groupby(["customer_id"])
    .apply(get_purchase_history, meta=("article_ids", object))
    .reset_index()
    .compute()
    .drop_duplicates("customer_id")
)

In [None]:
# Converting articles to document
# Re-read the article table from CSV so this cell starts from the raw columns.
articles = dd.read_csv('data/articles.csv')
def article2doc(x):
    """Build one plain-text document describing an article row.

    The descriptive name fields and the free-text description are joined with
    ``". "``; a single trailing period is dropped, and digits, parentheses and
    square brackets are removed.

    Args:
        x: Row-like object exposing the attributes read below (``prod_name``
            through ``garment_group_name`` as strings, plus ``detail_desc``,
            which may be missing).

    Returns:
        str: The cleaned document text.
    """
    # Deletion table for every digit (including '0') and the bracket characters.
    unwanted_chars = str.maketrans('', '', '0123456789()[]')

    def clean_doc(text):
        # One pass over the text instead of a chain of str.replace calls.
        return text.translate(unwanted_chars)

    fields = [x.prod_name, x.product_type_name, x.product_group_name,
              x.graphical_appearance_name, x.colour_group_name,
              x.perceived_colour_value_name, x.perceived_colour_master_name,
              x.department_name, x.index_name, x.index_group_name,
              x.section_name, x.garment_group_name]

    # A missing description is skipped rather than rendered as the text "nan".
    description = x.detail_desc
    if not pd.isna(description):
        fields.append(str(description))

    doc = '. '.join(fields)
    # Drop the final period only when there is one, so text that does not end
    # with "." is not truncated by a character.
    if doc.endswith('.'):
        doc = doc[:-1]
    return clean_doc(doc)

# Add a "doc" column holding the text produced by article2doc for each row.
articles["doc"] = articles.apply(article2doc, axis=1, meta=("doc","object"))

In [None]:
# Converting customer purchase history and product features to document

# article_id -> document text. The two needed columns are materialised once and
# zipped, instead of walking the Dask frame row by row with iterrows().
article_docs = articles[["article_id", "doc"]].compute()
prod_dict = dict(zip(article_docs["article_id"], article_docs["doc"]))
del article_docs

# customer_id -> the documents of the customer's recent purchases, space-joined.
customers_dict = {
    customer_id: " ".join(prod_dict[article_id] for article_id in article_ids)
    for customer_id, article_ids in zip(purchase_history["customer_id"], purchase_history["article_ids"])
}

# Products first, customers second: later cells rely on row positions in this file.
prod_doc_df = pd.DataFrame({"id": list(prod_dict.keys()), "doc": list(prod_dict.values()), "type": "product"})
customer_doc_df = pd.DataFrame({"id": list(customers_dict.keys()), "doc": list(customers_dict.values()), "type": "customer"})
doc_df = pd.concat([prod_doc_df, customer_doc_df])
doc_df.to_csv("data/corpus.csv", index=False)

# Release the large intermediates; the corpus is re-read from disk when needed.
del prod_dict, customers_dict, prod_doc_df, customer_doc_df, articles

## Demographic Score

In [None]:
# Popularity score: number of train rows per article, min-max scaled.
prod_sell_counts = (
    train.groupby("article_id")
    .size()
    .reset_index()
    .rename(columns={0: "sell_score"})
)
train = train.merge(prod_sell_counts, on="article_id", how="inner")

# (score - min) / (max - min)
lowest_score = train.sell_score.min()
score_range = train.sell_score.max() - lowest_score
train.sell_score = (train.sell_score - lowest_score) / score_range
train.head()

## Content Based Filtering

In [None]:
# For each document type, persist the mapping "id -> row position in the corpus file".
corpus = pd.read_csv("data/corpus.csv")
for doc_type, json_path in (("product", 'model_data/prod_id_dict.json'),
                            ("customer", 'model_data/customer_id_dict.json')):
    rows = corpus[corpus.type == doc_type]
    id_dict = dict(zip(rows.id.tolist(), rows.index.tolist()))
    with open(json_path, 'w') as fp:
        json.dump(id_dict, fp)
del corpus, rows, id_dict, doc_type, json_path

In [None]:
# doc2vec
# Load the corpus documents and strip the periods used as field separators.
corpus = pd.read_csv('data/corpus.csv', index_col=False)["doc"].apply(lambda x: x.replace('.','')).to_list()
# Each document is tagged with its row position in the corpus (as a string).
Documents = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(corpus)]
epochs = 10
cores = multiprocessing.cpu_count()

model = Doc2Vec(dm=0,
                vector_size=256,
                negative=5,
                hs=0,
                min_count=2,
                sample=0,
                workers=cores,
                epochs=epochs)

model.build_vocab(Documents)
# Train with ONE call so gensim applies its own learning-rate schedule across
# all epochs. Calling train() once per epoch while editing alpha/min_alpha by
# hand lets the first call decay the rate all the way down to min_alpha, after
# which every later call runs at a constant rate.
model.train(Documents,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.save("model_data/d2v.model")
print("Model Saved.")

## Content based Doc2Vec results

In [None]:
# Reload the trained model and split its normed document vectors into a
# product tensor and a customer tensor.
model = Doc2Vec.load("model_data/d2v.model")
model_vectors = model.dv.get_normed_vectors()

# The JSON values are the corpus row positions of the product documents.
with open("model_data/prod_id_dict.json","r") as f:
    product_positions = json.load(f)
prod_ids = list(product_positions.values())
del product_positions

# Product rows are selected by position; every remaining row is a customer.
prod_d2v_gpu = tf.convert_to_tensor(model_vectors[prod_ids])
customer_d2v_gpu = tf.convert_to_tensor(np.delete(model_vectors, prod_ids, axis=0))

In [None]:
# For every customer, find the most similar products by doc2vec cosine
# similarity and write the results to CSV in batches.

# Customers follow the products in the corpus, so the corpus row of the first
# customer equals the number of product rows.
customer_id_start = int(prod_d2v_gpu.shape[0])
# Never ask top_k for more neighbours than there are products.
n_neighbours = min(150, customer_id_start)
step = 0
df_list = []
for i, customer_vec in enumerate(customer_d2v_gpu):
    # tf.losses.cosine_similarity is a loss: it returns the NEGATIVE cosine
    # similarity, so it is negated before taking the largest values. The
    # current customer's vector is scored (not always the first customer's).
    similarities = -tf.losses.cosine_similarity(customer_vec, prod_d2v_gpu)
    values, indices = tf.math.top_k(similarities, k=n_neighbours)
    df_list.append(pd.DataFrame({"customer_id": i + customer_id_start,
                                 "prod_ids": indices.numpy(),
                                 "similarities": values.numpy()}))
    if i % 25000 == 0 and i != 0:
        df = pd.concat(df_list)
        df.to_csv(f"model_data/d2v_sims/{step}.csv", index=False)
        step += 1
        df_list = []

    print('\r' + f'{i}: %{round(100*i/customer_d2v_gpu.shape[0], 2)}', end='')

# Write the customers left over after the last full batch.
if df_list:
    df = pd.concat(df_list)
    df.to_csv(f"model_data/d2v_sims/{step}.csv", index=False)
    step += 1
    df_list = []

## Content based TF-IDF results

In [None]:
# tf-idf word level implementation // it runs for about 11 minutes
corpus = pd.read_csv('data/corpus.csv', index_col=False)["doc"].apply(lambda text: text.replace('.', '')).tolist()
vectorizer = TfidfVectorizer(lowercase=True, max_features=8192, dtype=np.float32)
model_vectors = vectorizer.fit_transform(corpus)

# NOTE(review): both .toarray() calls below build dense matrices with up to
# 8192 columns - confirm they fit in memory for the full customer set.

# Product rows, selected by their corpus row positions, as a dense tensor.
with open("model_data/prod_id_dict.json","r") as f:
    product_positions = json.load(f)
prod_ids = list(product_positions.values())
del product_positions
prod_tf_gpu = tf.convert_to_tensor(model_vectors[prod_ids].toarray())
del prod_ids

# Customer rows, selected the same way, as a dense NumPy array.
with open("model_data/customer_id_dict.json","r") as f:
    customer_positions = json.load(f)
customer_ids = list(customer_positions.values())
del customer_positions
customer_tf = model_vectors[customer_ids].toarray()

In [None]:
# For every customer, find the most similar products by TF-IDF cosine
# similarity and write the results to CSV in batches.
# This loop reads the TF-IDF matrices (customer_tf / prod_tf_gpu); the doc2vec
# tensors are not involved.

# Customers follow the products in the corpus, so the corpus row of the first
# customer equals the number of product rows.
customer_id_start = int(prod_tf_gpu.shape[0])
# Never ask top_k for more neighbours than there are products.
n_neighbours = min(150, customer_id_start)
step = 0
df_list = []
for i, customer_vec in enumerate(customer_tf):
    customer_vec = tf.convert_to_tensor(customer_vec)
    # tf.losses.cosine_similarity is a loss: it returns the NEGATIVE cosine
    # similarity, so it is negated before taking the largest values.
    similarities = -tf.losses.cosine_similarity(customer_vec, prod_tf_gpu)
    values, indices = tf.math.top_k(similarities, k=n_neighbours)
    df_list.append(pd.DataFrame({"customer_id": i + customer_id_start,
                                 "prod_ids": indices.numpy(),
                                 "similarities": values.numpy()}))
    if i % 25000 == 0 and i != 0:
        df = pd.concat(df_list)
        df.to_csv(f"model_data/tf_idf_sims/{step}.csv", index=False)
        step += 1
        df_list = []

    print('\r' + f'{i}: %{round(100*i/customer_tf.shape[0], 2)}', end='')

# Write the customers left over after the last full batch.
if df_list:
    df = pd.concat(df_list)
    df.to_csv(f"model_data/tf_idf_sims/{step}.csv", index=False)
    step += 1
    df_list = []

In [None]:
# Reload only the id columns of the full transaction log.
transactions = dd.read_csv('data/transactions_train.csv')[["customer_id","article_id"]]

# Ids of all customer documents, in corpus order.
corpus = pd.read_csv("data/corpus.csv")
customer_ids = corpus[corpus.type == "customer"].id.tolist()
del corpus

# article id (as a string key) -> row position of the product in the corpus.
with open("model_data/prod_id_dict.json", "r") as f:
    prod_dict = json.load(f)

number_of_products = len(prod_dict)
# Replace every article id by its integer row position. The mapped values are
# integers, so the Dask meta is declared as int64 rather than "str".
transactions.article_id = transactions.article_id.apply(lambda x: prod_dict[str(x)], meta=("article_id","int64"))

In [None]:
# Collecting the distinct articles bought by one customer
# NOTE: this redefines the earlier get_purchase_history (64 most recent purchases).
def get_purchase_history(x):
    """Return the distinct values of ``x.article_id`` as a list.

    The order of the returned list is whatever iterating the intermediate set
    yields; it is not sorted.
    """
    unique_articles = set(x.article_id)
    return list(unique_articles)

# Per customer, the list of distinct article_id values; the graph is defined
# lazily and then computed.
purchase_matrix = (
    transactions
    .groupby("customer_id")
    .apply(get_purchase_history, meta=("history", "object"))
)
purchase_matrix_df = purchase_matrix.compute()
# Dropped only after compute() above has finished.
del prod_dict

In [None]:
# Write the binary customer x product purchase matrix to HDF5 in row batches.
step = 30000  # number of customers written per batch
number_of_customer = purchase_matrix_df.size

# The context manager closes the file even if a batch fails part-way through.
with h5py.File('model_data/correleation_data.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("matrix", (number_of_customer, number_of_products), dtype=np.int16, compression='gzip')

    # The range covers every customer, so the final (possibly shorter) batch is
    # written as well instead of being left as zeros.
    for start in range(0, number_of_customer, step):
        stop = min(start + step, number_of_customer)
        # Fill the batch in place rather than stacking one array per customer.
        customer_data = np.zeros((stop - start, number_of_products), dtype=np.int16)
        for row, ids in enumerate(purchase_matrix_df.iloc[start:stop]):
            # ids holds the product column positions bought by this customer.
            customer_data[row, ids] = 1
        dset[start:stop] = customer_data
        print('\r' + f'{stop}: %{round(100*stop/number_of_customer, 2)}', end='')

## Decomposition filtering