In [None]:
import json
import warnings
import numpy as np
import pandas as pd
import multiprocessing
from dask import dataframe as dd
from sklearn.decomposition import NMF
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Open the three CSV inputs as dask dataframes.
articles = dd.read_csv('data/articles.csv')
transactions = dd.read_csv('data/transactions_train.csv')
customers = dd.read_csv('data/customers.csv')

# Seeded 90/10 random split of the transactions. The split is taken before the
# "predictions" column is added below, so train/test do not carry that column.
train, test = transactions.random_split([0.9, 0.1], random_state=43)

# Submission skeleton: one row per customer_id with an empty predictions string.
transactions["predictions"] = ''
submission = (
    transactions[["customer_id", "predictions"]]
    .drop_duplicates(subset=["customer_id"])
    .compute()
)

# For every customer in the held-out split, the list of article ids they appear with.
map_test = (
    test.groupby(["customer_id"])['article_id']
    .apply(list, meta=("article_ids", object))
    .reset_index()
    .compute()
    .drop_duplicates(subset=["customer_id"])
)

print("Len of data: ", len(transactions))
del transactions

In [None]:
def top_k_score(predics, actual, k=12):
    """Score a ranked prediction list against the first ``k`` actual items.

    Every prediction that occurs among the first ``k`` entries of ``actual``
    contributes ``occurrences / position`` (position is 1-based), where
    ``occurrences`` is how many times it appears in that truncated list.
    The contributions are summed and divided by the number of distinct items
    in the truncated list.

    Args:
        predics: Ranked sequence of predicted items.
        actual: Sequence of actual items; only ``actual[:k]`` is used.
        k: Number of leading actual items to consider.

    Returns:
        The score, or ``0`` when no prediction matches.
    """
    truth = actual[:k]
    weighted_hits = []
    for rank, item in enumerate(predics, start=1):
        occurrences = truth.count(item)
        if occurrences:
            weighted_hits.append(occurrences / float(rank))

    if not weighted_hits:
        return 0
    return np.sum(weighted_hits) / len(set(truth))

## Feature Engineering

In [None]:
# Replace missing values with defaults.
customers["FN"] = customers["FN"].fillna(0)
customers["Active"] = customers["Active"].fillna(0)
customers["age"] = customers["age"].fillna(customers["age"].mean())

# Missing newsletter frequency, and the literal strings "NONE"/"None",
# are all normalised to "not_regular".
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].fillna("not_regular")
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].apply(
    lambda value: "not_regular" if value in ("NONE", "None") else value,
    meta=('fashion_news_frequency', 'object'),
)

# Missing prices take the mean training price.
train["price"] = train["price"].fillna(train["price"].mean())

In [None]:
# Basic statistical information extraction
# Count of training transactions per customer; the count column is renamed to "bought_".
prod_count = train.groupby("customer_id").agg({"customer_id":"count"}).rename(columns={"customer_id":"bought_"}).reset_index()
# Inner merge: customers without a row in prod_count are dropped.
customers = customers.merge(prod_count, on="customer_id", how="inner")
# Per-customer mean and standard deviation of price.
# NOTE(review): the rename maps "customer_id" -> "price_", but the aggregated columns are
# derived from "price"; this looks like a no-op and may have been meant to rename "price" — confirm.
prod_price = train.groupby("customer_id").agg({"price":["mean", "std"]}).rename(columns={"customer_id":"price_"}).reset_index()
# Flatten the (column, statistic) name pairs by concatenating both parts with no separator.
prod_price.columns = list(map(''.join, prod_price.columns.values))
customers = customers.merge(prod_price, on="customer_id", how="inner")
customers.head()

In [None]:
# Creating customer last 64 order history
def get_purchase_history(x):
    """Return up to 64 article ids of one customer, most recent purchase first.

    Args:
        x: Mapping-like object (e.g. a DataFrame group) whose ``"t_dat"`` and
            ``"article_id"`` entries are parallel sequences.

    Returns:
        list: Article ids ordered by descending ``t_dat``, truncated to 64.
        Entries with equal ``t_dat`` keep their original relative order.
    """
    dated_articles = list(zip(x["t_dat"], x["article_id"]))
    # Stable sort on the date only; reverse=True keeps the order of ties.
    dated_articles.sort(key=lambda pair: pair[0], reverse=True)
    return [article for _, article in dated_articles[:64]]


# One row per customer holding the list returned by get_purchase_history.
purchase_history = (
    train.groupby(["customer_id"])
    .apply(get_purchase_history, meta=("article_ids", object))
    .reset_index()
    .compute()
    .drop_duplicates("customer_id")
)

In [None]:
# Converting articles to documents.
# Re-reads the articles CSV from the same path as the initial load, rebinding `articles`.
articles = dd.read_csv('data/articles.csv')
def article2doc(x):
    """Build one text document from the descriptive fields of an article row.

    The name and category fields are joined with ". " in a fixed order,
    followed by the free-text description when one is present. Digits 1-9,
    parentheses and square brackets are then removed from the joined text.

    Unlike a blind ``[:-1]`` slice, only a trailing full stop is removed from
    the description, and a missing description is skipped instead of being
    rendered as the text "na" (``str(nan)`` minus its last character).

    Args:
        x: Row-like object exposing the article columns as attributes
            (``prod_name``, ``product_type_name``, ..., ``detail_desc``).

    Returns:
        str: The cleaned document text.
    """
    # NOTE(review): '0' is not part of the removed set, exactly as before —
    # confirm whether keeping zeros is intended.
    unwanted_chars = str.maketrans('', '', '123456789()[]')

    def clean_doc(text):
        """Remove every unwanted character from ``text`` in a single pass."""
        return text.translate(unwanted_chars)

    parts = [
        x.prod_name,
        x.product_type_name,
        x.product_group_name,
        x.graphical_appearance_name,
        x.colour_group_name,
        x.perceived_colour_value_name,
        x.perceived_colour_master_name,
        x.department_name,
        x.index_name,
        x.index_group_name,
        x.section_name,
        x.garment_group_name,
    ]

    description = x.detail_desc
    # Missing (NaN/None) or empty descriptions contribute nothing to the document.
    if not pd.isna(description) and str(description):
        description = str(description)
        # Drop only a trailing full stop; never cut a real character.
        if description.endswith('.'):
            description = description[:-1]
        parts.append(description)

    return clean_doc('. '.join(parts))

# Attach the generated document text to every article row as column "doc".
articles = articles.assign(
    doc=articles.apply(article2doc, axis=1, meta=("doc", "object"))
)

## Demographic Score

In [None]:
# Number of training transactions per article, stored as "sell_score".
prod_sell_counts = (
    train.groupby("article_id")
    .size()
    .reset_index()
    .rename(columns={0: "sell_score"})
)
train = train.merge(prod_sell_counts, on="article_id", how="inner")

# Min-max scale the counts into the [0, 1] range.
train["sell_score"] = (
    (train["sell_score"] - train["sell_score"].min())
    / (train["sell_score"].max() - train["sell_score"].min())
)
train.head()

## Content Based Filtering

In [None]:
# Converting customer purchase history and product features to documents
# article_id -> document text of that article
prod_dict = {row.article_id: row.doc for _, row in articles.iterrows()}

# customer_id -> documents of the purchased articles, joined with single spaces
customers_dict = {
    row.customer_id: " ".join(prod_dict[article] for article in row.article_ids)
    for _, row in purchase_history.iterrows()
}

# Stack product rows and customer rows into one corpus table and write it to disk.
prod_doc_df = pd.DataFrame(
    {"id": prod_dict.keys(), "doc": prod_dict.values(), "type": "product"}
)
customer_doc_df = pd.DataFrame(
    {"id": customers_dict.keys(), "doc": customers_dict.values(), "type": "customer"}
)
doc_df = pd.concat([prod_doc_df, customer_doc_df])
doc_df.to_csv("data/corpus.csv", index=False)

del prod_dict, customers_dict, prod_doc_df, customer_doc_df

corpus = pd.read_csv("data/corpus.csv")

# Product id -> row index of that product inside the corpus file.
indexes = corpus[corpus.type == "product"].index.tolist()
ids = corpus[corpus.type == "product"].id.tolist()
id_dict = dict(zip(ids, indexes))
with open('model_data/prod_id_dict.json', 'w') as fp:
    json.dump(id_dict, fp)

# Customer id -> row index of that customer inside the corpus file.
# The filter has to select the "customer" rows: it previously repeated the
# "product" filter, so customer_id_dict.json was a copy of the product mapping.
indexes = corpus[corpus.type == "customer"].index.tolist()
ids = corpus[corpus.type == "customer"].id.tolist()
id_dict = dict(zip(ids, indexes))
with open('model_data/customer_id_dict.json', 'w') as fp:
    json.dump(id_dict, fp)
del corpus, indexes, ids, id_dict

In [None]:
# Word-level TF-IDF over the corpus (takes about 11 minutes to run)
corpus = [
    doc.replace('.', '')
    for doc in pd.read_csv('data/corpus.csv', index_col=False)["doc"]
]

# Vocabulary is capped at 8192 features; text is lower-cased by the vectorizer.
vectorizer = TfidfVectorizer(lowercase=True, max_features=8192)
tf_idf = vectorizer.fit_transform(corpus)
del corpus

# The matrix is converted to a dense array before being written.
with open('model_data/tf_idf.npy', 'wb') as f:
    np.save(f, tf_idf.toarray())

In [None]:
# doc2vec
# Every corpus row becomes one tagged document; the tag is the row position as a string.
corpus = pd.read_csv('data/corpus.csv', index_col=False)["doc"].apply(lambda x: x.replace('.','')).to_list()
Documents = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(corpus)]
epochs = 10
cores = multiprocessing.cpu_count()

# dm=0 selects the distributed-bag-of-words variant; negative sampling (5 noise
# words) is used instead of hierarchical softmax (hs=0).
model = Doc2Vec(dm=0,
                vector_size=256,
                negative=5,
                hs=0,
                min_count=2,
                sample=0,
                workers=cores,
                epochs=epochs)

model.build_vocab(Documents)

# Train with ONE call and let gensim decay the learning rate linearly over all
# epochs. The previous loop called train() once per epoch while editing
# model.alpha / model.min_alpha by hand; because neither was set in the
# constructor, the first pass decayed to the default minimum and later passes
# jumped back up to a nearly constant, high rate.
model.train(Documents,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.save("model_data/d2v.model")
print("Model Saved.")

In [None]:
import numpy as np

# Small 6x4 non-negative matrix used to try out NMF.
X = np.array([
    [1, 1, 0, 0],
    [2, 0, 1, 0],
    [3, 0, 1, 0],
    [4, 0, 1, 0],
    [5, 0, 1, 0],
    [0, 1, 0, 0],
])

# Factorise X with 3 components; the fixed random_state makes the random
# initialisation repeatable.
model = NMF(n_components=3, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [None]:
# NOTE(review): everything down to `H = model.components_` is a verbatim repeat of the
# toy NMF example in the preceding cell, and numpy is already imported in the first
# cell — candidate for removal.
import numpy as np
X = np.array([[1, 1, 0, 0],
              [2, 0, 1, 0],
              [3, 0, 1, 0],
              [4, 0, 1, 0],
              [5, 0, 1, 0],
              [0, 1, 0, 0]])

# Factorise the toy matrix with 3 components and a fixed random seed.
model = NMF(n_components=3, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

# Reload the transactions, keeping only the two columns used for the purchase matrix.
transactions = dd.read_csv('data/transactions_train.csv')[["customer_id","article_id"]]
corpus = pd.read_csv("data/corpus.csv")
customer_ids = corpus[corpus.type == "customer"].id.tolist()
del corpus
with open("model_data/prod_id_dict.json", "r") as f:
    prod_dict = json.load(f)

number_of_prdoducts = len(prod_dict)

# Replace every article id by its row index from prod_id_dict.json. JSON object
# keys are always strings, hence the str() on lookup. The looked-up values are
# integer row indices, so the result dtype is declared as int64 (it used to be
# declared as "str", which did not match the data).
transactions.article_id = transactions.article_id.apply(
    lambda x: prod_dict[str(x)],
    meta=("article_id", "int64"),
)


# Building a per-customer purchase-count vector over all products
def get_purchase_history(x):
    """Count how often each product index occurs in one customer's rows.

    This rebinds the name ``get_purchase_history`` that an earlier cell uses
    for a different function (the most-recent-article list).

    Args:
        x: Group of transactions exposing an ``article_id`` attribute whose
            values are (convertible to) integer product indices.

    Returns:
        numpy.ndarray: int16 vector of length ``number_of_prdoducts`` where
        position ``p`` holds the number of occurrences of product index ``p``.
    """
    customer_hist = np.zeros(number_of_prdoducts, dtype=np.int16)
    for position in map(int, x.article_id):
        customer_hist[position] += 1
    return customer_hist


# One count vector per customer (lazy), then materialised in memory.
purchase_matrix = transactions.groupby("customer_id").apply(
    get_purchase_history,
    meta=("history", "object"),
)
purchase_matrix_df = purchase_matrix.compute()