In [12]:
import random
import numpy as np
import pandas as pd
from datetime import timedelta
from dask import dataframe as dd
pd.options.mode.chained_assignment = None

train = False

In [13]:
transactions = dd.read_csv('data/transactions_train.csv', dtype={'article_id': int, 'customer_id': str})
if train:
    transactions = transactions[(transactions.t_dat >= '2020-03-17') & (transactions.t_dat <= '2020-09-15')]
    path = 'data/ensemble_train/'
else:
    path = 'data/ensemble/'
    transactions = transactions[transactions.t_dat >= '2020-03-24']
transactions.t_dat = dd.to_datetime(transactions.t_dat) - timedelta(2)
transactions["week"] = transactions.t_dat.dt.isocalendar().week
transactions.week = transactions.week.astype(int)
transactions = transactions.compute()
transactions["rebuy_count"] = transactions.groupby(["customer_id", "article_id"]).cumcount().astype(int)
transactions["rebuy_count"] = transactions.rebuy_count.apply(lambda x: x -1 if x > 0 else 0)

In [14]:
most_solds = transactions[transactions.week > transactions.week.max()-3].groupby(["article_id"]).agg({"article_id":"count"})\
                                   .rename(columns={"article_id":"_count"}).reset_index()\
                                   .sort_values('_count', ascending=False)
most_solds = most_solds.head(10000)
transactions = transactions[transactions.article_id.isin(most_solds.article_id)]
last_week_articles = transactions[transactions.week == transactions.week.max()].article_id.unique()
articles = pd.read_csv("data/articles.csv", dtype={'article_id': int})
articles.drop_duplicates(subset=['article_id'], inplace=True)
articles = articles[(articles.article_id.isin(most_solds.article_id)) & (articles.article_id.isin(last_week_articles))]
del most_solds, last_week_articles

In [15]:
active_customers = transactions.customer_id.unique()
customers = pd.read_csv("data/customers.csv", dtype={'customer_id': str})
customers.drop_duplicates(subset=['customer_id'], inplace=True)
customers = customers[customers.customer_id.isin(active_customers)]
del active_customers

In [16]:
def gender_classification(age):
    if age < 19:
        return 0
    elif age < 29:
        return 1
    elif age < 49:
        return 2
    elif age < 59:
        return 3
    elif age < 69:
        return 4
    else:
        return 5

customers["age"] = customers.age.fillna(np.mean(customers.age))
customers["age_bin"] = customers.age.map(gender_classification)

In [17]:
def gender_classification(section_name):
    if "womens" in section_name or "girl" in section_name or "ladies" in section_name:
        return "woman"
    elif "men" in section_name or "boy" in section_name or "boys" in section_name:
        return "man"
    else:
        return "other"

articles.section_name = articles.section_name.map(lambda x: x.lower())
articles["gender_group"] = articles.section_name.apply(gender_classification)

In [18]:
transactions = transactions.merge(articles[["article_id", "gender_group"]], on="article_id", how="inner")
transactions = transactions.merge(customers[["customer_id","age_bin"]], on="customer_id", how="inner")
customer_hist = transactions.groupby(by="customer_id").agg({"article_id": lambda x: list(x.values), "week": lambda x: list(x.values), "gender_group": lambda x : x.mode().iloc[0], "rebuy_count": "mean"}).reset_index()
customers = customers.merge(customer_hist, on="customer_id", how="left")
customers.article_id = customers.article_id.fillna("").apply(list)
customers.gender_group = customers.gender_group.fillna("other")
transactions.price = transactions.price.fillna(transactions.price.mean())
prod_price = transactions.groupby("customer_id").agg({"price":"mean"}).rename(columns={"customer_id":"price_"}).reset_index()
prod_price.columns = list(map(''.join, prod_price.columns.values))
customers = customers.merge(prod_price, on="customer_id", how="inner")
del customer_hist

In [19]:
def sort_customer_hist(week, article_id):
    customer_hist = np.asarray(sorted(zip(week, article_id), reverse=True))
    return customer_hist[:,0], customer_hist[:,1]

customers[["week", "article_id"]] = customers.apply(lambda x: sort_customer_hist(x.week, x.article_id), axis=1, result_type="expand")

In [20]:
customers.FN = customers.FN.fillna(0)
customers.Active = customers.Active.fillna(0)
customers.fashion_news_frequency = customers.fashion_news_frequency.fillna("not_regular")
customers.club_member_status = customers.club_member_status.fillna("no-info")
customers.fashion_news_frequency = customers.fashion_news_frequency.apply(lambda x: "not_regular" if x == "NONE" or x == "None" else x)
customers["numberOfArticles"] = customers.apply(lambda x: len(x.article_id), axis=1)
customers = customers.drop(columns=["postal_code"])
customers.sort_values(by="customer_id",inplace=True)

In [21]:
def map_fashion_news(name):
    return 1 if name == 'not_regular' else 0
def map_club_member(name):
    return 1 if name == 'ACTIVE' else 0

customers.fashion_news_frequency = customers.fashion_news_frequency.map(map_fashion_news)
customers.club_member_status = customers.club_member_status.map(map_club_member)

In [22]:
def article2doc(x):
    def clean_doc(text):
        unwanted_chars = ['1','2','3','4','5','6','7','8','9','(',')','[',']']
        for chr in unwanted_chars:
            text = text.replace(chr, '')
        return text

    doc =  '. '.join([x.prod_name, x.product_type_name, x.product_group_name, x.graphical_appearance_name, x.colour_group_name,\
                      x.perceived_colour_value_name, x.perceived_colour_master_name, x.department_name, x.index_name, x.index_group_name,\
                      x.section_name, x.garment_group_name, str(x.detail_desc)])[:-1]
    return(clean_doc(doc.lower()))

articles["doc"] = articles.apply(article2doc, axis=1)
article_info = transactions.groupby(["article_id"]).agg({"price":"mean", "rebuy_count":"mean", "age_bin": lambda x : x.mode().iloc[0]}).reset_index()
articles = articles.merge(article_info, on="article_id", how="inner")
most_solds = transactions[transactions.week > transactions.week.max()-3].groupby(["article_id"]).agg({"customer_id":"count"})\
                                      .rename(columns={"customer_id":"prod_sold_count"}).reset_index()
articles = articles.merge(most_solds, on="article_id", how="inner")
articles = articles[["article_id","doc","gender_group", "price", "rebuy_count", "age_bin", "prod_sold_count"]]
weekly_sales = transactions.groupby(["article_id", "week"]).agg({"customer_id":"count"}).reset_index()
last_week_sales = weekly_sales[weekly_sales.week == weekly_sales.week.max()]
weekly_sales = weekly_sales.merge(last_week_sales[["article_id","customer_id"]], on=["article_id"], how="inner")
weekly_sales["quotient"] = weekly_sales.customer_id_y / weekly_sales.customer_id_x
articles = articles.merge(weekly_sales[["article_id","quotient"]], on="article_id", how="inner")

del article_info, most_solds, weekly_sales, last_week_sales

In [23]:
customers["doc"] = customers.article_id.apply(lambda x: list(set([i for i in x])))
temp_dict = {}
for i,row in articles.iterrows():
    temp_dict[row.article_id] = row.doc

customers["doc"] = customers.doc.apply(lambda x:  ". ".join([temp_dict[i] for i in x]))
del temp_dict

In [24]:
customers.head(3)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,age_bin,article_id,week,gender_group,rebuy_count,price,numberOfArticles,doc
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,1,1,49.0,3,[568601043],[36],woman,0.0,0.050831,1,mariette blazer. blazer. garment upper body. o...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,1,1,25.0,1,"[826211002, 811927004, 811927004, 811925005, 8...","[28, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]",woman,0.0,0.022017,11,lazer razer brief. swimwear bottom. swimwear. ...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,1,1,24.0,1,"[794321007, 852643003, 852643001]","[37, 16, 16]",man,0.0,0.037271,3,pez sweater. sweater. garment upper body. soli...


In [25]:
articles.head()

Unnamed: 0,article_id,doc,gender_group,price,rebuy_count,age_bin,prod_sold_count,quotient
0,108775044,strap top. vest top. garment upper body. solid...,woman,0.007934,0.096085,1,10,0.75
1,108775044,strap top. vest top. garment upper body. solid...,woman,0.007934,0.096085,1,10,1.5
2,108775044,strap top. vest top. garment upper body. solid...,woman,0.007934,0.096085,1,10,0.333333
3,108775044,strap top. vest top. garment upper body. solid...,woman,0.007934,0.096085,1,10,0.142857
4,108775044,strap top. vest top. garment upper body. solid...,woman,0.007934,0.096085,1,10,0.230769


## Content Based Corpus Data Creating

In [26]:
doc_df1 = articles[["article_id","doc"]].rename(columns={"article_id":"doc_id"}).copy()
articles.drop(columns=["doc"], inplace=True)
doc_df2 = customers[["customer_id","doc"]].rename(columns={"customer_id":"doc_id"}).copy()
customers.drop(columns=["doc"], inplace=True)
doc_df1["type"] = "product"
doc_df2["type"] = "customer"
doc_df = doc_df1.append(doc_df2)
doc_df.to_csv(path+"corpus.csv", index=False)

del doc_df, doc_df1, doc_df2

## Collabrative Data Creating

last_week = 38

In [27]:
transactions[["customer_id","article_id","week"]].to_pickle("data/transactions.pkl")
del transactions

In [28]:
customers.FN = customers.FN.astype('category').cat.codes
customers.Active = customers.Active.astype('category').cat.codes
customers.club_member_status = customers.club_member_status.astype('category').cat.codes
customers.fashion_news_frequency = customers.fashion_news_frequency.astype('category').cat.codes
customers.age_bin = customers.age_bin.astype('category').cat.codes
customers.gender_group = customers.gender_group.astype('category').cat.codes
customers.age = (customers.age - customers.age.min()) / (customers.age.max() - customers.age.min())
customers.numberOfArticles = (customers.numberOfArticles - customers.numberOfArticles.min()) / (customers.numberOfArticles.max() - customers.numberOfArticles.min())
customers = customers.drop(columns=["article_id", "week"])
customers.drop_duplicates(subset=["customer_id"], inplace=True)
customers = customers.reset_index().rename(columns = {'index':'customer_index'})
customers.to_pickle(path+"customers.pkl")
print("Customers saved for prediction...")

Customers saved for prediction...


In [29]:
articles.gender_group = articles.gender_group.astype('category').cat.codes
articles.age_bin = articles.age_bin.astype('category').cat.codes
articles.price = (articles.price - articles.price.min()) / (articles.price.max() - articles.price.min())
articles.rebuy_count = (articles.rebuy_count - articles.rebuy_count.min()) / (articles.rebuy_count.max() - articles.rebuy_count.min())
articles.prod_sold_count = (articles.prod_sold_count - articles.prod_sold_count.min()) / (articles.prod_sold_count.max() - articles.prod_sold_count.min())
articles.quotient = (articles.quotient - articles.quotient.min()) / (articles.quotient.max() - articles.quotient.min())
articles.drop_duplicates(subset=["article_id"], inplace=True)
articles.to_pickle(path+"articles.pkl")
print("Articles saved for prediction...")

Articles saved for prediction...


In [30]:
transactions = pd.read_pickle("data/transactions.pkl")
transactions.article_id = transactions.article_id.astype(np.int64)
transactions.week = transactions.week.astype(int)
customer_hist_info = transactions.groupby(["customer_id","article_id"]).agg({"article_id":"count", "week":["max","min"]}).reset_index()
customer_hist_info.columns = list(map(lambda x: ''.join(x), customer_hist_info.columns))
customer_hist_info["avg_purchase_time"] = customer_hist_info.apply(lambda x: x.weekmax - x.weekmin / (x.article_idcount-1) if x.article_idcount > 1 else 0, axis=1)
customer_hist_info = customer_hist_info.rename(columns={"article_idcount":"same_prod_rebuy_count", "weekmax":"time_passed_last_purchase"}).reset_index()
customer_hist_info = customer_hist_info.drop(columns=["weekmin", "index"])
customer_hist_info.drop_duplicates(subset=["customer_id"], inplace=True)
customer_hist_info.to_pickle(path+"customer_hist.pkl")
print("Customer history info saved for prediction...")

Customer history info saved for prediction...


## Ensemble Data Creating

In [None]:
import random
import numpy as np
import pandas as pd
from dask import dataframe as dd

path = 'data/ensemble_train/'

In [None]:
customers = pd.read_pickle("data/ensemble_train/customers.pkl")
customers.drop_duplicates(subset=["customer_id"], inplace=True)
articles = pd.read_pickle("data/ensemble_train/articles.pkl")
articles.drop_duplicates(subset=["article_id"], inplace=True)
customer_hist = pd.read_pickle("data/ensemble_train/customer_hist.pkl")
customer_hist.drop_duplicates(subset=["customer_id", "article_id"], inplace=True)
last_week = customer_hist.time_passed_last_purchase.max() + 1

In [None]:
data = dd.read_csv('data/transactions_train.csv', dtype={'article_id': str, 'customer_id': str, 'article_id': int})[["customer_id", "article_id", "t_dat"]]
data = data[data.t_dat >= '2020-09-15'].compute()
data["label"] = 1.0
data.drop_duplicates(subset=["customer_id", "article_id"], inplace=True)
data.drop(columns=["t_dat"], inplace=True)
article_id_list = list(articles.article_id.values)
data = data[data.article_id.isin(article_id_list)]

In [None]:
group_info = data.groupby(["customer_id"]).agg({"article_id": lambda x: list(set(x))}).reset_index()
group_info["prod_num"] = group_info.article_id.apply(lambda x: len(x))
customer_id_list = data.customer_id.to_list()
prod_id_list = data.article_id.to_list()
label_list = data.label.to_list()
del data

In [None]:
for i, row in group_info.iterrows():
    temp_articles = random.sample(article_id_list, 550)
    step = 0
    for id in temp_articles:
        if id not in row.article_id:
            customer_id_list.append(row.customer_id)
            label_list.append(0.0)
            prod_id_list.append(id)
            step += 1
        if step >= 500:
            break
    print('\r' + f'{i}: %{round(100*i/group_info.shape[0], 2)}', end='')

data = pd.DataFrame({"customer_id": customer_id_list, "article_id": prod_id_list, "label": label_list, "week": last_week+1})
data = dd.from_pandas(data, npartitions= 32)
articles = dd.from_pandas(articles, npartitions= 4)
customers = dd.from_pandas(customers, npartitions= 4)
customer_hist = dd.from_pandas(customer_hist, npartitions= 4)
del label_list, prod_id_list, customer_id_list, group_info

In [None]:
data = data.merge(articles.rename(columns={"age_bin":"prod_age_bin", "gender_group":"prod_gender_group", \
                                       "rebuy_count":"prod_rebuy_count","price":"prod_avg_price"}), on="article_id", how="inner")
data = data.merge(customers.rename(columns={"age_bin":"customer_age_bin", "gender_group":"customer_gender_group",\
                                        "rebuy_count":"customer_rebuy_count","price":"customer_avg_price", \
                                        "article_id":"article_hist", "week":"week_hist"}), on="customer_id", how="inner")
data = data.merge(customer_hist, on=["customer_id","article_id"], how="left")
data.same_prod_rebuy_count = data.same_prod_rebuy_count.fillna(0)
data.avg_purchase_time = data.avg_purchase_time.fillna(0)
data.time_passed_last_purchase = data.time_passed_last_purchase.fillna(39 - 29) # 6 mounths is nearly 29 week
data.time_passed_last_purchase = data.apply(lambda x: x.week - x.time_passed_last_purchase, meta=("time_passed_last_purchase","int"), axis=1)

del articles, customers, customer_hist

In [None]:
data = data.drop(columns=["week"])
data = data.sort_values(by=["customer_id"]).reset_index(drop=True)
data.to_csv(path + "ensemble_data.csv")

## Prediction Data Creating

In [None]:
import cudf
import gc
import random
import numpy as np
import pandas as pd
from datetime import timedelta
from dask import dataframe as dd

In [None]:
customers = pd.read_pickle("data/ensemble/customers.pkl")
customers.drop_duplicates(subset=["customer_id"], inplace=True)
customers = customers.sort_values(by=["customer_id"])
articles = pd.read_pickle("data/ensemble/articles.pkl")
articles.drop_duplicates(subset=["article_id"], inplace=True)
customer_hist = pd.read_pickle("data/ensemble/customer_hist.pkl")
customer_hist.drop_duplicates(subset=["customer_id", "article_id"], inplace=True)
article_ids = articles.article_id.values.tolist()
customer_ids = customers.customer_id.values
customers = cudf.DataFrame.from_pandas(customers)
customer_hist = cudf.DataFrame.from_pandas(customer_hist)
articles = cudf.DataFrame.from_pandas(articles)
batch_size = 512
article_ids = article_ids * batch_size

In [None]:
%%time
df_list = []
loop_size = len(customer_ids) + batch_size
for batch_i in range(batch_size, loop_size, batch_size):
    customer_ids_batch = customer_ids[0:batch_size]
    customer_ids_batch = np.repeat(customer_ids_batch, len(article_ids)/batch_size)
    df = cudf.DataFrame({"customer_id": customer_ids_batch, "article_id": article_ids, "week": 39})
    df = df.merge(articles.rename(columns={"age_bin":"prod_age_bin", "gender_group":"prod_gender_group", \
                                           "rebuy_count":"prod_rebuy_count","price":"prod_avg_price"}), on="article_id", how="inner")
    df = df.merge(customers.rename(columns={"age_bin":"customer_age_bin", "gender_group":"customer_gender_group",\
                                            "rebuy_count":"customer_rebuy_count","price":"customer_avg_price", \
                                            "article_id":"article_hist", "week":"week_hist"}), on="customer_id", how="inner")
    df = df.merge(customer_hist, on=["customer_id","article_id"], how="left")
    df.same_prod_rebuy_count = df.same_prod_rebuy_count.fillna(0)
    df.avg_purchase_time = df.avg_purchase_time.fillna(0)
    df.time_passed_last_purchase = df.time_passed_last_purchase.fillna(39 - 29) # 6 mounths is nearly 29 week
    df.time_passed_last_purchase = df.apply(lambda x: x.week - x.time_passed_last_purchase)
    df.to_csv(f"data/ensemble/final_data/{batch_i}.csv")
    del df
    gc.collect()
    print('\r' + f'{batch_i}: %{round(100*batch_i/loop_size, 2)}', end='')
print("\n")