In [1]:
import random
import numpy as np
import pandas as pd
from datetime import timedelta
from dask import dataframe as dd
pd.options.mode.chained_assignment = None

train = True

In [2]:
transactions = dd.read_csv('data/transactions_train.csv', dtype={'article_id': str, 'customer_id': str})
if train:
    transactions = transactions[(transactions.t_dat >= '2020-03-17') & (transactions.t_dat <= '2020-09-15')]
    path = 'data/ensemble_train/'
else:
    path = 'data/ensemble/'
    transactions = transactions[transactions.t_dat >= '2020-03-24']
transactions.t_dat = dd.to_datetime(transactions.t_dat) - timedelta(2)
transactions["week"] = transactions.t_dat.dt.isocalendar().week
transactions = transactions.compute()
transactions["rebuy_count"] = transactions.groupby(["customer_id", "article_id"]).cumcount().astype(int)
transactions["rebuy_count"] = transactions.rebuy_count.apply(lambda x: x -1 if x > 0 else 0)

In [3]:
most_solds = transactions[transactions.week > transactions.week.max()-3].groupby(["article_id"]).agg({"article_id":"count"})\
                                   .rename(columns={"article_id":"_count"}).reset_index()\
                                   .sort_values('_count', ascending=False)
most_solds = most_solds.head(10000)
transactions = transactions[transactions.article_id.isin(most_solds.article_id)]
last_week_articles = transactions[transactions.week == transactions.week.max()].article_id.unique()
articles = pd.read_csv("data/articles.csv", dtype={'article_id': str})
articles = articles[(articles.article_id.isin(most_solds.article_id)) & (articles.article_id.isin(last_week_articles))]
del most_solds, last_week_articles

In [4]:
active_customers = transactions.customer_id.unique()
customers = pd.read_csv("data/customers.csv", dtype={'customer_id': str})
customers = customers[customers.customer_id.isin(active_customers)]
del active_customers

In [5]:
def gender_classification(age):
    if age < 19:
        return 0
    elif age < 29:
        return 1
    elif age < 49:
        return 2
    elif age < 59:
        return 3
    elif age < 69:
        return 4
    else:
        return 5

customers["age"] = customers.age.fillna(np.mean(customers.age))
customers["age_bin"] = customers.age.map(gender_classification)

In [6]:
def gender_classification(section_name):
    if "womens" in section_name or "girl" in section_name or "ladies" in section_name:
        return "woman"
    elif "men" in section_name or "boy" in section_name or "boys" in section_name:
        return "man"
    else:
        return "other"

articles.section_name = articles.section_name.map(lambda x: x.lower())
articles["gender_group"] = articles.section_name.apply(gender_classification)

In [7]:
transactions = transactions.merge(articles[["article_id", "gender_group"]], on="article_id", how="inner")
transactions = transactions.merge(customers[["customer_id","age_bin"]], on="customer_id", how="inner")
customer_hist = transactions.groupby(by="customer_id").agg({"article_id": lambda x: list(x.values), "week": lambda x: list(x.values), "gender_group": lambda x : x.mode().iloc[0], "rebuy_count": "mean"}).reset_index()
customers = customers.merge(customer_hist, on="customer_id", how="left")
customers.article_id = customers.article_id.fillna("").apply(list)
customers.gender_group = customers.gender_group.fillna("other")
transactions.price = transactions.price.fillna(transactions.price.mean())
prod_price = transactions.groupby("customer_id").agg({"price":"mean"}).rename(columns={"customer_id":"price_"}).reset_index()
prod_price.columns = list(map(''.join, prod_price.columns.values))
customers = customers.merge(prod_price, on="customer_id", how="inner")
del customer_hist

In [8]:
customers["history"] = customers.apply(lambda x: sorted(zip(x.week, x.article_id)), axis=1)
customers.FN = customers.FN.fillna(0)
customers.Active = customers.Active.fillna(0)
customers.fashion_news_frequency = customers.fashion_news_frequency.fillna("not_regular")
customers.club_member_status = customers.club_member_status.fillna("no-info")
customers.fashion_news_frequency = customers.fashion_news_frequency.apply(lambda x: "not_regular" if x == "NONE" or x == "None" else x)
customers["numberOfArticles"] = customers.apply(lambda x: len(x.history), axis=1)
customers = customers.drop(columns=["postal_code", "week", "article_id"])

In [9]:
def map_fashion_news(name):
    return 1 if name == 'not_regular' else 0
def map_club_member(name):
    return 1 if name == 'ACTIVE' else 0

customers.fashion_news_frequency = customers.fashion_news_frequency.map(map_fashion_news)
customers.club_member_status = customers.club_member_status.map(map_club_member)

In [10]:
def article2doc(x):
    def clean_doc(text):
        unwanted_chars = ['1','2','3','4','5','6','7','8','9','(',')','[',']']
        for chr in unwanted_chars:
            text = text.replace(chr, '')
        return text

    doc =  '. '.join([x.prod_name, x.product_type_name, x.product_group_name, x.graphical_appearance_name, x.colour_group_name,\
                      x.perceived_colour_value_name, x.perceived_colour_master_name, x.department_name, x.index_name, x.index_group_name,\
                      x.section_name, x.garment_group_name, str(x.detail_desc)])[:-1]
    return(clean_doc(doc.lower()))

articles["doc"] = articles.apply(article2doc, axis=1)
article_info = transactions.groupby(["article_id"]).agg({"price":"mean", "rebuy_count":"mean", "age_bin": lambda x : x.mode().iloc[0]}).reset_index()
articles = articles.merge(article_info, on="article_id", how="inner")
most_solds = transactions[transactions.week > transactions.week.max()-3].groupby(["article_id"]).agg({"customer_id":"count"})\
                                      .rename(columns={"customer_id":"prod_sold_count"}).reset_index()
articles = articles.merge(most_solds, on="article_id", how="inner")
articles = articles[["article_id","doc","gender_group", "price", "rebuy_count", "age_bin", "prod_sold_count"]]
weekly_sales = transactions.groupby(["article_id", "week"]).agg({"customer_id":"count"}).reset_index()
last_week_sales = weekly_sales[weekly_sales.week == weekly_sales.week.max()]
weekly_sales = weekly_sales.merge(last_week_sales[["article_id","customer_id"]], on=["article_id"], how="inner")
weekly_sales["quotient"] = weekly_sales.customer_id_y / weekly_sales.customer_id_x
articles = articles.merge(weekly_sales[["article_id","quotient"]], on="article_id", how="inner")

del article_info, most_solds, weekly_sales, last_week_sales

In [11]:
customers["doc"] = customers.history.apply(lambda x: list(set([i[1] for i in x])))
temp_dict = {}
for i,row in articles.iterrows():
    temp_dict[row.article_id] = row.doc

customers["doc"] = customers.doc.apply(lambda x:  ". ".join([temp_dict[i] for i in x]))
del temp_dict

In [12]:
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,age_bin,gender_group,rebuy_count,price,history,numberOfArticles,doc
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,1,1,49.0,3,woman,0.0,0.03261,"[(12, 0795440001), (36, 0568601043)]",2,mariette blazer. blazer. garment upper body. o...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,1,1,25.0,1,woman,0.2,0.024277,"[(17, 0351484002), (17, 0559630026), (17, 0599...",15,timeless midrise brief. swimwear bottom. swimw...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,1,1,24.0,1,man,0.0,0.037271,"[(16, 0852643001), (16, 0852643003), (37, 0794...",3,pez sweater. sweater. garment upper body. soli...
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,1,0,52.0,3,woman,0.0,0.03422,"[(23, 0818320001), (33, 0730683050), (33, 0791...",5,amelie. t-shirt. garment upper body. solid. bl...
4,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0.0,0.0,1,1,20.0,1,other,0.0,0.038119,"[(37, 0448509014), (37, 0719530003)]",2,maja hw woventrs. trousers. garment lower bod...


In [13]:
articles.head()

Unnamed: 0,article_id,doc,gender_group,price,rebuy_count,age_bin,prod_sold_count,quotient
0,108775044,strap top. vest top. garment upper body. solid...,woman,0.007931,0.096774,1,15,3.0
1,108775044,strap top. vest top. garment upper body. solid...,woman,0.007931,0.096774,1,15,0.75
2,108775044,strap top. vest top. garment upper body. solid...,woman,0.007931,0.096774,1,15,1.5
3,108775044,strap top. vest top. garment upper body. solid...,woman,0.007931,0.096774,1,15,0.333333
4,108775044,strap top. vest top. garment upper body. solid...,woman,0.007931,0.096774,1,15,0.142857


## Content Based Corpus Data Creating

In [14]:
doc_df1 = articles[["article_id","doc"]].rename(columns={"article_id":"doc_id"}).copy()
articles.drop(columns=["doc"], inplace=True)
doc_df2 = customers[["customer_id","doc"]].rename(columns={"customer_id":"doc_id"}).copy()
customers.drop(columns=["doc"], inplace=True)
doc_df1["type"] = "product"
doc_df2["type"] = "customer"
doc_df = doc_df1.append(doc_df2)
doc_df.to_csv(path+"corpus.csv", index=False)

del doc_df, doc_df1, doc_df2

## Collabrative Data Creating

In [15]:
transactions[["customer_id","article_id","week"]].to_pickle("data/transactions.pkl")
last_week = transactions.week.max()

In [16]:
customers.FN = customers.FN.astype('category').cat.codes
customers.Active = customers.Active.astype('category').cat.codes
customers.club_member_status = customers.club_member_status.astype('category').cat.codes
customers.fashion_news_frequency = customers.fashion_news_frequency.astype('category').cat.codes
customers.age_bin = customers.age_bin.astype('category').cat.codes
customers.gender_group = customers.gender_group.astype('category').cat.codes
customers.age = (customers.age - customers.age.min()) / (customers.age.max() - customers.age.min())
customers.numberOfArticles = (customers.numberOfArticles - customers.numberOfArticles.min()) / (customers.numberOfArticles.max() - customers.numberOfArticles.min())
if train != True:
    customers.to_pickle("data/customers.pkl")

In [17]:
articles.gender_group = articles.gender_group.astype('category').cat.codes
articles.age_bin = articles.age_bin.astype('category').cat.codes
articles.price = (articles.price - articles.price.min()) / (articles.price.max() - articles.price.min())
articles.rebuy_count = (articles.rebuy_count - articles.rebuy_count.min()) / (articles.rebuy_count.max() - articles.rebuy_count.min())
articles.prod_sold_count = (articles.prod_sold_count - articles.prod_sold_count.min()) / (articles.prod_sold_count.max() - articles.prod_sold_count.min())
articles.quotient = (articles.quotient - articles.quotient.min()) / (articles.quotient.max() - articles.quotient.min())
if train != True:
    articles.to_pickle("data/articles.pkl")

## Ensemble Data Creating

In [18]:
data = dd.read_csv('data/transactions_train.csv', dtype={'article_id': str, 'customer_id': str})[["customer_id", "article_id", "t_dat"]]
data = data[data.t_dat >= '2020-09-15'].compute()
data["label"] = 1.0
data = data.drop_duplicates(subset=["customer_id", "article_id"])
data.drop(columns=["t_dat"], inplace=True)
article_id_list = list(articles.article_id.values)
data = data[data.article_id.isin(article_id_list)]

In [19]:
group_info = data.groupby(["customer_id"]).agg({"article_id": lambda x: list(set(x))}).reset_index()
group_info["prod_num"] = group_info.article_id.apply(lambda x: len(x))
customer_id_list = data.customer_id.to_list()
prod_id_list = data.article_id.to_list()
label_list = data.label.to_list()
del data

In [20]:
for i, row in group_info.iterrows():
    temp_articles = random.sample(article_id_list, 300)
    step = 0
    for id in temp_articles:
        if id not in row.article_id:
            customer_id_list.append(row.customer_id)
            label_list.append(0.0)
            prod_id_list.append(id)
            step += 1
        if step >= 250:
            break
    print('\r' + f'{i}: %{round(100*i/group_info.shape[0], 2)}', end='')

data = pd.DataFrame({"customer_id": customer_id_list, "article_id": prod_id_list, "label": label_list, "week": last_week+1})

22255: %31.12

In [None]:
data = dd.from_pandas(data, npartitions=4)
data = data.merge(articles.rename(columns={"age_bin":"prod_age_bin", "gender_group":"prod_gender_group", "rebuy_count":"prod_rebuy_count","price":"prod_avg_price"}), on="article_id", how="inner")
data = data.merge(customers.rename(columns={"age_bin":"customer_age_bin", "gender_group":"customer_gender_group", "rebuy_count":"customer_rebuy_count","price":"customer_avg_price"}), on="customer_id", how="inner")
data["same_prod_rebuy_count"] = data.apply(lambda x: [i[1] for i in x.history].count(x.article_id), axis=1, meta=("same_prod_rebuy_count","int"))
del articles, customers

In [None]:
def get_last_purchased_time(x):
    week_passed = 7*4
    if x.same_prod_rebuy_count != 0:
        for i in x.history:
            if i[1] == x.article_id:
                week_passed = i[0]
        return x.week - week_passed
    else:
        return week_passed

def get_avg_purchase_time(x):
    if x.same_prod_rebuy_count != 0:
        week = [x.week]
        for i in x.history:
            if i[1] == x.article_id:
                week.append(i[0])
        return (max(week) - min(week)) / (len(week) - 1)
    return 0

data["time_passed_last_purchase"] = data.apply(lambda x: get_last_purchased_time(x), axis=1,  meta=("time_passed_last_purchase","int"))
data["avg_purchase_time"] = data.apply(lambda x: get_avg_purchase_time(x), axis=1,  meta=("time_passed_last_purchase","float"))
data.drop(columns=["history", "week"], inplace=True)
data.same_prod_rebuy_count = (data.same_prod_rebuy_count - data.same_prod_rebuy_count.min()) / (data.same_prod_rebuy_count.max() - data.same_prod_rebuy_count.min())

In [None]:
data.to_csv(path + "ensemble_data.csv")