In [None]:
import random
import tensorflow as tf
from dask import dataframe as dd

In [None]:
# --- Load the raw tables lazily with dask ---
customers = dd.read_csv('data/customers.csv')
transactions = dd.read_csv('data/transactions_train.csv')

# Attach each customer's total number of transactions (prod_number) to
# every one of their rows.
customer_purchase_number = (
    transactions.groupby("customer_id")
    .size()
    .to_frame("prod_number")
    .reset_index()
)
transactions = transactions.merge(customer_purchase_number, on="customer_id", how="inner")

# --- Row-level 90/10 split, then repair it so that every customer left in
# --- test also has at least one row in train.
train, test = transactions.random_split([0.9, 0.1], random_state=43)

# Tag every test row with whether its customer also occurs in train.
# The train ids are de-duplicated first: merging against the raw id column
# is a many-to-many join that repeats each test row once per matching train
# row. A left merge is sufficient because the right_only rows of an outer
# merge have no prod_number and therefore never pass either filter below.
train_customer_ids = train[["customer_id"]].drop_duplicates()
df = test.merge(train_customer_ids, on=["customer_id"], how="left", indicator=True)

# Test rows whose customer has no train rows (or only a single purchase
# overall) are moved into train. The full train schema is kept so that the
# moved rows retain `price` and the other columns; selecting only
# customer_id/article_id would leave NaN prices for these customers in the
# per-customer price aggregates computed from train.
moved_rows = df[(df._merge == 'left_only') | (df.prod_number == 1)][list(train.columns)]
train = dd.concat(
    [train, moved_rows],
    axis=0,
    ignore_index=True,
    interleave_partitions=True,
    ignore_order=True,
)

# Keep in test only the customers that are present in both splits and have
# more than one purchase overall.
df = df[(df._merge == 'both') & (df.prod_number > 1)][["customer_id"]].drop_duplicates()
test = test.merge(df, how="inner", on="customer_id")

# NOTE: len() on a dask frame triggers a computation of the merged table.
print("Len of data: ", len(transactions))
del transactions, customer_purchase_number

In [None]:
# --- Fill missing customer attributes ---
customers["FN"] = customers["FN"].fillna(0)
customers["Active"] = customers["Active"].fillna(0)
customers["age"] = customers["age"].fillna(customers["age"].mean())

# Missing values and the "NONE"/"None" spellings all collapse to
# "not_regular". isin/mask is used instead of a per-element Python lambda.
news_frequency = customers["fashion_news_frequency"].fillna("not_regular")
customers["fashion_news_frequency"] = news_frequency.mask(
    news_frequency.isin(["NONE", "None"]), "not_regular"
)

# --- Per-customer aggregates derived from the training transactions ---
# Number of training rows per customer.
prod_count = (
    train.groupby("customer_id")
    .size()
    .to_frame("number_of_product")
    .reset_index()
)
customers = customers.merge(prod_count, on="customer_id", how="inner")

# Mean and standard deviation of the purchase price per customer.
# NOTE(review): flattening the two-level column index by plain concatenation
# produces the names "pricemean" and "pricestd". The earlier
# rename(columns={"customer_id": "price_"}) matched no column label and was
# dropped as dead code; it looks like "price_mean"/"price_std" were intended,
# but the existing names are kept so code that reads them keeps working.
prod_price = train.groupby("customer_id").agg({"price": ["mean", "std"]}).reset_index()
prod_price.columns = list(map(''.join, prod_price.columns.values))
customers = customers.merge(prod_price, on="customer_id", how="inner")

# Drop the postal code and materialise the result with compute().
customers = customers.drop(columns="postal_code").compute()

In [None]:
def map_fashion_news(name):
    """Return 1 when ``name`` equals 'not_regular', otherwise 0."""
    if name == 'not_regular':
        return 1
    return 0
def map_club_member(name):
    """Return 1 when ``name`` equals 'ACTIVE', otherwise 0."""
    is_active = bool(name == 'ACTIVE')
    return int(is_active)

# Replace the two string columns with 0/1 flags, one value at a time.
# NOTE: this overwrites the columns in place, so running the cell a second
# time would map the already-encoded 0/1 values to 0.
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].map(map_fashion_news)
customers["club_member_status"] = customers["club_member_status"].map(map_club_member)

In [None]:
def prepare_data(transactions, max_hist_len=128, seed=None):
    """Collapse transactions into one row per customer with a sampled history.

    Parameters
    ----------
    transactions : dask-style DataFrame
        Must provide ``customer_id`` and ``article_id`` columns and support
        ``groupby(...)[col].apply(func, meta=...).reset_index().compute()``.
    max_hist_len : int, default 128
        Upper bound on the number of article ids kept per customer. Shorter
        histories are kept whole (in shuffled order); longer ones are
        randomly down-sampled to exactly this many ids.
    seed : optional
        When given, a dedicated ``random.Random(seed)`` is used so the
        sampling is reproducible. When ``None`` the module-level ``random``
        state is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``prod_ids`` (a list of article ids).
    """
    sampler = random if seed is None else random.Random(seed)

    # One list of article ids per customer, materialised as a pandas frame.
    histories = (
        transactions.groupby(["customer_id"])["article_id"]
        .apply(list, meta=("article_ids", object))
        .reset_index()
        .compute()
        .drop_duplicates(subset=["customer_id"])
    )

    # Cap the history length at max_hist_len. The previous code cut
    # histories longer than 128 down to 12, which made them shorter than a
    # history of exactly 128 items; the cap is now monotonic.
    sampled = histories["article_ids"].apply(
        lambda ids: sampler.sample(ids, min(len(ids), max_hist_len))
    )
    return histories.assign(prod_ids=sampled)[["customer_id", "prod_ids"]]

# Turn both splits into per-customer history frames.
train = prepare_data(transactions=train)
test = prepare_data(transactions=test)

# Persist the results.
# NOTE(review): these files are pickles even though the paths end in ".csv";
# they have to be read back with a pickle reader, not a CSV reader.
train.to_pickle(path='data/train.csv')
test.to_pickle(path='data/test.csv')

print(*(split.shape for split in (train, test)))

In [None]:
# Preview the first rows of the per-customer training histories
# (the frame returned by prepare_data: customer_id and prod_ids).
train.head()

In [None]:
# Preview the first rows of the customer feature table.
customers.head()