In [98]:
import json
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from dask import dataframe as dd

In [99]:
# Lazily open the raw customer and transaction tables with dask.
customers = dd.read_csv('data/customers.csv')
transactions = dd.read_csv('data/transactions_train.csv')

# Tag every transaction with the total number of purchases made by its customer.
customer_purchase_number = transactions.groupby("customer_id").size().to_frame("prod_number").reset_index()
transactions = transactions.merge(customer_purchase_number, on="customer_id", how="inner")

# Row-level 90/10 split; random_state pins the assignment of rows to each side.
train, test = transactions.random_split([0.9, 0.1], random_state=43)

# Mark every held-out row with whether its customer also occurs in `train`.
# The train ids are de-duplicated first so that each test row is matched at
# most once: joining against the raw, repeated ids multiplies every test row
# by the customer's number of train rows. A left join is sufficient because
# only rows that originate from `test` are read below.
df = test.merge(
    train[["customer_id"]].drop_duplicates(),
    on=["customer_id"],
    how="left",
    indicator=True,
)

# Held-out rows whose customer has no history in `train` (or who bought a
# single product overall) cannot be evaluated, so they are appended to `train`.
# NOTE(review): only customer_id and article_id are carried over, so every
# other train column is missing for these appended rows - confirm intended.
train = dd.concat(
    [train, df[(df._merge == 'left_only') | (df.prod_number == 1)][["customer_id", "article_id"]]],
    axis=0,
    ignore_index=True,
    interleave_partitions=True,
    ignore_order=True,
)

# Keep in `test` only the customers found in both splits with more than one purchase.
df = df[(df._merge == 'both') & (df.prod_number > 1)][["customer_id"]].drop_duplicates()
test = test.merge(df, how="inner", on="customer_id")
print("Len of data: ", len(transactions))

# Drop the intermediates so their task graphs are not kept alive by the kernel.
del customer_purchase_number, transactions, df

In [100]:
# Impute missing customer attributes: flags default to 0, age to the mean age.
customers.FN = customers.FN.fillna(0)
customers.Active = customers.Active.fillna(0)
customers.age = customers.age.fillna(customers.age.mean())

# Collapse missing values and the "NONE"/"None" spellings into a single
# "not_regular" label (vectorised instead of a per-row Python callback).
customers.fashion_news_frequency = (
    customers.fashion_news_frequency
    .fillna("not_regular")
    .replace(["NONE", "None"], "not_regular")
)

# Number of train transactions per customer.
prod_count = train.groupby("customer_id").size().to_frame("number_of_product").reset_index()
customers = customers.merge(prod_count, on="customer_id", how="inner")

# Mean and standard deviation of the price paid per customer. The two-level
# column labels are flattened by concatenation, giving
# "customer_id", "pricemean" and "pricestd".
# NOTE(review): customers with a single priced train row presumably get a NaN
# std - confirm this is handled before the features reach the model.
prod_price = train.groupby("customer_id").agg({"price": ["mean", "std"]}).reset_index()
prod_price.columns = ["".join(levels) for levels in prod_price.columns.values]
customers = customers.merge(prod_price, on="customer_id", how="inner")

# Materialise the feature table as a pandas DataFrame, without the postal code.
customers = customers.drop(columns="postal_code").compute()
del train

In [101]:
def map_fashion_news(name):
    """Encode a fashion-news frequency label as a binary flag.

    Returns 1 when ``name`` equals ``'not_regular'`` and 0 for any other value.
    """
    if name == 'not_regular':
        return 1
    return 0
def map_club_member(name):
    """Encode a club-member status label as a binary flag.

    Returns 1 when ``name`` equals ``'ACTIVE'`` and 0 for any other value.
    """
    if name == 'ACTIVE':
        return 1
    return 0

# Replace the two label columns with their 0/1 encodings.
# Re-running this cell on already-encoded columns maps every value to 0.
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].map(map_fashion_news)
customers["club_member_status"] = customers["club_member_status"].map(map_club_member)

In [102]:
def prepare_data(transactions, max_len=12, seed=None):
    """Build one randomly sampled purchase list per customer.

    Parameters
    ----------
    transactions : dask DataFrame
        Lazy frame exposing ``customer_id`` and ``article_id`` columns; it is
        grouped lazily and then materialised with ``.compute()``.
    max_len : int, default 12
        Maximum number of article ids kept per customer. Customers with fewer
        purchases keep all of them (in shuffled order).
    seed : int or None, default None
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible. When ``None`` the module-level ``random``
        state is used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``prod_ids`` (a list of sampled article
        ids), one row per customer.
    """
    # Collect every article id of a customer into a single Python list.
    histories = (
        transactions.groupby(["customer_id"])["article_id"]
        .apply(list, meta=("article_ids", object))
        .reset_index()
        .compute()
        .drop_duplicates(subset=["customer_id"])
    )

    # Sample without replacement, capped at max_len items per customer.
    sampler = random if seed is None else random.Random(seed)
    histories["prod_ids"] = histories["article_ids"].apply(
        lambda ids: sampler.sample(ids, min(len(ids), max_len))
    )
    return histories[["customer_id", "prod_ids"]]

# Sample the held-out purchase lists, attach the customer features, then
# persist the joined table and report its shape.
data = prepare_data(test).merge(customers, how="inner", on="customer_id")
data.to_pickle("data/ensemble_data.pkl")
print(data.shape)

(806879, 10)


In [103]:
class Loss:
    """Negative-log loss built on a position-weighted top-k hit score.

    Parameters
    ----------
    lr : float
        Learning rate currently in effect; updated by ``lr_schedular``.
    prod_size : int, default 12
        Number of leading items of ``actual`` taken into account.

    NOTE(review): the score is computed with Python list operations and
    returned as a plain float, so no gradient appears to flow back to the
    model outputs - confirm how this loss is meant to drive training.
    """

    def __init__(self, lr, prod_size = 12):
        self.lr = lr
        self.prod_size = prod_size
        # May be attached by the caller after construction; lr_schedular
        # forwards the new rate to it when it is set.
        self.optimizer = None

    def top_k_score(self, predics, actual):
        """Score ranked predictions against the first ``prod_size`` actual items.

        Every prediction found in ``actual`` contributes
        ``occurrences / rank`` (rank starting at 1). The sum is divided by the
        number of distinct actual items and clamped to at most 1.

        Returns
        -------
        float
            A value in ``(0, 1]``; ``0.00001`` when nothing matches so the
            logarithm taken in ``__call__`` stays finite.
        """
        # list() lets any iterable of ids be sliced and counted like a list.
        actual = list(actual)[:self.prod_size]
        scores = []
        for i, pred in enumerate(predics):
            hit = actual.count(pred)
            if hit > 0:
                scores.append(hit / (i + 1.0))

        if not scores:
            return 0.00001

        score = float(np.sum(scores)) / len(set(actual))
        # Repeated predictions can push the ratio above 1; cap it there.
        return min(score, 1.0)

    def lr_schedular(self, epoch):
        """Drop the learning rate to 0.001 from epoch 5 onwards.

        The (misspelled) method name is kept for compatibility with callers.
        ``self.lr`` is always updated; the attached optimizer, if any,
        receives the same value.
        """
        if epoch >= 5:
            self.lr = 0.001
            if self.optimizer is not None:
                self.optimizer.lr = 0.001

    def __call__(self, predict, actual):
        """Return ``-log(score)``: 0 for a perfect score, larger for worse ones."""
        return -tf.math.log(self.top_k_score(predict, actual))

In [104]:
# Split the joined table into ids, targets and the model input pipeline.
# Note that `customers` is rebound here from the feature table to the id column.
customers = data["customer_id"]
train_y = data["prod_ids"]
# Every column other than the id and the target list is used as a feature;
# the rows are served in batches of 32.
train_x = tf.data.Dataset.from_tensor_slices(
    data.drop(columns=["customer_id", "prod_ids"]).values
).batch(32)

2022-04-12 16:43:27.327005: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [109]:
class Ensemble(tf.keras.Model):
    """Small fully connected network producing three sigmoid outputs.

    Parameters
    ----------
    input_dim : int
        Number of input features fed to the first dense layer.
    dropout_rate : float, default 0.1
        Rate used by both dropout layers.
    """

    def __init__(self, input_dim, dropout_rate = 0.1):
        super().__init__()
        layers = tf.keras.layers
        # Two 50-unit tanh layers, a 10-unit tanh layer, then a 3-unit sigmoid
        # head, with dropout and batch normalisation in between.
        self.model = tf.keras.Sequential([
            layers.Dense(units = 50, activation= 'tanh', input_dim = input_dim),
            layers.Dropout(dropout_rate),
            layers.Dense(units = 50, activation= 'tanh'),
            layers.BatchNormalization(),
            layers.Dropout(dropout_rate),
            layers.Dense(units = 10, activation= 'tanh'),
            layers.BatchNormalization(),
            layers.Dense(units = 3, activation= 'sigmoid'),
        ])

    def call(self, inputs):
        """Run ``inputs`` through the wrapped sequential stack."""
        return self.model(inputs)