In [28]:
import re
from datetime import timedelta, datetime

import numpy as np
import pandas as pd
from dask import dataframe as dd
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole session.
# NOTE(review): this hides the warning globally instead of fixing the assignments
# that trigger it -- confirm it is still needed.
pd.options.mode.chained_assignment = None

## Reading and preparing the data

In [29]:
# Transactions are opened lazily with dask; article_id is forced to text.
transactions = dd.read_csv(
    'data/transactions_train.csv',
    dtype={'article_id': str},
)

# Articles: opened lazily, then narrowed to the id and the section name.
articles = dd.read_csv(
    'data/articles.csv',
    dtype={'article_id': str},
)
articles = articles[["article_id", "section_name"]]

# Customers: opened lazily, then narrowed to the id and the age.
customers = dd.read_csv(
    'data/customers.csv',
    dtype={'customer_id': str},
)
customers = customers[["customer_id", "age"]]

# The sample submission is read eagerly with plain pandas.
submission = pd.read_csv("data/sample_submission.csv")

In [30]:
# Keep only transactions dated 2019-09-24 or later, then reduce them to one row
# per (customer, article) pair.
#
# The date filter runs first on purpose: de-duplicating the full history before
# filtering keeps the earliest purchase of a pair, and if that purchase predates
# the cut-off the pair vanishes even though it was bought again inside the window.
# A single de-duplication after the filter is enough; repeating it is a no-op.
#
# t_dat is still text at this point, so the comparison is lexicographic.
# NOTE(review): this assumes ISO "YYYY-MM-DD" strings -- confirm against the CSV.
transactions = transactions[transactions["t_dat"] >= '2019-09-24']
transactions = transactions.drop_duplicates(["customer_id", "article_id"])

In [31]:
def gender_classification(age):
    """Map an age to an ordinal age-bracket index from 0 to 5.

    Despite its name, this function bins ages; it does not look at gender.
    A function with the same name is defined again further down the notebook
    for section names, and shadows this one once that cell has run.

    Brackets (upper bound exclusive):
        0: below 19, 1: 19-28, 2: 29-48, 3: 49-58, 4: 59-68, 5: 69 and above.

    Args:
        age: Numeric age. A NaN fails every ``<`` comparison and therefore
            ends up in bracket 5.

    Returns:
        int: The bracket index.
    """
    upper_bounds = (19, 29, 49, 59, 69)
    for bracket, bound in enumerate(upper_bounds):
        if age < bound:
            return bracket
    # Nothing matched: the age is at or above the last bound (or is NaN).
    return len(upper_bounds)

# Fill missing ages with the mean age, then replace `age` with its bracket index.
#
# The bracket must be computed from the *imputed* values: a NaN fails every `<`
# test in the binning function and would otherwise always land in bracket 5,
# which makes the mean imputation pointless.
#
# "age-bins" is kept because the original cell created it; note that, in spite of
# its name, it holds the imputed raw ages, not the bracket indices.
customers["age-bins"] = customers.age.fillna(np.mean(customers.age))
customers.age = customers["age-bins"].map(gender_classification)

In [32]:
# Keyword patterns for lower-cased section names.
# "women" and "men" must begin a word: a bare substring test for "men" also fires
# on "women" (without the trailing "s") and on words such as "complements".
_WOMAN_PATTERN = re.compile(r"\bwomen|girl|ladies")
_MAN_PATTERN = re.compile(r"\bmen|boy")


def gender_classification(section_name):
    """Classify a lower-cased section name as "woman", "man" or "other".

    The woman keywords are checked first, so a name that mentions both groups
    is reported as "woman". Matching is case-sensitive; callers are expected to
    lower-case the name beforehand.

    NOTE(review): this reuses the name of the age-binning function defined
    earlier in the notebook and shadows it; re-running the age cell after this
    one would apply this function to ages.

    Args:
        section_name: Lower-cased section name string.

    Returns:
        str: "woman", "man" or "other".
    """
    if _WOMAN_PATTERN.search(section_name):
        return "woman"
    if _MAN_PATTERN.search(section_name):
        return "man"
    return "other"

# Lower-case every section name so the keyword tests in gender_classification
# see a single casing.
articles["section_name"] = articles["section_name"].map(str.lower)

# Derive the gender group per article; `meta` tells dask the output is an
# object-dtype series.
articles["gender_group"] = articles["section_name"].apply(
    gender_classification,
    meta=("gender", "O"),
)

In [33]:
# Takes roughly two minutes to run.
# Attach the article's gender group and the customer's age bracket to every
# transaction, then materialise the lazy dask frame.
transactions = (
    transactions
    .merge(articles[["article_id", "gender_group"]], how="inner", on="article_id")
    .merge(customers[["customer_id", "age"]], how="inner", on="customer_id")
    .compute()
)

# Materialise the customers table as well.
customers = customers.compute()

# The articles frame is not used after this point.
del articles

In [34]:
# Takes roughly two minutes to run.
# Per customer: the list of purchased article ids and the most frequent gender
# group among the purchases (the first value returned by mode()).
history_aggregations = {
    "article_id": lambda ids: list(ids.values),
    "gender_group": lambda groups: groups.mode().iloc[0],
}
customer_hist = (
    transactions
    .groupby(by="customer_id")
    .agg(history_aggregations)
    .reset_index()
)

# Left merge keeps customers that have no transactions in the window.
customers = customers.merge(customer_hist, how="left", on="customer_id")

# Customers without history: "" becomes an empty list, and the gender group
# defaults to "other".
customers["article_id"] = customers["article_id"].fillna("").apply(list)
customers["gender_group"] = customers["gender_group"].fillna("other")

del customer_hist

In [35]:
# Swap the per-transaction age / gender_group columns for the per-customer
# values held in `customers`, and give every transaction a unit score.
transactions = (
    transactions
    .drop(columns=["age", "gender_group"])
    .merge(
        customers[["customer_id", "age", "gender_group"]],
        how="inner",
        on="customer_id",
    )
    .assign(score=1.0)
)
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,age,gender_group,score
0,2019-09-24,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,692155003,0.084729,2,5,woman,1.0
1,2019-10-25,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,731702001,0.012305,2,5,woman,1.0
2,2019-09-28,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,788225001,0.040661,2,5,woman,1.0
3,2019-10-25,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,816503001,0.04622,2,5,woman,1.0
4,2019-10-25,0005f3aab821000881d74b72fde2d9b3e4742cf8613668...,701057007,0.005847,2,5,woman,1.0


In [36]:
# Parse the dates and shift them back two days (t_dat stays shifted afterwards,
# exactly as before).
# NOTE(review): this cell is not idempotent -- re-running it shifts t_dat again.
transactions.t_dat = pd.to_datetime(transactions.t_dat) - timedelta(2)

# Number the weeks backwards from the most recent date: the latest 7-day window
# is week 53, the one before it 52, and so on.
#
# The previous formula, (iso_week + 15) % 54, ignored the ISO year. ISO week 38
# maps to 53, and the 2019-09-24 cut-off (shifted to 2019-09-22, ISO week 38 of
# 2019) therefore fell into the same bucket as the week 38 that the later cells
# treat as "last week".
#
# NOTE(review): these windows coincide with the old Monday-Sunday ISO weeks only
# if the latest shifted date is a Sunday -- confirm against the data.
days_before_latest = (transactions.t_dat.max() - transactions.t_dat).dt.days
transactions["week"] = 53 - days_before_latest // 7

In [75]:
# Summed score per (article, age bracket, week).
weekly_score = (
    transactions
    .groupby(["article_id", "age", "week"])
    .agg({"score": "sum"})
    .reset_index()
)

# Week-53 score per article, summed over age brackets, so there is exactly one
# row per article.
#
# Previously this frame had one row per (article, age) and was merged on
# article_id alone. That many-to-many join repeated every weekly row once per
# age bracket that bought the article in week 53. Summing first gives the same
# per-(article, age) quotient totals after aggregation, without duplicated rows.
# rename() is used without inplace so no slice of weekly_score is mutated.
lw_score = (
    weekly_score[weekly_score.week == 53]
    .groupby("article_id", as_index=False)["score"]
    .sum()
    .rename(columns={"score": "last_score"})
)

# The inner merge drops articles with no week-53 sales; validate guards the
# one-row-per-article assumption.
weekly_score = weekly_score.merge(
    lw_score, on="article_id", how="inner", validate="many_to_one"
)
weekly_score["quotient"] = weekly_score.last_score / weekly_score.score

In [80]:
# Total quotient per (article, age bracket), ordered by age bracket and then by
# quotient, both descending.
age_score = (
    weekly_score
    .groupby(['article_id', "age"])['quotient']
    .sum()
    .reset_index()
    .sort_values(['age', 'quotient'], ascending=False)
)

## Model

In [None]:
# Summed score per (article, age bracket) over weeks 51-53, highest first.
most_solds_last3 = (
    transactions[transactions.week.isin([53, 52, 51])]
    .groupby(["article_id", "age"])
    .agg({"score": "sum"})
    .rename(columns={"score": "demographic_score"})
    .reset_index()
    .sort_values('demographic_score', ascending=False)
)

# Preview the frame built above. The cell used to display `most_solds`, which is
# not defined until a later cell and fails on a fresh kernel.
most_solds_last3.head()

## Sample Submission

In [None]:
# Build {age bracket (as string) -> array of up to 50 candidate article ids}.
#
# Candidates are taken from age_score, which has one row per (article, age) and
# is sorted by age and quotient in descending order, so head(50) is a real
# top-50. The previous source, weekly_score, is unsorted and has several rows
# per article, so head(50) returned arbitrary and repeated article ids.
# NOTE(review): age_score is the only per-age sorted ranking in this notebook;
# most_solds_last3 is the other plausible source -- confirm the intended one.
most_sold_dict = {}
most_solds = age_score
age_group = most_solds.age.unique()

for i in age_group:
    most_sold_50 = most_solds[most_solds.age == i].article_id.head(50).values
    # Keys are strings because model_predict looks them up via f"{row.age}".
    most_sold_dict[f"{i}"] = most_sold_50

In [None]:
def model_predict(row):
    """Return the candidate articles for a customer, minus those already bought.

    Args:
        row: A customer record exposing ``age`` (used, formatted as a string,
            as the key into the module-level ``most_sold_dict``) and
            ``article_id`` (the customer's purchased article ids).

    Returns:
        list: Candidates for the customer's age key, in their stored order,
        excluding every id found in ``row.article_id``.

    Raises:
        KeyError: If ``most_sold_dict`` has no entry for the customer's age key.
    """
    candidates = most_sold_dict[f"{row.age}"]
    already_bought = row.article_id
    return [
        article
        for article in candidates
        if article not in already_bought
    ]

# Compute each customer's candidate list row by row, then flatten it into the
# space-separated string that is written to the submission file.
predicted_lists = customers.apply(model_predict, axis=1)
customers["prediction"] = predicted_lists
customers["prediction"] = customers["prediction"].apply(" ".join)

In [None]:
# Write only the id and prediction columns, without the index.
customers.to_csv(
    "submission.csv",
    columns=["customer_id", "prediction"],
    index=False,
)