In [1]:
import pickle
import numpy as np
import pandas as pd
from dask import dataframe as dd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
transactions = dd.read_csv('data/transactions_train.csv')[["customer_id", "article_id"]]
customer_purchase_number = transactions.groupby("customer_id").size().to_frame("prod_number").reset_index()
transactions = transactions.merge(customer_purchase_number, on="customer_id", how="inner")

c_ids = transactions.customer_id.unique()
number_of_customer = len(c_ids)
customer_encoding = {c_id: i for i, c_id in enumerate(c_ids)}
p_ids = transactions.article_id.unique()
number_of_products = len(p_ids)
product_encoding = {p_id: i for i, p_id in enumerate(p_ids)}
transactions.customer_id = transactions.customer_id.map(customer_encoding)
transactions.article_id = transactions.article_id.map(product_encoding)
p_ids = list(product_encoding.values())
del c_ids, product_encoding, customer_encoding, customer_purchase_number

In [3]:
train, test = transactions.random_split([0.9, 0.1], random_state=43)
df = test.merge(train[["customer_id"]], on=["customer_id"], how="outer", indicator=True)
train = dd.concat([train, df[(df._merge == 'left_only') |  (df.prod_number == 1)][["customer_id", "article_id"]]], axis=0, ignore_index=True, interleave_partitions=True, ignore_order=True)
df = df[(df._merge == 'both') &( df.prod_number > 1)][["customer_id"]].drop_duplicates()
test = test.merge(df, how="inner", on="customer_id")
train = train.drop(["prod_number"], axis=1)
test = test.drop(["prod_number"], axis=1)
cols = ["customer_id", "article_id"]
train[cols] = train[cols].applymap(np.int32)
test[cols] = test[cols].applymap(np.int32)
train = train.compute()
test = test.compute()
del transactions, df

In [4]:
data = train.groupby(["customer_id"]).apply(lambda x: ' '.join(map(lambda i: str(i), x.article_id.values)))
del train, test

In [5]:
vectorizer = TfidfVectorizer(use_idf=False)
score_matrix = vectorizer.fit_transform(data.values)
del data

In [11]:
model = NMF( n_components=128,
             init='random',
             random_state=43,
             max_iter=500
            )
W = model.fit_transform(score_matrix)
H = model.components_

In [16]:
with open('model_data/nmf_model_W.pkl', 'wb') as f:
    pickle.dump(W,f)

with open('model_data/nmf_model_H.pkl', 'wb') as f:
    pickle.dump(H,f)

In [17]:
H.shape

(128, 104071)