In [1]:
import h5py
import json
import dask
import warnings
import numpy as np
import pandas as pd
import multiprocessing
import dask.array as da
import tensorflow as tf
from dask import dataframe as dd
from scipy import sparse
from nltk.tokenize import word_tokenize
from dask_ml.wrappers import Incremental
from dask_ml.decomposition import TruncatedSVD
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

# Silence RuntimeWarning messages for the rest of the notebook session.
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Mode switch read by the path-configuration cell: True selects the
# "ensemble_train" directories, False selects the "ensemble" directories.
train = True

In [2]:
# Resolve the three mode-dependent locations in one place:
#   path         - directory the input data is read from
#   model_path   - file name of the Doc2Vec weights (stored under "weights/")
#   results_path - directory the id dictionaries and similarity matrices go to
path, model_path, results_path = (
    ('data/ensemble_train/', "d2v_train.model", "model_data/content/ensemble_train/")
    if train else
    ('data/ensemble/', "d2v.model", "model_data/content/ensemble/")
)

In [3]:
# Persist, for each document type, a JSON mapping of doc_id -> row index in
# corpus.csv. Later cells use these mappings to pick rows out of the
# document-vector matrices.
corpus = pd.read_csv(path+"corpus.csv")
for doc_type, json_name in (("product", 'prod_id_dict.json'),
                            ("customer", 'customer_id_dict.json')):
    subset = corpus[corpus.type == doc_type]
    id_dict = dict(zip(subset.doc_id.tolist(), subset.index.tolist()))
    with open(results_path+json_name, 'w') as fp:
        json.dump(id_dict, fp)
# Free the frame and the temporaries; the corpus is re-read by later cells.
del corpus, subset, id_dict, doc_type, json_name

## Content Based Doc2Vec

In [4]:
def _to_tagged_document(position, text):
    """Lower-case and tokenize `text`, tagging it with its corpus position as a string."""
    return TaggedDocument(words=word_tokenize(text.lower()), tags=[str(position)])


# Read the "doc" column, strip every '.' character, and wrap each document
# in a TaggedDocument whose single tag is its row position in the corpus.
corpus = [text.replace('.', '') for text in pd.read_csv(path+"corpus.csv", index_col=False)["doc"]]
Documents = [_to_tagged_document(position, text) for position, text in enumerate(corpus)]

In [5]:
# Train the Doc2Vec model on the tagged corpus and save it under weights/.
epochs = 25
cores = multiprocessing.cpu_count()

model = Doc2Vec(dm=0,            # dm=0 -> PV-DBOW training algorithm
                vector_size=512,
                negative=5,      # negative sampling with 5 noise words ...
                hs=0,            # ... instead of hierarchical softmax
                min_count=2,
                sample=0,
                epochs=epochs,
                workers=cores)

model.build_vocab(Documents)

# Train with ONE call and let gensim manage the learning-rate schedule
# (linear decay from `alpha` to `min_alpha` across all epochs).
# The previous version called train(epochs=1) in a loop and edited
# model.alpha / model.min_alpha by hand. The first pass therefore decayed
# alpha all the way to the default min_alpha within a single epoch, and every
# later pass ran at a constant, higher alpha - the pattern the gensim
# documentation explicitly warns against.
model.train(Documents,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.save("weights/"+model_path)
print("Model Saved.")

Model Saved.


## Doc2Vec Results

In [None]:
# Load the model that the training cell saved for the CURRENT mode.
# The path used to be hard-coded to "weights/d2v.model", which is the wrong
# file whenever train=True (training saves to "weights/" + model_path), and
# its vectors would not line up with the corpus indexes of this mode.
model = Doc2Vec.load("weights/"+model_path)
# Unit-length document vectors as a dense NumPy array.
model_vectors = model.dv.get_normed_vectors()

# Collect the product and customer ids in the order of the pickled frames.
articles = pd.read_pickle(path+"articles.pkl")
# Ids are used as keys of a JSON-loaded dict later on, so they must be strings.
articles.article_id = articles.article_id.astype(str)
customers = pd.read_pickle(path+"customers.pkl")
prod_ids = articles.article_id.tolist()
customer_ids = customers.customer_id.tolist()
del articles, customers

In [None]:
# Translate product / customer ids into corpus row indexes (via the JSON
# dictionaries written earlier) and gather the matching Doc2Vec vectors.
#
# Fixes compared with the previous version:
#   * model_vectors is a dense NumPy array, which has no .toarray() method
#     (that call raised AttributeError);
#   * model_vectors was deleted twice, so the final `del` raised NameError;
#   * l2_normalize is given axis=1 so each product vector is normalised on
#     its own; without an axis the whole matrix is divided by one global norm.
with open(results_path+"prod_id_dict.json","r") as f:
    prod_dict = json.load(f)
prod_ids = [prod_dict[i] for i in prod_ids]
prod_d2v_gpu = tf.nn.l2_normalize(tf.convert_to_tensor(model_vectors[prod_ids]), axis=1)
del prod_ids, prod_dict

with open(results_path+"customer_id_dict.json","r") as f:
    customer_dict = json.load(f)
customer_ids = [customer_dict[i] for i in customer_ids]
# Customers stay a NumPy array; the similarity cell converts them batch by batch.
customer_d2v = model_vectors[customer_ids]
del customer_ids, customer_dict

del model, model_vectors

In [None]:
# Compute the customer x product similarity matrix for the Doc2Vec vectors
# and stream it into a gzip-compressed HDF5 dataset.
#
# Fixes compared with the previous version:
#   * the buffered rows are now always flushed after the last batch; before,
#     the tail was silently dropped when the number of customers was an exact
#     multiple of `batch`, or when there was only a single batch;
#   * the HDF5 file is opened in a `with` block, so it is closed even if the
#     loop raises or is interrupted;
#   * customer rows are normalised per row (axis=1); without an axis the
#     whole batch is divided by one global norm, so a smaller last batch
#     ended up on a different scale.
# NOTE(review): the file is named 'd2vf.h5' while a later cell reads
# 'd2v.h5' from another directory - confirm which name consumers expect.
batch = 512               # customers per matmul
step = 50000 // batch     # batches buffered in memory between HDF5 writes

n_customers = customer_d2v.shape[0]
with h5py.File(results_path+'d2vf.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("d2v", (n_customers, prod_d2v_gpu.shape[0]), dtype=np.float64, compression='gzip')

    ptr = 0    # next dataset row to be written
    temp = []  # per-batch result arrays waiting to be written
    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.convert_to_tensor(customer_d2v[batch_i:batch_i+batch])
        customer_batch = tf.nn.l2_normalize(customer_batch, axis=1)
        batch_distances = tf.matmul(customer_batch, prod_d2v_gpu, transpose_b=True)
        temp.append(batch_distances.numpy())

        is_last_batch = batch_i + batch >= n_customers
        if (i + 1) % step == 0 or is_last_batch:
            block = np.concatenate(temp).astype(np.float64, copy=False)
            dset[ptr:ptr+block.shape[0], :] = block
            ptr += block.shape[0]
            temp = []
        print('\r' + f'{i*batch}: %{round(100*i*batch/n_customers, 2)}', end='')
del customer_d2v, prod_d2v_gpu

## Content-based TF-IDF results

In [3]:
# Word-level TF-IDF over the whole corpus (the original author notes that
# this cell takes about 11 minutes to run).
# Every '.' is stripped from the documents before vectorising.
corpus = [text.replace('.', '') for text in pd.read_csv(path+'corpus.csv', index_col=False)["doc"]]

# Vocabulary capped at 8192 terms, float32 output; fit_transform returns one
# row per corpus document.
vectorizer = TfidfVectorizer(lowercase=True, max_features=8192, dtype=np.float32)
model_vectors = vectorizer.fit_transform(corpus)

# The raw text is no longer needed once the matrix exists.
del corpus

In [4]:
# Product ids (cast to str) and customer ids, in the row order of the
# pickled frames. Only the id lists are kept; the frames are not retained.
prod_ids = pd.read_pickle(path+"articles.pkl").article_id.astype(str).tolist()
customer_ids = pd.read_pickle(path+"customers.pkl").customer_id.tolist()

In [5]:
# Translate product / customer ids into corpus row indexes (via the JSON
# dictionaries written earlier) and densify the matching TF-IDF rows.
#
# Fix: l2_normalize is given axis=1 so each product vector is normalised on
# its own; without an axis the whole matrix is divided by one global norm,
# which is not a per-vector (cosine) normalisation.
with open(results_path+"prod_id_dict.json","r") as f:
    prod_dict = json.load(f)
prod_ids = [prod_dict[i] for i in prod_ids]
# model_vectors is the sparse matrix returned by TfidfVectorizer, hence .toarray().
prod_tf_gpu = tf.nn.l2_normalize(tf.convert_to_tensor(model_vectors[prod_ids].toarray()), axis=1)
del prod_ids, prod_dict

with open(results_path+"customer_id_dict.json","r") as f:
    customer_dict = json.load(f)
customer_ids = [customer_dict[i] for i in customer_ids]
# Customers stay a NumPy array; the similarity cell converts them batch by batch.
customer_tf = model_vectors[customer_ids].toarray()
del customer_ids, model_vectors, customer_dict

In [22]:
# Compute the customer x product similarity matrix for the TF-IDF vectors
# and stream it into a gzip-compressed HDF5 dataset.
#
# Fixes compared with the previous version:
#   * the buffered rows are now always flushed after the last batch; before,
#     the tail was silently dropped when the number of customers was an exact
#     multiple of `batch`, or when there was only a single batch;
#   * the HDF5 file is opened in a `with` block, so it is closed even if the
#     loop raises or is interrupted. This replaces the leading `f.close()`,
#     which relied on an `f` left over from a previously executed cell;
#   * customer rows are normalised per row (axis=1); without an axis the
#     whole batch is divided by one global norm, so a smaller last batch
#     ended up on a different scale.
batch = 512               # customers per matmul
step = 50000 // batch     # batches buffered in memory between HDF5 writes

n_customers = customer_tf.shape[0]
with h5py.File(results_path+'tf_idf.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("tf_idf", (n_customers, prod_tf_gpu.shape[0]), dtype=np.float64, compression='gzip')

    ptr = 0    # next dataset row to be written
    temp = []  # per-batch result arrays waiting to be written
    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.convert_to_tensor(customer_tf[batch_i:batch_i+batch])
        customer_batch = tf.nn.l2_normalize(customer_batch, axis=1)
        batch_distances = tf.matmul(customer_batch, prod_tf_gpu, transpose_b=True)
        temp.append(batch_distances.numpy())

        is_last_batch = batch_i + batch >= n_customers
        if (i + 1) % step == 0 or is_last_batch:
            block = np.concatenate(temp).astype(np.float64, copy=False)
            dset[ptr:ptr+block.shape[0], :] = block
            ptr += block.shape[0]
            temp = []
        print('\r' + f'{i*batch}: %{round(100*i*batch/n_customers, 2)}', end='')
del customer_tf, prod_tf_gpu

163840: %23.93

## Only Content Based Results

In [None]:
# Inputs for building a submission from the stored TF-IDF results.
# The HDF5 file handle is not kept; only the "tf_idf" dataset object is.
results = h5py.File('model_data/content/tf_idf.h5', 'r')["tf_idf"]
submission = pd.read_csv('data/sample_submission.csv', index_col=False)

# customer id -> stored index shifted down by 105542.
# NOTE(review): 105542 is presumably the number of corpus rows that precede
# the first customer document - confirm before reusing with other data.
with open("model_data/customer_id_dict.json","r") as f:
    customer_ids = {cust_id: corpus_idx-105542 for cust_id, corpus_idx in json.load(f).items()}

# Inverted product mapping: stored index -> product id.
with open("model_data/prod_id_dict.json","r") as f:
    prod_ids = {corpus_idx: prod_id for prod_id, corpus_idx in json.load(f).items()}

In [None]:
# Fill in one prediction string per submission row and write submission.csv.
# NOTE(review): `results[row, 0]` followed by `[:12]` assumes that indexing
# yields a sequence of product indexes - confirm the layout of the stored
# dataset before running.
predictions = []
results = np.asanyarray(results)  # pull the whole dataset into memory
for i, row in submission.iterrows():
    if row.customer_id not in customer_ids:
        # Customer without stored results: keep the prediction already in the row.
        predictions.append(row.prediction)
    else:
        prods = results[customer_ids[row.customer_id],0]
        # Join the product ids of the first 12 entries with single spaces.
        predictions.append(' '.join(prod_ids[int(prod_idx)] for prod_idx in prods[:12]))
    print('\r' + f'{i}: %{round(100*i/submission.shape[0], 2)}', end='')

submission["prediction"] = predictions
submission.to_csv('submission.csv', index=False)

In [None]:
# Inputs for building a submission from the stored Doc2Vec results.
# The HDF5 file handle is not kept; only the "d2v" dataset object is.
results = h5py.File('model_data/content/d2v.h5', 'r')["d2v"]
submission = pd.read_csv('data/sample_submission.csv', index_col=False)

# customer id -> stored index shifted down by 105542.
# NOTE(review): 105542 is presumably the number of corpus rows that precede
# the first customer document - confirm before reusing with other data.
with open("model_data/customer_id_dict.json","r") as f:
    customer_ids = {cust_id: corpus_idx-105542 for cust_id, corpus_idx in json.load(f).items()}

# Inverted product mapping: stored index -> product id.
with open("model_data/prod_id_dict.json","r") as f:
    prod_ids = {corpus_idx: prod_id for prod_id, corpus_idx in json.load(f).items()}

In [None]:
# Fill in one prediction string per submission row and write submission.csv.
# This is the same output path the TF-IDF submission cell writes to, so
# running both leaves only the most recent result on disk.
# NOTE(review): `results[row, 0]` followed by `[:12]` assumes that indexing
# yields a sequence of product indexes - confirm the layout of the stored
# dataset before running.
predictions = []
results = np.asanyarray(results)  # pull the whole dataset into memory
for i, row in submission.iterrows():
    if row.customer_id not in customer_ids:
        # Customer without stored results: keep the prediction already in the row.
        predictions.append(row.prediction)
    else:
        prods = results[customer_ids[row.customer_id],0]
        # Join the product ids of the first 12 entries with single spaces.
        predictions.append(' '.join(prod_ids[int(prod_idx)] for prod_idx in prods[:12]))
    print('\r' + f'{i}: %{round(100*i/submission.shape[0], 2)}', end='')

submission["prediction"] = predictions
submission.to_csv('submission.csv', index=False)