# Clients clustering algorithm
# Disclaimer: This is not the notebook, where we did all the steps. We worked with smaller tasks in different environments, also we had to do 'save-restart kernel-load' very frequently as it consumes a lot of memory.
# We just tried to reproduce the whole pipeline here
# If you are interested in some of our files (customer embeddings or cluster labels) or in more details, please contact us.


# Pipeline description:
# 1. Build customer buckets (a list of favourite products, one per category, where a category is the lowest level of the hierarchy)
# 2. Use word2vec to obtain products embeddings (texts = customer buckets, words = products)
# 3. Use tf-idf weighting of products embeddings to get customers (buckets) embeddings
# 4. Use agglomerative clustering with cosine measure over customer embeddings to obtain customer clusters

# Data preprocessing

In [None]:
import pandas as pd
import numpy as np
import pickle
from  tqdm import tqdm

from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load the three reference tables (CSV) and the transaction log (Parquet)
# from the current working directory.
clients, plants, materials = (
    pd.read_csv(csv_path)
    for csv_path in ('clients.csv', 'plants.csv', 'materials.csv')
)
transactions = pd.read_parquet('transactions.parquet')

### Drop huge hash labels and replace them with indexes (3 times less storage consumed)

In [None]:
def _swap_hash_for_index(frame, lookup, column):
    """Replace `column` in `frame` by the row index of the matching `lookup` row.

    The left merge keeps every row of `frame`; rows whose value is not found
    in `lookup` end up with NaN in the new column.
    """
    index_by_hash = lookup[[column]].reset_index()
    return (frame.merge(index_by_hash, how='left')
            .drop(columns=[column])
            .rename(columns={'index': column}))


def _index_as_key(lookup, column):
    """Drop the original `column` of `lookup` and expose its row index under that name."""
    return (lookup.reset_index()
            .drop(columns=[column])
            .rename(columns={'index': column}))


def _ordinal_encode(values):
    """Return (value -> code dict, encoded Series); codes follow first appearance."""
    distinct = values.unique()
    codes = dict(zip(distinct, range(distinct.shape[0])))
    return codes, values.map(codes)


# Transactions first: they must be re-keyed while the lookup tables still
# carry the original hash columns.
transactions = _swap_hash_for_index(transactions, clients, 'client_id')
transactions = _swap_hash_for_index(transactions, plants, 'plant')
transactions = _swap_hash_for_index(transactions, materials, 'material')
# Unmatched hashes become -1 so the columns can be stored as integers.
# NOTE(review): `plant` is not filled/cast here, unlike the other two columns.
transactions['client_id'] = transactions['client_id'].fillna(-1).astype('int')
transactions['material'] = transactions['material'].fillna(-1).astype('int')

# Now the lookup tables themselves switch from hash keys to index keys.
clients = _index_as_key(clients, 'client_id')
plants = _index_as_key(plants, 'plant')
materials = _index_as_key(materials, 'material')

# Remaining label columns are replaced by ordinal codes; the value -> code
# dictionaries stay available under their module-level names.
materials_hl2, materials['hier_level_2'] = _ordinal_encode(materials['hier_level_2'])
materials_hl3, materials['hier_level_3'] = _ordinal_encode(materials['hier_level_3'])
materials_hl4, materials['hier_level_4'] = _ordinal_encode(materials['hier_level_4'])
materials_vendor, materials['vendor'] = _ordinal_encode(materials['vendor'])
transactions_check_ids, transactions['chq_id'] = _ordinal_encode(transactions['chq_id'])

### Add total cheqs per client and total cheqs per plant features

In [None]:
def _distinct_chq_counts(key, count_name):
    """Count distinct cheques per `key` value in `transactions`.

    Returns a frame with the columns `key` and `count_name`.
    """
    unique_pairs = transactions[[key, 'chq_id']].drop_duplicates()
    counts = unique_pairs.groupby(key)[['chq_id']].count()
    return counts.rename(columns={'chq_id': count_name}).reset_index()


# Default (inner) merges: clients/plants that never occur in `transactions`
# are dropped from the lookup tables here.
clients = clients.merge(_distinct_chq_counts('client_id', 'total_chqs_client'))
plants = plants.merge(_distinct_chq_counts('plant', 'total_chqs_plant'))

### Filter loyal clients (1 to 3 cheqs per week)

In [None]:
# Attach the per-plant and per-client cheque totals to every transaction row
# (both merges are inner, so rows without a known plant or client drop out).
filtered = transactions[['plant', 'client_id', 'chq_id']]
filtered = filtered.merge(plants[['plant', 'total_chqs_plant']])
filtered = filtered.merge(clients[['client_id', 'total_chqs_client']])
# Keep clients with 50 to 150 cheques in total, both bounds included.
filtered = filtered[filtered['total_chqs_client'].between(50, 150)]

# Restrict the transaction log to the surviving clients, then free the helper frame.
transactions = transactions.merge(filtered[['client_id']].drop_duplicates(), how='inner')
del filtered

# Building client baskets
# A basket is the list of a client's most purchased products, one product per category (category = lowest level of the hierarchy; we could potentially build a new analytical one instead).

In [None]:
def collect_client_basket(transactions, materials):
    """Build each client's basket: the top-selling material in every category.

    `sales_count` is summed per (client, material); within each
    `hier_level_4` category the material with the largest total becomes the
    client's pick (on ties, whichever row the descending sort places first).

    Args:
        transactions: DataFrame with `client_id`, `material` and
            `sales_count` columns.
        materials: DataFrame with `material` and `hier_level_4` columns.
            Purchased materials missing from it are dropped by the inner
            merge.

    Returns:
        dict mapping client_id to a list with one slot per distinct category
        found among the purchased materials. A slot holds the chosen material
        id as an int, or -1 when the client bought nothing in that category.
        All lists have the same length and slot order.
    """
    # Total sales per (client, material).
    purchases = (transactions[['client_id', 'material', 'sales_count']]
                 .groupby(['client_id', 'material'])
                 .sum()
                 .reset_index())

    # Categories actually present in the purchases, each given a fixed slot.
    unq_hier = (purchases[['material']]
                .drop_duplicates()
                .merge(materials[['material', 'hier_level_4']])['hier_level_4']
                .unique())
    slot_of_category = dict(zip(unq_hier, np.arange(len(unq_hier))))

    # After the descending sort the first row of each (client, category) pair
    # is the best seller, so dropping later duplicates keeps exactly that row.
    top_rows = (pd.merge(purchases, materials[['material', 'hier_level_4']])
                .sort_values(['client_id', 'hier_level_4', 'sales_count'],
                             ascending=False)
                .drop_duplicates(['client_id', 'hier_level_4']))

    n_slots = len(unq_hier)
    dict_client_top_materials = {
        client: [-1] * n_slots for client in top_rows['client_id'].unique()
    }

    # Single linear pass over the winning rows. This replaces a boolean
    # filter of the whole frame per client, which was quadratic in practice.
    winners = top_rows[['client_id', 'hier_level_4', 'material']].itertuples(
        index=False, name=None)
    for client, category, material in tqdm(winners, total=len(top_rows)):
        dict_client_top_materials[client][slot_of_category[category]] = int(material)

    return dict_client_top_materials

# Build the baskets from the current `transactions` / `materials` frames and
# cache them on disk so later steps can reload them.
buskets = collect_client_basket(transactions=transactions, materials=materials)

with open('buskets.pickle', mode='wb') as handle:
    pickle.Pickler(handle).dump(buskets)

# Representing the baskets as the collection of texts
# We have a list of ints (material ids) for each customer and want to turn it into a "text" (a list of string tokens, plus a space-joined string).

In [None]:
# One "sentence" per client: the basket's material ids rendered as string
# tokens. Empty category slots (-1) stay in as the token '-1'.
texts = [[str(material) for material in client] for client in buskets.values()]
# Space-joined form of the same sentences, used as input for the tf-idf step.
# Fixed: this line iterated the undefined name `sentences` instead of `texts`.
texts_joined = [' '.join(sentence) for sentence in texts]

# Building material (word) embeddings
# Lots of parameters to work with)

In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line when each epoch starts and ends."""

    def __init__(self):
        # Zero-based number of the epoch currently being reported.
        self.epoch = 0

    def _announce(self, template):
        # Fill the current epoch number into `template` and print the result.
        print(template.format(self.epoch))

    def on_epoch_begin(self, model):
        """Print the start message for the current epoch; `model` is not used."""
        self._announce("Epoch #{} start")

    def on_epoch_end(self, model):
        """Print the end message, then advance the epoch counter."""
        self._announce("Epoch #{} end")
        self.epoch = self.epoch + 1

In [None]:
EMB_SIZE = 50
# The context window covers a whole basket, so every product in a basket is
# context for every other one. Fixed: this read the undefined name
# `sentences`; the tokenised baskets are stored in `texts`.
WINDOW = len(texts[0])
ALGO = 1 # 1 for skip-gram, 0 for CBOW
SAMPLING = 0 # 1 for hierarchical softmax, 0 for negative sampling
NEGATIVE_SAMPLES = 5 # 5-20 for small datasets according to a paper
CBOW_MEAN = 1 # if 0 use sum not mean
N_ITER = 5

# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; confirm
# the installed gensim version before changing them.
epoch_logger = EpochLogger()
word2vec = Word2Vec(texts, size=EMB_SIZE, window=WINDOW, workers=8,
                 min_count=1, sg=ALGO, hs=SAMPLING, negative=NEGATIVE_SAMPLES,
                 max_vocab_size=None, cbow_mean=CBOW_MEAN, iter=N_ITER, callbacks=[epoch_logger])

### want to get the embeddings in the form of the dict

In [None]:
# Plain dict {material token: embedding vector} pulled out of the trained model.
embeddings = {
    token: word2vec.wv.get_vector(token)
    for token in word2vec.wv.vocab
}

# Building client (text) embeddings

### Calculate tf-idf weights first

In [None]:
# Tokens are whitespace-separated material ids, so the analyzer is a plain
# split with no lowercasing or token-pattern filtering.
vectorizer = TfidfVectorizer(analyzer=str.split)
# Rows are clients (texts), columns follow `vectorizer.vocabulary_`.
texts_tfidf = vectorizer.fit_transform(texts_joined)
texts_tfidf = texts_tfidf.toarray()

### Stack embeddings into the matrix

In [None]:
# Row i holds the embedding of the product whose tf-idf column index is i,
# so the matrix lines up with the columns of the tf-idf matrix.
products_embedding_matrix = np.zeros(shape=(len(vectorizer.vocabulary_), EMB_SIZE))
for product, vocab_idx in tqdm(vectorizer.vocabulary_.items()):
    products_embedding_matrix[vocab_idx, :] = embeddings[product]

### Get the client embeddings table as a matrix product of tf-idf matrix and word(product) embeddings matrix

In [None]:
# Client embedding = tf-idf-weighted sum of the embeddings of the products in
# the client's basket: (clients x products) @ (products x EMB_SIZE).
# Fixed: this used the undefined name `sentences_tfidf`; the tf-idf matrix
# built earlier is `texts_tfidf`.
clients_embedding = texts_tfidf @ products_embedding_matrix

# Clients clustering

In [None]:
# No fixed cluster count: merging is governed by the 0.2 distance threshold,
# using cosine distance with complete linkage.
clustering = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.2,
    affinity='cosine',
    linkage='complete',
)
# One cluster label per row of `clients_embedding`.
clusters_prediction = clustering.fit_predict(clients_embedding)