# General

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
# Load the four input tables from the current working directory.
# The three reference tables are CSV files; the transactions are stored as Parquet
# and are read with the pyarrow engine.
df_clients = pd.read_csv('clients.csv')
df_materials = pd.read_csv('materials.csv')
df_plants = pd.read_csv('plants.csv')
df_transactions = pd.read_parquet('transactions.parquet', engine='pyarrow')

# Similar customers

In [3]:
# Work on the transactions of one specific plant only, to keep CPU usage low.
# To use all transactions instead, uncomment the next line and comment out
# the plant filter below it.

# df_transactions_part = df_transactions

plant_id = '95b09698fda1f64af16708ffb859eab9'
is_selected_plant = df_transactions['plant'] == plant_id
df_transactions_part = df_transactions[is_selected_plant]

In [4]:
# Get all clients from our transactions data.
# indx2client maps a matrix row index to a client id (sorted unique ids);
# client2indx is the inverse lookup.

indx2client = np.unique(df_transactions_part.client_id)
client2indx = {client_id: i for i, client_id in enumerate(indx2client)}

In [5]:
# Get all materials from our transactions data.
# indx2material maps a matrix column index to a material id (sorted unique ids);
# material2indx is the inverse lookup.

indx2material = np.unique(df_transactions_part.material)
material2indx = {material: i for i, material in enumerate(indx2material)}

In [6]:
# Get the total sales_count of each material for each client.
# The column is selected *before* aggregating: summing every column first is wasteful
# and, with pandas >= 2.0 (numeric_only defaults to False), either concatenates
# string columns or raises on dtypes that cannot be summed.

transactions_stat = (
    df_transactions_part
    .groupby(['client_id', 'material'])['sales_count']
    .sum()
    .reset_index()
)

In [8]:
# Create the clients' purchases statistic.
# It holds the aggregated sales_count of each material for each client as a dense
# (clients x materials) matrix, with zeros where a client never bought a material.

# The builtin float is used because the np.float alias was removed in NumPy 1.24;
# both mean float64.
clients_purchases_stat = np.zeros((len(indx2client), len(indx2material)), dtype=float)

# Iterate over the plain columns instead of iterrows(), which builds a Series per row.
purchase_triples = zip(
    transactions_stat['client_id'],
    transactions_stat['material'],
    transactions_stat['sales_count'],
)
for buyer, item, quantity in tqdm(purchase_triples, total=len(transactions_stat)):
    # Single [row, col] index writes straight into the matrix (no intermediate row view).
    clients_purchases_stat[client2indx[buyer], material2indx[item]] = quantity

HBox(children=(FloatProgress(value=0.0, max=275680.0), HTML(value='')))




In [9]:
# Create a behavior vector for each client.
# Here it is simply the purchases statistic, but it could be something else.
# Note: this is an alias of clients_purchases_stat, not a copy.

clients_behavior = clients_purchases_stat

In [10]:
# Find pairwise distances between clients.
# Euclidean distance is used here, but it could be any other metric.
# distances[i] is later read as the row of distances from client i to the clients
# (see find_nearest).

from sklearn.metrics.pairwise import euclidean_distances
distances = euclidean_distances(clients_behavior)

For each client we find the $N$ most similar clients. Then we look at their purchase statistics and pick the most "interesting" materials.  
  
Let's define:  
$purchases^*$ - number of purchases of the material by the given client.  
$purchases_i$ - number of purchases of the material by the $i$-th similar client.  
$S = \sum_i{(purchases_i - purchases^*)}$  
$P = (\prod_i{(purchases_i + 1)})^{\frac{1}{N}}$  
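So our function is (the $+1$ in the denominator, as used in the code, avoids division by zero for materials the client has never bought):  
$$f = \frac{SP}{purchases^* + 1}$$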
  
Explanation:  
$S$ helps us pick only materials that similar clients buy more of than the given client.  
$P$ helps us pick only materials that are bought by most of the similar clients, not just by one of them.  
And dividing by the given client's own purchases $purchases^*$ (plus one in the code) helps us pick mostly materials that the given client rarely buys or doesn't buy at all.

Instead of $f$, you can use any other scoring function you prefer.

In [61]:
# Function to find the clients most similar to the given one
def find_nearest(indx, num=10):
    """Return the indices of the `num` clients closest to client `indx`.

    Parameters
    ----------
    indx : int
        Row index of the client in the module-level `distances` matrix.
    num : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    numpy.ndarray
        Client indices ordered by increasing distance, never containing `indx`
        itself. Fewer than `num` entries are returned when there are not enough
        other clients.
    """
    order = np.argsort(distances[indx])
    # Drop the client itself explicitly instead of assuming it sorts first:
    # when other clients are at distance 0 too (identical behaviour vectors), the
    # tie order is arbitrary and slicing off position 0 could remove a real
    # neighbour while returning the client as its own neighbour.
    return order[order != indx][:num]

# Function to find the most interesting materials using the score described above
def find_interesting_materials(nearest_clients_stat, client_stat, num=10):
    """Rank materials by how much more the similar clients buy them than the client.

    The per-material score is ``S * P / (client_stat + 1)`` where ``S`` is the sum
    over similar clients of ``(their purchases - client's purchases)`` and ``P`` is
    the geometric mean over similar clients of ``(their purchases + 1)``.

    Parameters
    ----------
    nearest_clients_stat : numpy.ndarray
        2-D array, one row per similar client and one column per material.
    client_stat : numpy.ndarray
        1-D array with the given client's purchases, one entry per material.
    num : int, default 10
        Number of top-scoring materials to return.

    Returns
    -------
    numpy.ndarray
        Column indices of the `num` highest-scoring materials, best first.
    """
    neighbours_count = nearest_clients_stat.shape[0]
    surplus = np.sum(nearest_clients_stat - client_stat, axis=0)
    # np.prod replaces the deprecated alias np.product (removed in NumPy 2.0).
    geometric_mean = np.prod(nearest_clients_stat + 1, axis=0) ** (1 / neighbours_count)
    score = surplus * geometric_mean / (client_stat + 1)
    # argsort is ascending: take the last `num` entries and reverse them.
    return np.flip(np.argsort(score)[-num:])

# Function to find materials to recommend to our client
# print_for_material prints the purchase stats of that recommended material
# for the given client and for the clients similar to him/her
def find_recommendations(client_id, num=10, nearest_num=20, print_for_material=None):
    """Return the ids of the `num` materials recommended for `client_id`.

    Parameters
    ----------
    client_id
        Client identifier; must be a key of `client2indx`.
    num : int, default 10
        Number of materials to recommend.
    nearest_num : int, default 20
        Number of similar clients taken into account.
    print_for_material : int or None, default None
        Position (0-based, must be < `num`) in the recommendation list of a
        material whose purchase stats are printed for the client and for the
        similar clients. Nothing is printed when None.

    Returns
    -------
    list
        Material ids taken from `indx2material`, best recommendation first.
    """
    assert print_for_material is None or print_for_material < num

    client_row = client2indx[client_id]
    neighbour_rows = find_nearest(client_row, num=nearest_num)
    client_stat = clients_purchases_stat[client_row]
    nearest_clients_stat = clients_purchases_stat[neighbour_rows]

    interesting_materials = find_interesting_materials(nearest_clients_stat, client_stat, num=num)
    recommended = list(indx2material[interesting_materials])

    if print_for_material is None:
        return recommended

    print(f'Client stat of purchasing for the material: {client_stat[interesting_materials[print_for_material]]}')
    print('Stat of purchasing of similar clients for the material:')
    print(nearest_clients_stat[:, interesting_materials[print_for_material]])
    return recommended

In [56]:
# Example of our results for one customer: the top 3 recommended materials

client_id = 'f047965d9d09bc3ed6eed75de5b83c0a'
recommended_materials = find_recommendations(client_id, num=3)
print('Recommended materials:', recommended_materials)

Recommended materials: ['a9e11060ef4cb84b377898ecd0f17fd4', 'b70ddcb63e68237dd48abee0b842d6d0', '1cbaaeb2212bbbe9d458ca9b81d74db7']


In [62]:
# Example for the same customer: purchase stats of the second most recommended
# material (print_for_material is 0-based, so 1 means the second one),
# for him/her and for the similar clients.
# The returned recommendation list is discarded; only the printed stats matter here.

_ = find_recommendations(client_id, num=3, print_for_material=1)

Client stat of purchasing for the material: 0.0
Stat of purchasing of similar clients for the material:
[ 8.04  13.1    0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.    11.95   7.516  0.    11.43   0.     0.   ]


In [63]:
# And an example for the 10 most recommended materials, using 5 similar clients.
# Each iteration calls find_recommendations again with the same arguments except
# print_for_material, so only the printed material changes between iterations.

for i in range(10):
    print(f'Informations for the {i+1} most recommended material')
    _ = find_recommendations(client_id, num=10, nearest_num=5, print_for_material=i)
    print('\n')

Informations for the 1 most recommended material
Client stat of purchasing for the material: 0.0
Stat of purchasing of similar clients for the material:
[ 3.414  3.008  0.    22.216  9.398]


Informations for the 2 most recommended material
Client stat of purchasing for the material: 0.0
Stat of purchasing of similar clients for the material:
[10.  0. 18.  0.  0.]


Informations for the 3 most recommended material
Client stat of purchasing for the material: 0.0
Stat of purchasing of similar clients for the material:
[ 0.  0. 16.  0. 10.]


Informations for the 4 most recommended material
Client stat of purchasing for the material: 0.0
Stat of purchasing of similar clients for the material:
[9.216 0.    0.    6.832 4.398]


Informations for the 5 most recommended material
Client stat of purchasing for the material: 0.0
Stat of purchasing of similar clients for the material:
[22.  0.  1.  0.  1.]


Informations for the 6 most recommended material
Client stat of purchasing for the materia