In [1]:
import pandas as pd
from igraph import Graph, plot
import numpy as np
from tqdm import tqdm

In [54]:
products = pd.read_csv('../data/pareto_training.csv')
matches = pd.read_csv('../data/matches.csv')

In [22]:
matches['match'] = matches.apply(lambda x: 1 if x.prob > 0.5 else 0, axis=1)

In [55]:
len(np.unique(np.append(matches[matches.match == 1].id1.values, matches[matches.match == 1].id2.values)))

1739

## Dedupe

In [177]:
from dedupe.clustering import cluster
# list(cluster(np.array([((0,1), 0.6), ([1,2], 0.6)], dtype=[('pairs', 'int', 2), ('score', 'f4')])))

In [178]:
clusters = list(
    cluster(np.array(matches.apply(lambda x: ((x.id1, x.id2), x.prob), axis=1), dtype=[('pairs', 'int', 2), ('score', 'f4')])
           ,threshold=0.35
           )
)

In [93]:
def log_clusters(clusters):
    for c in clusters:
        names = [products[products.id == p].iloc[0]['name'] for p in c[0]]
        print(names)

In [136]:
len([p for c in clusters for p in c[0]])

1158

In [213]:
log_clusters(clusters)

## IGraph

In [56]:
g = Graph()

In [57]:
g.add_vertices(len(products))

In [47]:
id_mapping = dict(zip(products.id, range(len(products))))

In [89]:
def dims_equal(id1, id2, dim):
    dim1 = products[products.id == id1][dim].iloc[0]
    dim2 = products[products.id == id2][dim].iloc[0]
    if dim1 != np.nan and dim2 != np.nan:
        return dim1 == dim2
    else:
        return False

In [155]:
edges = [i for i, match in tqdm(matches.iterrows()) if match.match == 1 
             and dims_equal(match.id1, match.id2, 'quantity')
             and (dims_equal(match.id1, match.id2, 'volume') or dims_equal(match.id1, match.id2, 'weight'))
            ]

In [58]:
g.add_edges([(id_mapping[match.id1], id_mapping[match.id2]) for i, match in matches.iterrows() if match.prob > 0.5])

In [59]:
g.vs['id'] = products.id.values
g.vs['name'] = products.id.name

In [207]:
g.vs[0].attributes()

{'id': 541356045.0, 'name': 'id'}

In [60]:
clusters = []
for i, c in enumerate(g.clusters()):
    if len(c) > 2:
        for p in c:
            clusters.append({
                "id": g.vs[p].attributes()['id'],
                "cluster": i
        })

clusters = pd.DataFrame(clusters)

In [63]:
clusters

Unnamed: 0,id,cluster
0,5.228522e+08,8
1,1.079962e+09,8
2,2.148352e+08,8
3,5.683591e+08,8
4,6.653844e+08,8
...,...,...
1150,2.109768e+08,4148
1151,2.109817e+08,4148
1152,1.812122e+09,4191
1153,1.225985e+09,4191


In [62]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(clusters.merge(products, on='id', how='left')[['name', 'cluster']])

                                                   name  cluster
0                          ANLENE ACTIFIT COKLAT 600 GR        8
1                         ENTRASOL GOLD CHOCOLATE 600gr        8
2                           Entrasol Gold Coklat 600 gr        8
3                            Entrasol Gold Coklat 600gr        8
4                            ENTRASOL GOLD COKLAT 600GR        8
5                          ANLENE ACTIFIT VANILA 600 GR       14
6                          Anlene Actifit vanila 600 gr       14
7                             ANLENE GOLD VANILA 600 GR       14
8                              Diabetasol Vanilla 180gr       14
9                              Entrakid Vanila 185 gram       14
10               Entramix coklat/vanilla 185gr - Vanila       14
11              Entramix susu nutrisi seimbang - Vanila       14
12                      ENTRAMIX Susu rasa Vanilla 185g       14
13                      ENTRAMIX Susu rasa Vanilla 185g       14
14       Entramix Vanila 