In [40]:
import pandas as pd
from igraph import Graph, plot
import numpy as np
from tqdm import tqdm

In [211]:
products = pd.read_csv('../data/pareto_training.csv')
matches = pd.read_csv('matches.csv')

In [217]:
matches['match'] = matches.apply(lambda x: 1 if x.prob > 0.5 else 0, axis=1)

In [218]:
len(np.unique(np.append(matches[matches.match == 1].id1.values, matches[matches.match == 1].id2.values)))

2396

## Dedupe

In [177]:
from dedupe.clustering import cluster
# list(cluster(np.array([((0,1), 0.6), ([1,2], 0.6)], dtype=[('pairs', 'int', 2), ('score', 'f4')])))

In [178]:
clusters = list(
    cluster(np.array(matches.apply(lambda x: ((x.id1, x.id2), x.prob), axis=1), dtype=[('pairs', 'int', 2), ('score', 'f4')])
           ,threshold=0.35
           )
)

In [93]:
def log_clusters(clusters):
    for c in clusters:
        names = [products[products.id == p].iloc[0]['name'] for p in c[0]]
        print(names)

In [136]:
len([p for c in clusters for p in c[0]])

1158

In [213]:
log_clusters(clusters)

## IGraph

In [230]:
g = Graph()

In [231]:
g.add_vertices(len(products))

In [222]:
id_mapping = dict(zip(products.id, range(len(products))))

In [89]:
def dims_equal(id1, id2, dim):
    dim1 = products[products.id == id1][dim].iloc[0]
    dim2 = products[products.id == id2][dim].iloc[0]
    if dim1 != np.nan and dim2 != np.nan:
        return dim1 == dim2
    else:
        return False

In [155]:
edges = [i for i, match in tqdm(matches.iterrows()) if match.match == 1 
             and dims_equal(match.id1, match.id2, 'quantity')
             and (dims_equal(match.id1, match.id2, 'volume') or dims_equal(match.id1, match.id2, 'weight'))
            ]

In [232]:
g.add_edges([(id_mapping[match.id1], id_mapping[match.id2]) for i, match in matches.iterrows() if match.prob > 0.5])

In [234]:
g.vs['id'] = products.id.values
g.vs['name'] = products.id.name

In [207]:
g.vs[0].attributes()

{'id': 541356045.0, 'name': 'id'}

In [241]:
clusters = []
for i, c in enumerate(g.clusters()):
    if len(c) > 2:
        for p in c:
            clusters.append({
                "id": g.vs[p].attributes()['id'],
                "cluster": i
        })

clusters = pd.DataFrame(clusters)

In [242]:
clusters

Unnamed: 0,id,cluster
0,1.653551e+08,18
1,9.968536e+08,18
2,3.280139e+08,18
3,2.675180e+08,26
4,1.618646e+09,26
...,...,...
1959,8.177824e+08,3503
1960,2.987088e+08,3503
1961,3.160876e+08,3503
1962,2.988256e+08,3503


In [243]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(clusters.merge(products, on='id', how='left')[['name', 'cluster']])

                                                   name  cluster
0           (GOJEK) Nutrinidrink Vanilla 200ml 1 Karton       18
1        NUTRINIDRINK cair VANILA 200 ml - DIJAMIN ASLI       18
2                            NUTRINIDRINK VANILA 200 ML       18
3                             ANLENE GOLD VANILA 650 GR       26
4                 ANLENE GOLD 650GR - Coklat, Dus Bagus       26
5               Anlene Gold 650gr All Variant - cokelat       26
6                             ANLENE GOLD COKLAT 650 GR       26
7                    ANLENE GOLD COKLAT 650 GR - COKLAT       26
8                           ANLENE GOLD ORIGINAL 650 GR       26
9                             anlene gold original 650g       26
10                   Anlene Gold Plus 650 Gr - Original       26
11                Anlene Gold Plus 650 gr 650gr Vanilla       26
12                                ANLENE GOLD Plus 650g       26
13                    Anlene Gold Plus 650gr - Original       26
14    Anlene Gold Plus 65