In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the extracted feature table and key every row by its ID column.
data = pd.read_csv('../3.feature_extraction/features.tsv', sep='\t').set_index('ID')
# Swap missing cells for None so later code can test them with `is None`.
# NOTE(review): newer pandas releases may keep NaN in float columns here
# instead of storing None - confirm against the installed version.
data = data.where(data.notnull(), None)
data.head()

Unnamed: 0_level_0,TITLE,BRAND,MODEL,RAM,STORAGE,PLUS,COLOR,SCREEN_SIZE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7923488,Smartphone ASUS Zenfone II ZE551 16GB 4G Tela ...,asus,,,16.0,False,,
8105618,"Smartphone Asus Zenfone 2 ZE551ML, 4G Android ...",asus,zenfone 2,,16.0,False,gold,
10026591,Caminhão Super Bombeiro Resgate Som E Luz - Ma...,,,,,False,,
10027542,Transformers 4 Power Battlers Dinobot Strafe -...,,,,,False,,
10111398,Memoria 4gb Ddr3 1333 Kvr13n9s8/4 Kingston,,,,4.0,False,,


In [15]:
def either_absent_or_different(entry_1, entry_2, columns):
    """Yield, per column, whether the value is missing on either side or differs.

    Args:
        entry_1: First record, indexable by column name.
        entry_2: Second record, indexable by column name.
        columns: Iterable of column names to compare.

    Yields:
        True when either value is missing or the two values are not equal,
        False when both are present and equal.
    """
    for col in columns:
        col_e1 = entry_1[col]
        col_e2 = entry_2[col]
        # pd.isnull covers None as well as NaN / NaT / pd.NA, so the result is
        # always a plain boolean regardless of how the missing cell is stored.
        if pd.isnull(col_e1) or pd.isnull(col_e2):
            yield True
        else:
            yield col_e1 != col_e2


def both_present_and_same(entry_1, entry_2, columns):
    """Yield, per column, whether both values are present and equal.

    Args:
        entry_1: First record, indexable by column name.
        entry_2: Second record, indexable by column name.
        columns: Iterable of column names to compare.

    Yields:
        True only when neither value is missing and the two values are equal;
        False otherwise.
    """
    for col in columns:
        col_e1 = entry_1[col]
        col_e2 = entry_2[col]
        # pd.isnull covers None as well as NaN / NaT / pd.NA, so a missing cell
        # never reaches the equality comparison below.
        if pd.isnull(col_e1) or pd.isnull(col_e2):
            yield False
        else:
            yield col_e1 == col_e2

            
def both_present_and_different(entry_1, entry_2, columns):
    """Yield, per column, whether both values are present and differ.

    Args:
        entry_1: First record, indexable by column name.
        entry_2: Second record, indexable by column name.
        columns: Iterable of column names to compare.

    Yields:
        True only when neither value is missing and the two values are not
        equal; False otherwise.
    """
    for col in columns:
        col_e1 = entry_1[col]
        col_e2 = entry_2[col]
        # A bare `is None` test lets NaN through, and `NaN != x` is True, which
        # would report a missing value as a conflict. pd.isnull treats None,
        # NaN, NaT and pd.NA uniformly as absent.
        if pd.isnull(col_e1) or pd.isnull(col_e2):
            yield False
        else:
            yield col_e1 != col_e2

    
def null_distance(entry_1, entry_2):
    """Count the relevant columns that are null in at least one of the entries.

    Only BRAND, MODEL, RAM, STORAGE and SCREEN_SIZE are considered.

    Args:
        entry_1: First row, supporting list-of-labels selection and `.isnull()`.
        entry_2: Second row, with the same requirements.

    Returns:
        The number of considered columns missing in `entry_1`, `entry_2`, or both.
    """
    relevant_cols = ['BRAND', 'MODEL', 'RAM', 'STORAGE', 'SCREEN_SIZE']
    missing_in_first = entry_1[relevant_cols].isnull()
    missing_in_second = entry_2[relevant_cols].isnull()
    return np.sum(missing_in_first | missing_in_second)
    
    
def match(entry_1, entry_2):
    """Decide whether two entries describe the same product.

    Rules, applied in order:
      1. Different PLUS flags -> no match.
      2. BRAND missing on either side, or different -> no match.
      3. MODEL missing on either side, or different -> match if and only if
         every spec (RAM, STORAGE, SCREEN_SIZE) is present on both sides and
         identical.
      4. Otherwise -> match unless some spec present on both sides differs.

    Args:
        entry_1: First row; must expose a `PLUS` attribute and column lookup.
        entry_2: Second row, with the same requirements.

    Returns:
        A `(is_match, null_dist)` tuple, where `null_dist` is the value of
        `null_distance(entry_1, entry_2)`.
    """
    spec_cols = ['RAM', 'STORAGE', 'SCREEN_SIZE']
    distance = null_distance(entry_1, entry_2)

    if entry_1.PLUS != entry_2.PLUS:
        return False, distance

    if any(either_absent_or_different(entry_1, entry_2, ['BRAND'])):
        return False, distance

    if any(either_absent_or_different(entry_1, entry_2, ['MODEL'])):
        # Without an agreed model, treat the entries as the same product if
        # and only if all specs exist on both sides and are equal.
        return all(both_present_and_same(entry_1, entry_2, spec_cols)), distance

    # Brand and model agree: only an explicit spec conflict rules the pair out.
    has_conflict = any(both_present_and_different(entry_1, entry_2, spec_cols))
    return not has_conflict, distance


def match_ids(id_1, id_2):
    """Look up two rows of the global `data` frame by index label and compare them.

    Args:
        id_1: Index label of the first row.
        id_2: Index label of the second row.

    Returns:
        Whatever `match` returns for the two rows.
    """
    first_row = data.loc[id_1]
    second_row = data.loc[id_2]
    return match(first_row, second_row)

In [4]:
from itertools import combinations

In [48]:
from tqdm.autonotebook import tqdm


def find_matches():
    """Lazily compare every unordered pair of rows in the global `data` frame.

    Progress is reported through a tqdm bar sized to the total pair count.

    Yields:
        `((id_1, id_2), dist)` for each pair that `match_ids` reports as a
        match, where `dist` is the second element returned by `match_ids`.
    """
    row_count = len(data)
    # n choose 2: the number of unordered pairs the progress bar should expect.
    pair_count = (row_count * (row_count - 1)) // 2
    candidate_pairs = combinations(data.index, 2)
    for id_1, id_2 in tqdm(candidate_pairs, total=pair_count):
        is_match, dist = match_ids(id_1, id_2)
        if is_match:
            yield (id_1, id_2), dist

In [49]:
# Create the generator of matching pairs; find_matches is lazy, so no
# comparisons run until the generator is consumed.
matches_iter = find_matches()

In [50]:
# Consume the generator into a frame with one row per matching pair.
matches = pd.DataFrame(matches_iter, columns=['PAIR', 'DIST'])
# Split each (id_1, id_2) tuple into a two-level index, then discard the
# original tuple column.
matches = matches.set_index(
    pd.MultiIndex.from_tuples(matches['PAIR'], names=['ID1', 'ID2'])
).drop(columns='PAIR')
matches.head()

HBox(children=(IntProgress(value=0, max=123753), HTML(value='')))




Unnamed: 0_level_0,Unnamed: 1_level_0,DIST
ID1,ID2,Unnamed: 2_level_1
8105618,10443427,3
8105618,10443622,3
10223260,10392630,3
10223260,10392657,3
10223260,10392663,3


In [54]:
# Write the matched pairs (index included) to a tab-separated file.
matches.to_csv('matches.tsv', sep='\t')