# Data Processing

In [3]:
import pandas as pd
import numpy as np

## Smartphones

In [1]:
def csv_to_deepmatcher_pattern(path):
    """Load a product CSV and convert it into a table of labelled row pairs.

    The file is read with pandas, reduced to the columns ``name``,
    ``brand/name``, ``description``, ``sku`` and ``offers/price``, stripped of
    fully duplicated rows and of rows containing any missing value, displayed,
    and finally handed to ``process_data``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    Whatever ``process_data`` returns for the cleaned frame.

    Raises
    ------
    KeyError
        If one of the required columns is absent even after the price column
        name has been normalised.
    """
    wanted = ['name', 'brand/name', 'description', 'sku', 'offers/price']
    df = pd.read_csv(path, low_memory=False)

    # Normalise the price column name BEFORE selecting columns. The previous
    # order (select, then rename) made the rename unreachable and raised
    # KeyError on any file lacking 'offers/price'. The rename is skipped when
    # 'offers/price' already exists so it cannot create a duplicate column.
    # NOTE(review): the nested key comes from the original rename mapping;
    # confirm it matches the actual header of the affected CSV files.
    if 'offers/price' not in df.columns:
        df = df.rename(columns={'offers/0/offers/0/price': 'offers/price'})

    # Non-inplace chaining avoids mutating a slice of the frame that was read.
    # Duplicates are judged on all selected columns, as before.
    df = df[wanted].drop_duplicates().dropna()

    # `display` is the IPython/Jupyter builtin; it renders the cleaned frame.
    display(df)
    return process_data(df)

In [4]:
def process_data(df):
    """Pair every row of ``df`` with every later row and label each pair.

    A pair is labelled ``'Match'`` when the two rows share the same ``sku``
    or the same ``name``; otherwise it is labelled ``'Non-Match'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``sku``, ``name``, ``brand/name``,
        ``offers/price`` and ``description``.

    Returns
    -------
    pandas.DataFrame
        One row per pair, with a ``Label`` column followed by the
        ``Left ...`` and ``Right ...`` title, manufacturer, price and
        description columns.
    """
    skus = df['sku'].values
    # Output field name -> source values, in the order the columns are emitted.
    fields = {
        'Product Title': df['name'].values,
        'Manufacturer': df['brand/name'].values,
        'Price': df['offers/price'].values,
        'Description': df['description'].values,
    }
    names = fields['Product Title']

    # Positions (i, j) with i < j: each unordered pair once, no self-pairs.
    left_idx, right_idx = np.triu_indices(len(skus), k=1)

    same_sku = skus[left_idx] == skus[right_idx]
    same_name = names[left_idx] == names[right_idx]
    labels = np.where(np.logical_or(same_sku, same_name), 'Match', 'Non-Match')

    pairs = {'Label': labels}
    for side, idx in (('Left', left_idx), ('Right', right_idx)):
        for field, values in fields.items():
            pairs[f'{side} {field}'] = values[idx]
    return pd.DataFrame(pairs)

## Kabum - Smartphones

In [5]:
# Build the labelled pair table from the Kabum smartphones CSV, then print how
# many pairs fall under each label.
df_kabumSmartphones_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/first data pre-processing/KabumSmartphones.csv')
print(df_kabumSmartphones_deepmatcher['Label'].value_counts())

Unnamed: 0,name,brand/name,description,sku,offers/price
0,"Smartphone Motorola Moto E20, 32GB, 2GB RAM, O...",Motorola,"Smartphone Motorola Moto E20, 32GB, 2GB RAM, O...",265965,649.00
1,"Smartphone Samsung Galaxy A03 Core, 2GB RAM, 3...",Samsung,"Smartphone Samsung Galaxy A03 Core, 32GB, 2GB ...",378876,599.00
2,"Carregador de Tomada Apple USB-C, para iPhone ...",Apple,"Carregador de Tomada Apple USB-C, 20W, Branco ...",208371,149.00
3,"Smartphone Samsung Galaxy A13, 4GB RAM, 128GB,...",Samsung,"Smartphone Samsung Galaxy A13 128GB, Preto Con...",324820,1199.00
4,"Smartphone Motorola Moto G71, 5G, 6GB RAM, 128...",Motorola,"Smartphone Motorola Moto G71, 5G, 6GB RAM, 128...",308550,1749.00
...,...,...,...,...,...
10409,Pelicula Applewatch 44mm Invisivel,HPRIME,Pelicula AppleWatch 44mm Invisivel As Película...,366154,16.39
10410,Pelicula Iphone Xs Max/ 11 Pro Max Invisivel,HPRIME,Pelicula Iphone Xs Max/ 11 Pro Max InvisivelAs...,366155,16.39
10411,Pelicula Iphone 7/ 8/ Se2022 Invisivel,HPRIME,Pelicula Iphone 7/ 8/ SE2022 InvisivelAs Pelíc...,366157,16.39
10412,Pelicula Applewatch 42mm Invisivel,HPRIME,Pelicula AppleWatch 42mm InvisivelAs Películas...,366164,16.39


Non-Match    35527850
Match             385
Name: Label, dtype: int64


## Carrefour - Smartphones

In [36]:
# Build the labelled pair table from the Carrefour smartphones CSV, then print
# how many pairs fall under each label.
df_carrefourSmartphone_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/first data pre-processing/CarrefourSmartphone.csv')
print(df_carrefourSmartphone_deepmatcher['Label'].value_counts())

KeyError: "['offers/price'] not in index"

## Girafa - Smartphones

In [6]:
# Build the labelled pair table from the Girafa smartphones CSV, then print how
# many pairs fall under each label.
df_girafa_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/first data pre-processing/GirafaSmartphone.csv')
print(df_girafa_deepmatcher['Label'].value_counts())

Non-Match    5253
Name: Label, dtype: int64


## Loja Solar - Smartphones

In [10]:
# The crawl was done incorrectly, since 184 is the total number of items.
# Build the labelled pair table from the Loja Solar smartphones CSV, then print
# how many pairs fall under each label.
df_lojaSolar_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/first data pre-processing/LojaSolarSmartphone.csv')
print(df_lojaSolar_deepmatcher['Label'].value_counts())

Non-Match    40898
Match          143
Name: Label, dtype: int64


## Angeloni - Smartphones

In [12]:
# Build the labelled pair table from the Angeloni smartphones CSV, then print
# how many pairs fall under each label.
df_angeloni_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/first data pre-processing/AngeloniSmartphone.csv')
print(df_angeloni_deepmatcher['Label'].value_counts())

Non-Match    30504
Match          124
Name: Label, dtype: int64


## Casas Bahia - Smartphones

In [33]:
# Casas Bahia is split across two CSV files (iPhone and Android); each one is
# converted to its own labelled pair table and its label counts are printed.
df_casasBahiaIphone_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/first data pre-processing/CasasBahiaIphone.csv')
print(df_casasBahiaIphone_deepmatcher['Label'].value_counts())

df_casasBahiaAndroid_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/first data pre-processing/CasasBahiaAndroid.csv')
print(df_casasBahiaAndroid_deepmatcher['Label'].value_counts())

Non-Match    283058
Match            70
Name: Label, dtype: int64
Non-Match    4953108
Match            270
Name: Label, dtype: int64


In [None]:
#df_kabumSmartphones_deepmatcher.to_csv('../data/processed/KabumSmartphonesDeepmatcher.csv')