# Processing Data

## Import Libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

## Deepmatcher Format

In [9]:
def csv_to_deepmatcher_pattern(path):
    """Load a product CSV and expand it into labelled product pairs.

    The 'match' and 'Unnamed: 0' columns are discarded, then rows that are
    exact duplicates across all remaining columns and rows containing any
    missing value are removed. The cleaned frame is passed to
    ``process_data``, whose result is returned unchanged.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    Whatever ``process_data`` returns for the cleaned frame.

    Raises
    ------
    KeyError
        If the CSV lacks the 'match' or 'Unnamed: 0' column.
    """
    products = pd.read_csv(path, low_memory=False)
    products = (
        products
        .drop(columns=['match', 'Unnamed: 0'])
        .drop_duplicates()  # default subset is every column
        .dropna()
    )
    return process_data(products)

In [10]:
def process_data(df):
    """Build every unordered pair of rows and label each pair.

    A pair is labelled 'Match' when the two rows share the same 'sku' OR the
    same 'name'; otherwise it is labelled 'Non-Match'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sku', 'name', 'brand/name', 'price' and
        'description'.

    Returns
    -------
    pandas.DataFrame
        One row per pair (i, j) with i < j, with columns 'label',
        'left_product', 'left_manufacturer', 'left_price',
        'left_description', 'right_product', 'right_manufacturer',
        'right_price', 'right_description'.

    Notes
    -----
    The output has n * (n - 1) / 2 rows for n input rows, so memory grows
    quadratically with the input size.
    """
    sku_array = df['sku'].values
    name_array = df['name'].values
    brand_array = df['brand/name'].values
    price_array = df['price'].values
    description_array = df['description'].values

    # Positions above the main diagonal: each unordered pair exactly once,
    # never a row paired with itself.
    left, right = np.triu_indices(len(sku_array), k=1)

    left_names = name_array[left]
    right_names = name_array[right]

    # Combine the boolean masks directly; the string labels are only
    # materialised once, for the final column.
    is_match = np.logical_or(sku_array[left] == sku_array[right],
                             left_names == right_names)
    comparison = np.where(is_match, 'Match', 'Non-Match')

    result_df = pd.DataFrame({'label': comparison,
                              'left_product': left_names,
                              'left_manufacturer': brand_array[left],
                              'left_price': price_array[left],
                              'left_description': description_array[left],
                              'right_product': right_names,
                              'right_manufacturer': brand_array[right],
                              'right_price': price_array[right],
                              'right_description': description_array[right]})
    return result_df

In [20]:
# Expand the pre-processed product file into labelled product pairs.
df_deepmatcher = csv_to_deepmatcher_pattern(
    '../data/interim/second data pre-processing/products_deepmatcher.csv'
)

# Report the class balance while the labels are still human-readable.
label_counts = df_deepmatcher['label'].value_counts()
print(label_counts)

# Encode the text labels as integers: 0 = Non-Match, 1 = Match.
mapping = {'Non-Match': 0, 'Match': 1}
df_deepmatcher['label'] = df_deepmatcher['label'].map(mapping)

Non-Match    45433570
Match             708
Name: label, dtype: int64


## Undersampling / Save Train, Validation and Test Splits

In [21]:
# Undersampling: keep every label-1 row and a fixed-size random sample
# (drawn without replacement) of the label-0 rows.
df_majority = df_deepmatcher[df_deepmatcher['label'] == 0]
df_minority = df_deepmatcher[df_deepmatcher['label'] == 1]
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=1000,
    random_state=42,
)
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Hold out 20% of the undersampled pairs as the test set.
# NOTE(review): the split is not stratified on the label; confirm that is intended.
y = df_balanced["label"]
X = df_balanced.drop(columns="label")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Re-attach the label as the first column of each split.
df_train = pd.concat([y_train, X_train], axis=1)
test = pd.concat([y_test, X_test], axis=1)

# Carve the validation set off the tail of the training rows: the first 80%
# of rows (by position) stay in train, the rest become valid.
train_split = 0.8
train_size = int(len(df_train) * train_split)
train = df_train.iloc[:train_size]
valid = df_train.iloc[train_size:]

# Write each split to its own CSV without the index column.
split_outputs = {
    "../data/processed/undersampling/train.csv": train,
    "../data/processed/undersampling/valid.csv": valid,
    "../data/processed/undersampling/test.csv": test,
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_csv(out_path, index=False)