# Processing Data

## Import Libs

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

## Deepmatcher Format

In [2]:
def csv_to_deepmatcher_pattern(path):
    """Load a product CSV, clean it and return its pairwise comparison table.

    The 'match' and 'Unnamed: 0' columns are discarded, then rows that are
    duplicated across all remaining columns and rows containing any missing
    value are removed. The cleaned frame is passed to ``process_data``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns
    -------
    Whatever ``process_data`` returns for the cleaned frame.
    """
    raw = pd.read_csv(path, low_memory=False)
    products = raw.drop(columns=['match', 'Unnamed: 0'])
    # Duplicates are judged over every remaining column.
    products = products.drop_duplicates(subset=products.columns)
    products = products.dropna()
    return process_data(products)

In [3]:
def process_data(df):
    """Build a labelled table with one row per unordered pair of rows in ``df``.

    Every pair of row positions (i, j) with i < j is emitted once; row i
    supplies the 'Left ...' columns and row j the 'Right ...' columns. A pair
    is labelled 'Match' when the two rows share the same 'sku' or the same
    'name', and 'Non-Match' otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sku', 'name', 'brand/name', 'price' and
        'description'.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Label', then title / manufacturer / price / description for
        the left row, then the same four for the right row. The number of
        rows grows quadratically: n * (n - 1) / 2 for n input rows.
    """
    skus = df['sku'].values
    names = df['name'].values
    brands = df['brand/name'].values
    prices = df['price'].values
    descriptions = df['description'].values

    # Strict upper triangle (k=1): each unordered pair once, no self-pairs.
    left, right = np.triu_indices(len(skus), k=1)

    same_sku = skus[left] == skus[right]
    same_name = names[left] == names[right]
    labels = np.where(np.logical_or(same_sku, same_name), 'Match', 'Non-Match')

    pair_columns = {
        'Label': labels,
        'Left Product Title': names[left],
        'Left Manufacturer': brands[left],
        'Left Price': prices[left],
        'Left Description': descriptions[left],
        'Right Product Title': names[right],
        'Right Manufacturer': brands[right],
        'Right Price': prices[right],
        'Right Description': descriptions[right],
    }
    return pd.DataFrame(pair_columns)

In [4]:
# Build the pairwise comparison table from the pre-processed products CSV,
# then print how many pairs carry each label to show the class balance.
df_deepmatcher = csv_to_deepmatcher_pattern('../data/interim/second data pre-processing/products_deepmatcher.csv')
print(df_deepmatcher['Label'].value_counts())

Non-Match    45433570
Match             708
Name: Label, dtype: int64


## Undersampling / Save Train-Test Splits

In [7]:
# Undersampling: keep every 'Match' pair and draw a fixed-size random subset
# (without replacement) of the far more numerous 'Non-Match' pairs.
label_column = df_deepmatcher['Label']
df_majority = df_deepmatcher.loc[label_column == 'Non-Match']
df_minority = df_deepmatcher.loc[label_column == "Match"]
df_majority_downsampled = resample(
    df_majority,
    n_samples=1000,
    replace=False,
    random_state=42,
)
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Separate the features from the target, then hold out 25% of the rows for testing.
y = df_balanced["Label"]
X = df_balanced.drop(columns="Label")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Write each split to its own CSV file, without the index column.
for csv_path, split_data in (
    ("../data/processed/undersampling/X_train.csv", X_train),
    ("../data/processed/undersampling/X_test.csv", X_test),
    ("../data/processed/undersampling/y_train.csv", y_train),
    ("../data/processed/undersampling/y_test.csv", y_test),
):
    split_data.to_csv(csv_path, index=False)