# Model Evaluation
## Overview
This notebook demonstrates how to evaluate the model using human judgement.

### Objective
- Find the best vectorizer
- Find the best similarity threshold

## Installation
Run the following commands to clone the repository and install the required package.

In [None]:
! git clone https://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@github.com/carloabimanyu/dsw-data-challenge-2023.git
! pip install sparse-dot-topn

### Import libraries and define constants

In [1]:
import os
import sys

import pandas as pd

# Location of the cloned repository when the notebook runs on Google Colab.
colab_path = '/content/dsw-data-challenge-2023/'

# Make the project's `src` package importable: '../' covers a local run
# (presumably the notebook sits one directory below the repository root --
# confirm), `colab_path` covers a Colab run.
sys.path.append('../')
sys.path.append(colab_path)

from src import utils
from src.product import Product

config = utils.load_config()

# The processed-data paths from the config are prefixed with the project root.
# The root is picked automatically: the Colab clone directory when it exists,
# otherwise the parent directory. This replaces the manual
# "uncomment this if run in Colab" toggle.
project_root = colab_path if os.path.isdir(colab_path) else '../'

for path_key in (
    'catalog_data_processed_path',
    'pos_data_processed_path',
    'external_data_processed_path',
    'catalog_external_processed_path',
):
    config[path_key] = project_root + config[path_key]

### Load dataset

In [2]:
# Load the four processed datasets from the paths stored in `config`,
# in the same order as the names on the left-hand side.
# NOTE(review): `utils.pickle_load` presumably unpickles the file -- only use
# it on trusted, project-generated files.
catalog, pos, external, catalog_external = (
    utils.pickle_load(config[path_key])
    for path_key in (
        'catalog_data_processed_path',
        'pos_data_processed_path',
        'external_data_processed_path',
        'catalog_external_processed_path',
    )
)

In [5]:
# Peek at the first two rows of the combined catalog + external data.
catalog_external.head(n=2)

Unnamed: 0,Product SKU,Brand,Type,Formula
0,Urea Petro,PIHC,Urea,
1,Urea PIM,PIHC,Urea,


In [6]:
# Peek at the first two rows of the POS data.
pos.head(n=2)

Unnamed: 0,Product SKU,Brand,Type,Formula,Metrics,Full Name
0,Pupuk Urea N,,,,46%,Pupuk Urea N 46%
1,Pupuk Amonium Sulfat ZA,,,,,Pupuk Amonium Sulfat ZA


## Evaluation

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from src.similarity import spdt
from src.similarity.ngrams import ngrams

In [7]:
# Build the corpus to vectorize: catalog SKUs first, then POS SKUs,
# re-indexed from 0 so every position is unique across both sources.
data = pd.concat(
    [frame['Product SKU'] for frame in (catalog_external, pos)],
    ignore_index=True,
)

In [45]:
# Vectorize every SKU with TF-IDF. Tokenization is delegated to the project's
# `ngrams` analyzer; terms that occur in fewer than 2 SKUs are ignored (min_df=2).
vectorizer = TfidfVectorizer(min_df=2, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(data)

# Score every SKU against every other SKU.
# NOTE(review): `ntop=2` / `lower_bound=0.7` presumably keep at most the two
# best candidates per SKU with similarity >= 0.7 -- confirm in src.similarity.spdt.
matches = spdt.awesome_cossim_top(
    tf_idf_matrix,
    tf_idf_matrix.transpose(),
    ntop=2,
    lower_bound=0.7
)

# NOTE(review): `top=44379` is hard-coded, presumably the number of matches for
# the current dataset -- confirm, and derive it from `matches` if possible so
# the cell keeps working when the data changes.
matches_df = spdt.get_matches_df(matches, data, top=44379)

# Drop (near-)perfect scores, presumably SKUs matched with themselves or exact
# duplicates. `.copy()` yields an independent frame so the column assignments
# below do not raise SettingWithCopyWarning.
matches_df = matches_df[matches_df['similarity'] < 0.9999].copy()

# Flag which side of each pair is a catalog SKU (1) or not (0). The unique
# catalog SKUs are computed once instead of once per row.
catalog_skus = catalog_external['Product SKU'].unique()
matches_df['left_in_catalog'] = matches_df['left_side'].isin(catalog_skus).astype(int)
matches_df['right_in_catalog'] = matches_df['right_side'].isin(catalog_skus).astype(int)

# Keep only cross-source pairs: exactly one side is in the catalog.
# With 0/1 flags this is equivalent to (left == 0 and right == 1) or
# (left == 1 and right == 0).
matches_df = matches_df[
    matches_df['left_in_catalog'] != matches_df['right_in_catalog']
]

In [46]:
# Show the candidate pairs ordered from least to most similar.
matches_df.sort_values('similarity')

Unnamed: 0,left_side,right_side,similarity,left_in_catalog,right_in_catalog
486,MerokeFITOFLEX,meroke FITOFLEX,0.702422,1,0
620,Jon UP 480 SL,jon up,0.702729,1,0
215,CAP TAWON 12-12-17-2Mg,12-12-17-2,0.704162,1,0
9356,Kuriza,PUPUK KURIZA KUJANG,0.704188,0,1
449,PUPUK KURIZA KUJANG,Kuriza,0.704188,1,0
...,...,...,...,...,...
427,PUPUK NPK PUSRI SINGKONG 17-6-25,NPK Pusri Singkong 17-6-25,0.913948,1,0
415,PUPUK NPK PUSRI 12-12-17-2,NPK Pusri 12-12-17-2,0.915499,1,0
421,PUPUK NPK KEBOMAS 15-15-6,pupuk npk kebomas 15-15-15,0.918789,1,0
4583,Magnesium Sulfate SU,Magnesium Sulfate,0.927407,0,1
