In [1]:
# Feature extraction

In [2]:
import re
import numpy as np

def get_rate(row):
    """Return the Jaccard overlap of the numbers found in the two product names.

    Every run of digits in ``row.externalName`` and ``row.GetirName`` is
    collected into a set (as strings), and the score is
    ``len(intersection) / len(union)`` of the two sets.

    Args:
        row: Object exposing string attributes ``externalName`` and
            ``GetirName`` (e.g. a DataFrame row passed by ``df.apply``).

    Returns:
        ``1`` when neither name contains a digit run, otherwise a float in
        ``[0, 1]``.
    """
    digit_run = re.compile(r'[0-9]+')
    external_numbers = set(digit_run.findall(row.externalName))
    getir_numbers = set(digit_run.findall(row.GetirName))

    # No numbers on either side: treat as full agreement (also avoids 0/0).
    if not external_numbers and not getir_numbers:
        return 1

    shared = external_numbers & getir_numbers
    combined = external_numbers | getir_numbers
    return len(shared) / len(combined)


def get_unique_number_count(row):
    """Count the distinct digit runs appearing in either product name.

    Args:
        row: Object exposing string attributes ``externalName`` and
            ``GetirName``.

    Returns:
        The size of the union of the digit-run sets of both names
        (numbers are compared as strings, so ``"08"`` and ``"8"`` differ).
    """
    names = (row.externalName, row.GetirName)
    distinct_numbers = set()
    for name in names:
        distinct_numbers.update(re.findall(r'[0-9]+', name))
    return len(distinct_numbers)


def levenshteinRecursive(seq1, seq2):
    """Return the Levenshtein edit distance between two sequences, recursively.

    The recurrence compares the last elements of the current prefixes and
    takes the cheapest of deletion, insertion and substitution/match.
    Sub-results are memoised on the pair of prefix lengths, so each of the
    ``(len(seq1) + 1) * (len(seq2) + 1)`` sub-problems is solved once instead
    of being recomputed along every recursion path.

    Args:
        seq1: First sequence (a string, or any indexable sequence with
            ``len`` whose elements support ``==``).
        seq2: Second sequence, same requirements as ``seq1``.

    Returns:
        The edit distance as an ``int``.

    Note:
        The recursion depth can reach ``len(seq1) + len(seq2)``; very long
        inputs may still hit Python's recursion limit.
    """
    memo = {}

    def distance(len1, len2):
        # Distance between seq1[:len1] and seq2[:len2].
        if len1 == 0:
            return len2
        if len2 == 0:
            return len1

        key = (len1, len2)
        if key in memo:
            return memo[key]

        cost = 0 if seq1[len1 - 1] == seq2[len2 - 1] else 1
        result = min(distance(len1 - 1, len2) + 1,          # delete from seq1
                     distance(len1, len2 - 1) + 1,          # insert into seq1
                     distance(len1 - 1, len2 - 1) + cost)   # match / substitute
        memo[key] = result
        return result

    return distance(len(seq1), len(seq2))


def levenshtein(seq1, seq2):
    """Compute the Levenshtein edit distance with a full dynamic-programming table.

    ``table[i, j]`` holds the distance between the first ``i`` elements of
    ``seq1`` and the first ``j`` elements of ``seq2``.

    Args:
        seq1: First sequence (indexable, with ``len``).
        seq2: Second sequence (indexable, with ``len``).

    Returns:
        The distance as a NumPy float (the table is a float array), e.g.
        ``2.0`` rather than ``2``.
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    table = np.zeros((rows, cols))

    # Distance to/from the empty prefix is just the prefix length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for i in range(1, rows):
        for j in range(1, cols):
            # A matching pair of elements costs nothing on the diagonal step.
            substitution = 0 if seq1[i - 1] == seq2[j - 1] else 1
            table[i, j] = min(
                table[i - 1, j] + 1,
                table[i - 1, j - 1] + substitution,
                table[i, j - 1] + 1,
            )

    return table[-1, -1]


def sorted_levenshtein_apply(row):
    """Row-wise sorted Levenshtein distance between the two product names.

    The characters of ``row.externalName`` and ``row.GetirName`` are each
    sorted before the distance is taken, so character order within a name
    does not affect the result.

    Args:
        row: Object exposing ``externalName`` and ``GetirName``.

    Returns:
        Whatever ``levenshtein`` returns for the two sorted strings.
    """
    ordered_external, ordered_getir = (
        ''.join(sorted(name)) for name in (row.externalName, row.GetirName)
    )
    return levenshtein(ordered_external, ordered_getir)

def sorted_levenshtein_rate_apply(row):
    """Row-wise similarity score based on the sorted Levenshtein distance.

    The characters of both names are sorted, the edit distance between the
    sorted strings is computed, and the score is
    ``1 - distance / max(len(name1), len(name2))``.

    Args:
        row: Object exposing ``externalName`` and ``GetirName``.

    Returns:
        A similarity score where ``1`` means the sorted names are identical.
        Two empty names score ``1.0``.
    """
    product1 = ''.join(sorted(row.externalName))
    product2 = ''.join(sorted(row.GetirName))
    max_len = max(len(product1), len(product2))
    if max_len == 0:
        # Both names are empty, hence identical; also avoids a 0/0 division.
        return 1.0
    distance = levenshtein(product1, product2)
    return 1 - (distance / max_len)


def sorted_levenshtein(sequence1, sequence2):
    """Levenshtein distance between the character-sorted forms of two sequences.

    Args:
        sequence1: First iterable of characters/strings.
        sequence2: Second iterable of characters/strings.

    Returns:
        Whatever ``levenshtein`` returns for the two sorted, re-joined strings.
    """
    ordered_first = ''.join(sorted(sequence1))
    ordered_second = ''.join(sorted(sequence2))
    return levenshtein(ordered_first, ordered_second)

def sorted_levenshtein_rate(seq1, seq2):
    """Similarity score based on the sorted Levenshtein distance.

    Both inputs are character-sorted, the edit distance between the sorted
    strings is computed, and the score is
    ``1 - distance / max(len(seq1), len(seq2))``.

    Args:
        seq1: First iterable of characters/strings.
        seq2: Second iterable of characters/strings.

    Returns:
        A similarity score where ``1`` means the sorted inputs are identical.
        Two empty inputs score ``1.0``.
    """
    product1 = ''.join(sorted(seq1))
    product2 = ''.join(sorted(seq2))
    max_len = max(len(product1), len(product2))
    if max_len == 0:
        # Both inputs are empty, hence identical; also avoids a 0/0 division.
        return 1.0
    distance = levenshtein(product1, product2)
    return 1 - (distance / max_len)

def levenshtein_rate(product1, product2):
    """Similarity score derived from the Levenshtein distance.

    The score is ``1 - distance / max(len(product1), len(product2))``.

    Args:
        product1: First sequence (indexable, with ``len``).
        product2: Second sequence (indexable, with ``len``).

    Returns:
        A similarity score where ``1`` means the inputs are identical.
        Two empty inputs score ``1.0``.
    """
    max_len = max(len(product1), len(product2))
    if max_len == 0:
        # Both inputs are empty, hence identical; also avoids a 0/0 division.
        return 1.0
    distance = levenshtein(product1, product2)
    return 1 - (distance / max_len)


if __name__ == "__main__":
    # Small demo: compare two product names that differ only in pack size,
    # using both the plain and the character-sorted Levenshtein measures.
    met1 = "Selpak Kağıt Havlu 8'li"
    met2 = "Selpak Kağıt Havlu 12'li"
    print(
        'Levenshtein Distance: {}, MatchScore: {} '.format(
            levenshtein(met1, met2),
            levenshtein_rate(met1, met2),
        )
    )
    print(
        'Sorted Levenshtein Distance: {}, MatchScore: {} '.format(
            sorted_levenshtein(met1, met2),
            sorted_levenshtein_rate(met1, met2),
        )
    )

Levenshtein Distance: 2.0, MatchScore: 0.9166666666666666 
Sorted Levenshtein Distance: 2.0, MatchScore: 0.9166666666666666 


In [11]:
# Deep learning (DL) and machine learning (ML) algorithms
# Computes the Levenshtein, Jaccard, or another (XYZ) distance between a pair of words.
# Steps:
# 1. Description and loading of the dataset.
# 2. Measuring string similarity (Levenshtein distance and sorted Levenshtein distance).
# 3. Issues with string-matching measures.
# 4. Feature engineering on the dataset.
# 5. Prediction of matches with machine learning (Perceptron, Logistic Regression, Support Vector Machines, Multilayer Perceptron).

In [6]:
# !pip install feature-extractor

Collecting feature-extractor
  Downloading feature_extractor-0.0.1-py3-none-any.whl (2.2 kB)
Installing collected packages: feature-extractor
Successfully installed feature-extractor-0.0.1


In [8]:
import pandas as pd
import re
import numpy as np
# import featureExtractors as fe
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score
from sklearn.metrics import f1_score

# Load the labelled product pairs from the 'getir' sheet. The code below
# reads the columns externalName, GetirName, priceRate and Match.
df = pd.read_excel('getir.xlsx', sheet_name='getir')

# --- Feature engineering -------------------------------------------------
df['levenshteinDistance'] = df.apply(sorted_levenshtein_apply, axis=1)
# NOTE(review): the reason for the +1 offset is not visible here — confirm.
df['uniqueNumberCount'] = df.apply(get_unique_number_count, axis=1) + 1
df['numberMatchRate'] = df.apply(get_rate, axis=1)
df['matchScore'] = df.apply(sorted_levenshtein_rate_apply, axis=1)
# Shift by 2 before the log; np.log is applied to the whole column at once.
df['normalizedMatchRate'] = np.log(df['numberMatchRate'] + 2)
df['squaredPriceRate'] = df['priceRate'] * df['priceRate']

# --- Train / test split --------------------------------------------------
X = df[['matchScore', 'squaredPriceRate', 'uniqueNumberCount', 'normalizedMatchRate']].values
y = df['Match'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=4, stratify=y
)

# Fit the scaler on the training split only, then apply it to both splits,
# so no test-set statistics leak into training.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# --- Candidate models ----------------------------------------------------
# Every model is seeded so that Restart & Run All reproduces the metrics.
models = {
    'Perceptron': Perceptron(max_iter=40, eta0=0.1, random_state=1),
    'LogisticRegression': LogisticRegression(C=100.0, random_state=1),
    'LinearSVC': SVC(kernel='linear', C=1.0, random_state=1),
    'KernelizedSVC': SVC(kernel='rbf', random_state=1, gamma=5.0, C=1.0),
    'MLP': MLPClassifier(
        activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=False, epsilon=1e-08,
        hidden_layer_sizes=(20, 20, 20), learning_rate='constant',
        learning_rate_init=0.001, max_iter=500, momentum=0.9,
        nesterovs_momentum=True, power_t=0.5,
        random_state=1,  # was None: weight init/shuffling changed on every run
        shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
        verbose=False, warm_start=False,
    ),
}

# --- Fit and evaluate ----------------------------------------------------
for model_name, model in models.items():
    model.fit(X_train_std, y_train)
    y_pred = model.predict(X_test_std)
    print(model_name)
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    print('Fscore: %.2f' % f1_score(y_test, y_pred))
    # For a per-class breakdown, print
    # classification_report(y_test, y_pred, labels=[1, 0], target_names=['match', 'no-match']).
    print('')

Perceptron
Accuracy: 0.98
Fscore: 0.99

LogisticRegression
Accuracy: 0.98
Fscore: 0.99

LinearSVC
Accuracy: 0.98
Fscore: 0.99

KernelizedSVC
Accuracy: 0.95
Fscore: 0.97

MLP
Accuracy: 1.00
Fscore: 1.00

