In [2]:
import pandas as pd
import numpy as np


In [5]:
df1 = pd.DataFrame({"column1": ["BMW", "MERCEDES", "VW","OPEL"]})
df2 = pd.DataFrame({"column2": ["MERCEDES Benz Gmbh", "Volkswagen", "Opel Rüsselsheim", "Bayrische Motorenwerke", "OPER"]})

In [6]:
unique_column1 = df1["column1"].drop_duplicates().reset_index(drop=True)
unique_column2 = df2["column2"].drop_duplicates().reset_index(drop=True)
len_column1 = unique_column1.shape[0]
len_column2 = unique_column2.shape[0]
maxlen_column1 = unique_column1.str.len().max()
maxlen_column2 = unique_column2.str.len().max()

In [17]:
def create_series_for_comparison(column1, column2):
    
    new_column1 = pd.Series(column1.tolist() * column2.shape[0])
    new_column2 = []
    for element in column2.tolist():
        list_element = [element] * column1.shape[0]
        new_column2 = new_column2 + list_element

    new_column2 = pd.Series(new_column2)
    
    return new_column1, new_column2

column1, column2 = create_series_for_comparison(unique_column1, unique_column2)

In [149]:
def create_levenshtein_array(
    column1, column2, len_column1, len_column2, maxlen_column1, maxlen_column2
):

    levenshtein_array = np.zeros(
        [len_column1 * len_column2, maxlen_column1, maxlen_column2], "int16"
    )

    for column1_index in range(maxlen_column1):
        for column2_index in range(maxlen_column2):

            if column1_index == 0:

                insertion = np.array(
                    [np.iinfo("int16").max] * len_column1 * len_column2
                )

            else:

                insertion = levenshtein_array[:, column1_index - 1, column2_index] + (
                    ~pd.isnull(column1.str[column1_index])
                ).astype("int16")

            if column2_index == 0:

                deletion = np.array([np.iinfo("int16").max] * len_column1 * len_column2)

            else:

                deletion = levenshtein_array[:, column1_index, column2_index - 1] + (
                    ~pd.isnull(column2.str[column2_index])
                ).astype("int16")

            if column1_index == 0 or column2_index == 0:

                replacement = np.array(
                    [np.iinfo("int16").max] * len_column1 * len_column2
                )

                if column1_index == 0 and column2_index == 0:

                    comparison = (
                        column1.str[column1_index] != column2.str[column2_index]
                    )
                    replacement = comparison.astype("int16")

            else:

                comparison = column1.str[column1_index] != column2.str[column2_index]
                replacement = levenshtein_array[
                    :, column1_index - 1, column2_index - 1
                ] + comparison.astype("int16")

            levenshtein_array[:, column1_index, column2_index] = np.array(
                [insertion, deletion, replacement]
            ).min(axis=0)

    return levenshtein_array


levenshtein_array = create_levenshtein_array(
    column1, column2, len_column1, len_column2, maxlen_column1, maxlen_column2
)

In [150]:
distance_frame = pd.DataFrame(
    levenshtein_array[:, maxlen_column1 - 1, maxlen_column2 - 1].reshape(
        [len_column2, len_column1]
    )
)

In [151]:
distance_frame

Unnamed: 0,0,1,2,3
0,18,10,18,17
1,10,10,9,10
2,16,15,16,15
3,20,22,22,22
4,4,7,4,1


In [158]:
def create_divisor_frame(column1, column2):

    divisor_frame_1 = pd.concat([column1.str.len()] * column2.shape[0], axis=1).T
    divisor_frame_2 = pd.concat([column2.str.len()] * column1.shape[0], axis=1)

    divisor_frame_1 = divisor_frame_1.T.reset_index(drop=True).T
    divisor_frame_1.reset_index(drop=True, inplace=True)

    divisor_frame_2 = divisor_frame_2.T.reset_index(drop=True).T
    divisor_frame_2.reset_index(drop=True, inplace=True)

    divisor_frame = pd.concat([divisor_frame_1, divisor_frame_2]).groupby(level=0).max().astype("float32")

    return divisor_frame

divisor_frame = create_divisor_frame(unique_column1, unique_column2)

In [159]:
similarity_frame = 1 - (distance_frame / divisor_frame)

In [160]:
divisor_frame

Unnamed: 0,0,1,2,3
0,18.0,18.0,18.0,18.0
1,10.0,10.0,10.0,10.0
2,16.0,16.0,16.0,16.0
3,22.0,22.0,22.0,22.0
4,4.0,8.0,4.0,4.0


In [161]:
similarity_frame.index = unique_column2.to_list()
similarity_frame.columns = unique_column1.to_list()

In [166]:
mapping = pd.DataFrame(similarity_frame.idxmax(axis=1)).to_dict()[0]

In [167]:
mapping

{'MERCEDES Benz Gmbh': 'MERCEDES',
 'Volkswagen': 'VW',
 'Opel Rüsselsheim': 'MERCEDES',
 'Bayrische Motorenwerke': 'BMW',
 'OPER': 'OPEL'}

In [168]:
similarity_frame

Unnamed: 0,BMW,MERCEDES,VW,OPEL
MERCEDES Benz Gmbh,0.0,0.444444,0.0,0.055556
Volkswagen,0.0,0.0,0.1,0.0
Opel Rüsselsheim,0.0,0.0625,0.0,0.0625
Bayrische Motorenwerke,0.090909,0.0,0.0,0.0
OPER,0.0,0.125,0.0,0.75
