In [10]:
import pandas as pd
import numpy as np

In [33]:
df1 = pd.DataFrame({"column1": ["BMW", "MERCEDES", "VW", "OPEL"]})
df2 = pd.DataFrame(
    {
        "column2": [
            "MERCEDES Benz Gmbh",
            "Volkswagen",
            "Opel Rüsselsheim",
            "Bayrische Motorenwerke",
            "OPER",
        ]
    }
)

In [72]:
class AutoStringMapper:
    
    def __init__(self, from_column, to_column, ignore_case=True):

        from_column = self.clean_column(from_column, "from_column")
        to_column = self.clean_column(to_column, "to_column")

        unique_from_column = from_column.drop_duplicates().reset_index(drop=True)
        unique_to_column = to_column.drop_duplicates().reset_index(drop=True)

        len_from_column = unique_from_column.shape[0]
        len_to_column = unique_to_column.shape[0]

        maxlen_from_column = unique_from_column.str.len().max()
        maxlen_to_column = unique_to_column.str.len().max()

        (
            from_column_combinations,
            to_column_combinations,
        ) = self.create_series_for_comparison(unique_from_column, unique_to_column)

        if ignore_case:
            from_column_combinations = from_column_combinations.str.lower()
            to_column_combinations = to_column_combinations.str.lower() 

        levenshtein_array = self.create_levenshtein_array(
            from_column_combinations,
            to_column_combinations,
            len_from_column,
            len_to_column,
            maxlen_from_column,
            maxlen_to_column,
        )

        distance_frame = pd.DataFrame(
            levenshtein_array[:, maxlen_from_column - 1, maxlen_to_column - 1].reshape(
                [len_to_column, len_from_column]
            )
        )

        divisor_frame = self.create_divisor_frame(unique_from_column, unique_to_column)

        self.similarity_frame = 1 - (distance_frame / divisor_frame)

        self.similarity_frame.index = unique_to_column.to_list()
        self.similarity_frame.columns = unique_from_column.to_list()

    def get_mapping(self, similarity_threshold=0.0):
            
        if similarity_threshold < 0.0 or similarity_threshold > 1.0:
            raise ValueError("Parameter similarity_threshold must be between 0 and 1")
            
        mapping = pd.DataFrame(self.similarity_frame.idxmax(axis=0))
        
        mapping[0] = mapping[0].where(self.similarity_frame.max(axis=0) >= similarity_threshold, np.nan)

        return mapping.to_dict()[0]

    @staticmethod
    def clean_column(column, column_name):
        if type(column) == np.ndarray:
            return pd.Series(column).astype(str)
        elif type(column) == list:
            return pd.Series(column).astype(str)
        elif type(column) == pd.Series:
            return column.astype(str)
        else:
            raise ValueError(
                f"{column_name} not of type numpy.ndarray, pandas.Series or list"
            )

    @staticmethod
    def create_series_for_comparison(column1, column2):

        new_column1 = pd.Series(column1.tolist() * column2.shape[0])
        new_column2 = []
        for element in column2.tolist():
            list_element = [element] * column1.shape[0]
            new_column2 = new_column2 + list_element

        new_column2 = pd.Series(new_column2)

        return new_column1, new_column2

    @staticmethod
    def create_levenshtein_array(
        column1, column2, len_column1, len_column2, maxlen_column1, maxlen_column2
    ):

        levenshtein_array = np.zeros(
            [len_column1 * len_column2, maxlen_column1, maxlen_column2], "int16"
        )

        for column1_index in range(maxlen_column1):
            for column2_index in range(maxlen_column2):

                if column1_index == 0:

                    insertion = np.array(
                        [np.iinfo("int16").max] * len_column1 * len_column2
                    )

                else:

                    insertion = levenshtein_array[
                        :, column1_index - 1, column2_index
                    ] + (~pd.isnull(column1.str[column1_index])).astype("int16")

                if column2_index == 0:

                    deletion = np.array(
                        [np.iinfo("int16").max] * len_column1 * len_column2
                    )

                else:

                    deletion = levenshtein_array[
                        :, column1_index, column2_index - 1
                    ] + (~pd.isnull(column2.str[column2_index])).astype("int16")

                if column1_index == 0 or column2_index == 0:

                    replacement = np.array(
                        [np.iinfo("int16").max] * len_column1 * len_column2
                    )

                    if column1_index == 0 and column2_index == 0:

                        comparison = (
                            column1.str[column1_index] != column2.str[column2_index]
                        )
                        replacement = comparison.astype("int16")

                else:

                    comparison = (
                        column1.str[column1_index] != column2.str[column2_index]
                    )
                    replacement = levenshtein_array[
                        :, column1_index - 1, column2_index - 1
                    ] + comparison.astype("int16")

                levenshtein_array[:, column1_index, column2_index] = np.array(
                    [insertion, deletion, replacement]
                ).min(axis=0)

        return levenshtein_array

    @staticmethod
    def create_divisor_frame(column1, column2):

        divisor_frame_1 = pd.concat([column1.str.len()] * column2.shape[0], axis=1).T
        divisor_frame_2 = pd.concat([column2.str.len()] * column1.shape[0], axis=1)

        divisor_frame_1 = divisor_frame_1.T.reset_index(drop=True).T
        divisor_frame_1.reset_index(drop=True, inplace=True)

        divisor_frame_2 = divisor_frame_2.T.reset_index(drop=True).T
        divisor_frame_2.reset_index(drop=True, inplace=True)

        divisor_frame = (
            pd.concat([divisor_frame_1, divisor_frame_2])
            .groupby(level=0)
            .max()
            .astype("float32")
        )

        return divisor_frame

In [73]:
df1["column1"]

0         BMW
1    MERCEDES
2          VW
3        OPEL
Name: column1, dtype: object

In [74]:
df2["column2"]

0        MERCEDES Benz Gmbh
1                Volkswagen
2          Opel Rüsselsheim
3    Bayrische Motorenwerke
4                      OPER
Name: column2, dtype: object

In [75]:
AutoStringMapper(from_column=df1["column1"], to_column=df2["column2"]).get_mapping(similarity_threshold=0.2)

{'BMW': nan, 'MERCEDES': 'MERCEDES Benz Gmbh', 'VW': nan, 'OPEL': 'OPER'}

In [76]:
AutoStringMapper(from_column=df1["column1"], to_column=df2["column2"]).similarity_frame

Unnamed: 0,BMW,MERCEDES,VW,OPEL
MERCEDES Benz Gmbh,0.055556,0.444444,0.0,0.055556
Volkswagen,0.1,0.1,0.2,0.1
Opel Rüsselsheim,0.0,0.25,0.0,0.25
Bayrische Motorenwerke,0.136364,0.181818,0.045455,0.045455
OPER,0.0,0.125,0.0,0.75


In [69]:
.where(pd.DataFrame(AutoStringMapper(df1["column1"], df2["column2"]).similarity_frame.max(axis=0)) < 0.2, np.nan)

Unnamed: 0,0
BMW,True
MERCEDES,False
VW,True
OPEL,False
