# Map shortened headers to full headers

The following experimental code attempts to automatically map shortened headers to full headers. It is based on the assumption that the shortened headers are unique and that the full headers are unique. The code uses Levenshtein distance with added weights to find the best match between the shortened headers and the full headers.

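The weights are applied by modifying the strings before they are compared: characters that should carry more weight (the leading characters and the consonants) are repeated several times, so that a mismatch on them costs more edits.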

In [1]:
from itertools import permutations

import Levenshtein

In [2]:
def multiply_characters_at_beginning(string: str, n: int = 3) -> str:
    """Repeat each of the first n characters, with earlier characters repeated more.

    The character at position ``index`` (0-based) within the first ``n``
    characters is repeated ``n + 1 - index`` times; the rest of the string
    is appended unchanged.
    """
    head, tail = string[:n], string[n:]
    pieces = []
    for position, char in enumerate(head):
        # Position 0 gets n + 1 copies, position 1 gets n copies, and so on.
        pieces.append(char * (n + 1 - position))
    return ''.join(pieces) + tail


def multiply_consonants(string: str, n: int = 2) -> str:
    """Repeat every lowercase consonant of the string n times.

    Only the letters in ``bcdfghjklmnpqrstvwxz`` are affected; every other
    character (vowels, ``y``, uppercase letters, digits, punctuation) is
    kept as is.
    """
    consonants = 'bcdfghjklmnpqrstvwxz'
    return ''.join(
        char * n if char in consonants else char
        for char in string
    )


def remove_repeated_separators(string: str, separators: str = ' ,;.:-_|') -> str:
    """Collapse each run of one repeated separator character into a single one.

    Every character of ``separators`` is handled on its own, so a run of
    mixed separators (for example ``-_-``) is left untouched.
    """
    for separator in separators:
        doubled = separator * 2
        # A single replace() pass only halves a run, so repeat until no
        # doubled separator is left.
        while doubled in string:
            string = string.replace(doubled, separator)
    return string


def normalize_str(string: str) -> str:
    """Build the weighted, normalized form of a string used for distance matching.

    The string is stripped and lower-cased, then passed in order through
    separator collapsing, weighting of the leading characters and
    weighting of the consonants.
    """
    normalized = string.strip().lower()
    pipeline = (
        remove_repeated_separators,
        multiply_characters_at_beginning,
        multiply_consonants,
    )
    for transform in pipeline:
        normalized = transform(normalized)
    return normalized


def map_strings(source: list[str], destination: list[str]) -> dict[str, str]:
    """Map each source string to a destination string with minimum total distance.

    Both lists are normalized with ``normalize_str``. Every one-to-one
    assignment of source entries to destination entries is then tried, and
    the assignment with the smallest summed Levenshtein distance is returned
    (the first one found when several assignments tie).

    Args:
        source: Strings to map from. They become the keys of the result, so
            duplicate entries collapse into a single key.
        destination: Candidate strings to map to.

    Returns:
        A dict from each original source string to its matched original
        destination string. When ``source`` is longer than ``destination``,
        the surplus source strings are mapped to ``None``.

    Note:
        The search is brute force over permutations, so the run time grows
        factorially with the list lengths. The normalized lists are printed
        as debugging output.
    """
    source_normalized = [normalize_str(string) for string in source]
    destination_normalized = [normalize_str(string) for string in destination]
    print(source_normalized)
    print(destination_normalized)

    # Pad the candidates with None so that every source string gets a slot
    # even when there are fewer destinations than sources.
    missing = max(len(source) - len(destination), 0)
    candidates = destination + [None] * missing
    candidates_normalized = destination_normalized + [None] * missing

    # Compute each pairwise distance once instead of once per permutation.
    # Matching a padding slot costs nothing.
    costs = [
        [
            0 if candidate is None
            else Levenshtein.distance(string, candidate)
            for candidate in candidates_normalized
        ]
        for string in source_normalized
    ]

    min_distance = float('inf')
    min_mapping = None
    for permutation in permutations(range(len(candidates)), len(source)):
        distance = sum(row[index] for row, index in zip(costs, permutation))
        # Strict comparison keeps the first best permutation on ties.
        if distance < min_distance:
            min_distance = distance
            min_mapping = permutation
    return {
        string: candidates[index]
        for string, index in zip(source, min_mapping)
    }

In [3]:
# Demo: map four shortened headers onto five full headers. The destination
# list has one extra entry ('Act---ion', which also contains a repeated
# separator), so one full header stays unmatched.
map_strings(
    ['prt', 'prtcl', 'src', 'dst'],
    ['Source', 'Destination', 'Protocol', 'Port', 'Act---ion']
    )

['pppppppprrrrrrtttt', 'pppppppprrrrrrttttccll', 'ssssssssrrrrrrcccc', 'ddddddddsssssstttt']
['ssssssssooouurrcce', 'ddddddddeeessssttinnattionn', 'pppppppprrrrrroottoccoll', 'ppppppppooorrrrtt', 'aaaacccccctttt-ionn']


{'prt': 'Port', 'prtcl': 'Protocol', 'src': 'Source', 'dst': 'Destination'}