In [57]:
import pandas as pd
# Transcript file: each non-empty line is split on ' ~ ' into an audio
# identifier and its spoken label.
file = '../Data/train/text/bank-card-final.txt'

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default; the number words matched later contain non-ASCII letters.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
# The handle gets its own name so `file` keeps referring to the path.
with open(file, 'r', encoding='utf-8') as fh:
    lines = fh.readlines()

# Drop the trailing newline, skip blank lines (they would otherwise yield a
# row with a missing label), and split on the first separator only so that
# every row has exactly the two fields the DataFrame expects.
labels = [
    line.rstrip('\n').split(' ~ ', 1)
    for line in lines
    if line.strip()
]
label_df = pd.DataFrame(labels, columns=['audio', 'label'])


In [58]:
def convert_text_numbers_azerbaijani(text: str) -> str:
    """Convert spoken Azerbaijani number words into a concatenated digit string.

    The text is split on whitespace and scanned left to right. Consecutive
    number words are greedily combined into values no larger than 9999
    (thousands, then hundreds, then tens/units). Each parsed value is rendered
    in decimal and all values are joined without any separator, e.g.
    ``"min iyirmi dörd sifir doxsan doqquz"`` -> ``"1024099"``.

    Rules worth knowing:

    * Tokens that are not number words are skipped.
    * Matching is exact: tokens are not lower-cased or stripped of punctuation.
    * A spoken zero ("sifir"/"sıfır") is always a digit of its own. It never
      merges with a preceding tens word or scale group and never acts as a
      multiplier of "yüz"/"min"; otherwise the zero would vanish from the
      output (e.g. "iyirmi sifir" must give "200", not "20").
    * A multiplier that would push the value above 9999 is not attached to
      "min"; it is emitted as its own number and "min" starts the next one
      (e.g. "əlli doqquz min ..." -> "59" followed by "1...").
    * "milyon", and a simple non-zero number directly followed by "milyon",
      produce no output because such a value always exceeds 9999.

    Args:
        text: Whitespace-separated words, possibly mixed with non-number words.

    Returns:
        The concatenation of all parsed numbers, or an empty string if no
        number word was found.
    """
    # Basic mappings.
    units = {
        "sifir": 0,
        "sıfır": 0,  # standard spelling with dotless ı; "sifir" kept as a variant
        "bir": 1,
        "iki": 2,
        "üç": 3,
        "dörd": 4,
        "beş": 5,
        "altı": 6,
        "yeddi": 7,
        "səkkiz": 8,
        "doqquz": 9
    }
    tens = {
        "on": 10,
        "iyirmi": 20,
        "otuz": 30,
        "qırx": 40,
        "əlli": 50,
        "altmış": 60,
        "yetmiş": 70,
        "səksən": 80,
        "həştad": 80,
        "doxsan": 90
    }
    # Only 'yüz' (100) and 'min' (1000) can contribute to a value;
    # 'milyon' is listed so that it is recognised (and then rejected).
    scales = {
        "yüz": 100,
        "min": 1000,
        "milyon": 1000000
    }
    # Largest value a single parsed number may reach.
    max_value = 9999

    def parse_simple(tokens, i):
        """Parse a tens word optionally followed by a non-zero unit, or one unit.

        Returns ``(value, tokens_consumed)``; ``(None, 0)`` if ``tokens[i]`` is
        neither a tens nor a unit word (or ``i`` is out of range).
        """
        if i >= len(tokens):
            return None, 0
        word = tokens[i]
        if word in tens:
            following = tokens[i + 1] if i + 1 < len(tokens) else None
            # Only a non-zero unit merges with the tens word; a zero after a
            # tens word is a separate digit and must not be absorbed.
            if units.get(following, 0) > 0:
                return tens[word] + units[following], 2
            return tens[word], 1
        if word in units:
            return units[word], 1
        return None, 0

    def parse_number(tokens, i):
        """Parse one number (<= 9999) starting at ``tokens[i]``.

        Returns ``(value, tokens_consumed)``; ``tokens_consumed`` is 0 when
        nothing could be parsed at this position.
        """
        if i >= len(tokens):
            return 0, 0

        lead_val, lead_len = parse_simple(tokens, i)

        # A zero is always emitted on its own.
        if lead_len and lead_val == 0:
            return 0, 1

        # --- Million group: always > 9999, so nothing is consumed. ---
        if tokens[i] == "milyon":
            return 0, 0
        if (lead_len and i + lead_len < len(tokens)
                and tokens[i + lead_len] == "milyon"):
            return 0, 0

        total = 0
        consumed = 0

        # --- Thousand group ---
        if tokens[i] == "min":
            # Bare "min" means 1 * 1000.
            total += scales["min"]
            i += 1
            consumed += 1
        elif (lead_len and i + lead_len < len(tokens)
              and tokens[i + lead_len] == "min"
              and lead_val * scales["min"] <= max_value):
            total += lead_val * scales["min"]
            i += lead_len + 1  # multiplier tokens plus "min"
            consumed += lead_len + 1
        # Otherwise the leading words are left for the groups below and a
        # following "min" will start a new number.

        # --- Hundred group ---
        if i < len(tokens):
            if tokens[i] == "yüz":
                # Bare "yüz" means 1 * 100.
                if total + scales["yüz"] <= max_value:
                    total += scales["yüz"]
                    i += 1
                    consumed += 1
            else:
                val, length = parse_simple(tokens, i)
                # Only a single non-zero unit (1-9) may multiply "yüz".
                if (length and 0 < val < 10
                        and i + length < len(tokens)
                        and tokens[i + length] == "yüz"
                        and total + val * scales["yüz"] <= max_value):
                    total += val * scales["yüz"]
                    i += length + 1
                    consumed += length + 1

        # --- Tens and units ---
        if i < len(tokens):
            val, length = parse_simple(tokens, i)
            # A zero here would follow a scale group; leave it for the next
            # number so that it shows up as its own digit.
            if length and val > 0 and total + val <= max_value:
                total += val
                i += length
                consumed += length

        return total, consumed

    # Split the input and parse sequentially.
    tokens = text.split()
    valid_words = set(units) | set(tens) | set(scales)
    results = []
    i = 0
    while i < len(tokens):
        if tokens[i] in valid_words:
            value, consumed = parse_number(tokens, i)
            # Only record a number if some tokens were actually consumed.
            if consumed:
                results.append(str(value))
                i += consumed
                continue
        i += 1

    return "".join(results)


# Example checks. The converter joins the parsed numbers WITHOUT separators,
# so the expected values are plain digit strings; the trailing comments show
# how each string breaks down into the individual parsed numbers.
examples = [
    ("min iyirmi dörd sifir doxsan doqquz", "1024099"),         # 1024 | 0 | 99
    ("beş min doxsan beş yüz altmış altı min", "50951661000"),  # 5095 | 166 | 1000
    # A multiplier that would exceed 9999 is split off as its own number.
    ("əlli doqquz min doqquz yüz səksən altı", "591986"),       # 59 | 1986
]

for phrase, expected in examples:
    result = convert_text_numbers_azerbaijani(phrase)
    print(result)
    assert result == expected, f"{phrase!r}: expected {expected!r}, got {result!r}"


1024099
50951661000
591986


In [65]:
# Derive the digit string for every transcript label, element by element.
label_df['digits'] = label_df['label'].map(convert_text_numbers_azerbaijani)

In [68]:
# Write "<audio>~<digits>" rows, without header or index, to bank-card.txt.
label_df.to_csv(
    'bank-card.txt',
    columns=['audio', 'digits'],
    sep='~',
    index=False,
    header=False,
)