<a href="https://colab.research.google.com/github/vedantb99/Soundex-Hindi/blob/main/Hindi_Soundex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Soundex algorithm for the Hindi language

Storing vowels (independent vowels, dependent vowel signs and other non-consonant marks) and consonants for lookup

In [None]:
# Characters that the encoder treats as "vowels": independent vowels,
# dependent vowel signs (matras) and other non-consonant marks.
# The virama ('्') is not part of this list.
vowels = [
    # signs written above / after a letter
    'ऀ', 'ँ', 'ं', 'ः',
    # independent vowels
    'ऄ', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ',
    'ऍ', 'ऎ', 'ए', 'ऐ', 'ऑ', 'ऒ', 'ओ', 'औ',
    # further marks (including nukta and avagraha)
    'ऺ', 'ऻ', '़', 'ऽ',
    # dependent vowel signs
    'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ',
    'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ',
    'ॎ', 'ॏ',
]

# Consonants, one row per Soundex group.
consonants = [
    'क', 'ख', 'ग', 'घ', 'ङ',
    'च', 'छ', 'ज', 'झ', 'ञ',
    'ट', 'ठ', 'ड', 'ढ', 'ण',
    'त', 'थ', 'द', 'ध', 'न', 'ऩ',
    'प', 'फ', 'ब', 'भ', 'म',
    'य', 'र', 'ऱ', 'ल', 'व', 'श',
    'ष', 'स', 'ह',
]

**Creating a dictionary to map letters to their groups**  
क, ख, ग, घ, ङ => Group १  
च, छ, ज, झ, ञ => Group २  
ट, ठ, ड, ढ, ण => Group ३  
त, थ, द, ध, न, ऩ => Group ४  
प, फ, ब, भ, म => Group ५  
य, र, ऱ, ल, व, श => Group ६  
ष, स, ह => Group ७

In [None]:
# Soundex group table: Devanagari digit code -> consonants sharing that code.
# NOTE: this name shadows Python's built-in ``map``; it is kept because the
# other cells look the table up under this name.
map = {
    '१': ['क', 'ख', 'ग', 'घ', 'ङ'],
    '२': ['च', 'छ', 'ज', 'झ', 'ञ'],
    '३': ['ट', 'ठ', 'ड', 'ढ', 'ण'],
    '४': ['त', 'थ', 'द', 'ध', 'न', 'ऩ'],
    '५': ['प', 'फ', 'ब', 'भ', 'म'],
    '६': ['य', 'र', 'ऱ', 'ल', 'व', 'श'],
    '७': ['ष', 'स', 'ह'],
}

Replacing all vowels with a hyphen (-)

In [None]:
def replaceVowels(s):
    """Return ``s`` with every character found in ``vowels`` swapped for '-'.

    Characters that are not in ``vowels`` are copied through unchanged.
    """
    return ''.join('-' if ch in vowels else ch for ch in s)

Encoding consonants by their groups

In [None]:
def encodeByGroup(s):
    """Translate each character of ``s`` into its group code from ``map``.

    Hyphens are copied through as-is. Any other character contributes the
    code of every group that lists it, and nothing at all when no group
    lists it.
    """
    pieces = []
    for ch in s:
        if ch == '-':
            pieces.append(ch)
        else:
            pieces.extend(code for code, members in map.items() if ch in members)
    return ''.join(pieces)

Removing consecutive group codes that are the same


In [None]:
from itertools import groupby
def removeDuplicates(s):
    """Collapse each run of equal adjacent items in ``s`` to a single item.

    The surviving items are converted with ``str`` and joined into one string.
    """
    return ''.join(str(run_key) for run_key, _run in groupby(s))

Removing all the hyphens that were substituted for vowels

In [None]:
def removeHyphens(s):
    """Return ``s`` with every '-' dropped and all other characters kept in order."""
    return ''.join(ch for ch in s if ch != '-')

Soundex Main Function

In [None]:
def HSoundex(word):
    """Return the four-character Hindi Soundex code for ``word``.

    The code is the word's first character followed by the group digits of
    the remaining consonants. Viramas are dropped, vowels only act as
    separators, and runs of identical adjacent digits are collapsed. The
    result is truncated or padded with '०' to exactly four characters.

    Args:
        word: Devanagari word to encode.

    Returns:
        The four-character code, or an empty string when ``word`` is empty.
    """
    # An empty word has no first letter to anchor the code on.
    if not word:
        return ""

    first_letter = word[0]

    # Drop every virama so a conjunct is seen as a plain run of consonants.
    stripped = word.replace('्', '')

    # Vowels -> '-', consonants -> group digit, then collapse adjacent repeats.
    tail = removeDuplicates(encodeByGroup(replaceVowels(stripped[1:])))

    if first_letter in consonants:
        first_group = None
        for group_no, members in map.items():
            if first_letter in members:
                first_group = group_no
        # A digit directly after the first letter that repeats the first
        # letter's own group is redundant. Slicing (not indexing) keeps this
        # safe when nothing follows the first letter.
        if first_group is not None and tail[:1] == first_group:
            tail = tail[1:]

    coded_word = removeHyphens(first_letter + tail)

    # Fix the length at four: cut the excess, pad the shortfall with '०'.
    return coded_word[:4].ljust(4, "०")

In [None]:
# Single vs. doubled म: the recorded output shows the same code for both.
print(HSoundex("समान"), HSoundex("सम्मान"), sep="\n")

स५४०
स५४०


In [None]:
# Two words that begin with an independent vowel.
print(HSoundex("आँधी"), HSoundex("अन्याय"), sep="\n")

आ४००
अ४६६


In [None]:
# म written out vs. anusvara (ं): the recorded codes differ.
print(HSoundex("समवेदना"), HSoundex("संवेदना"), sep="\n")

स५६४
स६४०


In [None]:
# Short vs. long vowel sign: the recorded output shows the same code for both.
print(HSoundex("दिन"), HSoundex("दीन"), sep="\n")

द४००
द४००


In [None]:
# Without vs. with chandrabindu (ँ): the recorded output shows the same code.
print(HSoundex("सास"), HSoundex("साँस"), sep="\n")

स७००
स७००


In [None]:
# Same consonants with ण and म swapped: the recorded codes differ.
print(HSoundex("परिणाम"), HSoundex("परिमाण"), sep="\n")

प६३५
प६५३


In [None]:
# Anusvara vs. long vowel sign: the recorded output shows the same code.
print(HSoundex("चिंता"), HSoundex("चीता"), sep="\n")

च४००
च४००


In [None]:
# Conjunct ग्र vs. vowel sign ृ: the recorded codes differ.
print(HSoundex("ग्रह"), HSoundex("गृह"), sep="\n")

ग६७०
ग७००


In [None]:
# Words differing only in the first letter: the recorded codes differ only there.
print(HSoundex("अपेक्षा"), HSoundex("उपेक्षा"), sep="\n")

अ५१७
उ५१७


In [None]:
# ग vs. घ (same group): the recorded output shows the same code for both.
print(HSoundex("बाग"), HSoundex("बाघ"), sep="\n")

ब१००
ब१००


In [None]:
# Words differing only by a vowel sign: the recorded output shows the same code.
print(HSoundex("वस्तु"), HSoundex("वास्तु"), sep="\n")

व७४०
व७४०
