# Malayalam Audio Detection

In [None]:
import speech_recognition as sr  

In [None]:
# Capture one utterance from the default microphone and transcribe it as
# Malayalam ("ml-IN") through the Google Web Speech API.
r = sr.Recognizer()
with sr.Microphone() as source:
    print("Please wait. Calibrating microphone...")
    # Listen to background noise for 5 seconds before recording speech.
    r.adjust_for_ambient_noise(source, duration=5)
    print("Say something!")
    recording = r.listen(source)

# The microphone is released before the network request is made.
# `audio` holds the recognised text, or None when recognition failed, so later
# cells never receive the raw recording object by accident.
audio = None
try:
    audio = r.recognize_google(recording, language="ml-IN")
except sr.UnknownValueError:
    print("could not understand audio")
except sr.RequestError as e:
    print("error; {0}".format(e))
else:
    print(f"you said '{audio}'")

# Malayalam-English Translator

In [None]:
#!pip install indic-nlp-library
#!pip install ctranslate2

In [None]:
import os
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.transliterate import unicode_transliterate
import ctranslate2
import sentencepiece as spm

In [None]:
def add_token(sent: str):
    """Prefix *sent* with the source and target language tags.

    Returns ``"mal_Mlym eng_Latn <sent>"``, with the three parts separated by
    single spaces.
    """
    language_tags = ("mal_Mlym", "eng_Latn")
    return " ".join((*language_tags, sent))

def preprocess_sentence(sentence: str, sp_src, model_dir) -> str:
    """Translate one Malayalam sentence and return the detokenized output.

    Despite its name, this function runs the whole pipeline: normalization,
    tokenization, transliteration, SentencePiece encoding, language tagging,
    CTranslate2 translation and detokenization.

    Args:
        sentence: Malayalam input text.
        sp_src: Source-side SentencePiece processor; only
            ``encode(text, out_type=str)`` is used.
        model_dir: Directory of the CTranslate2 model passed to
            ``ctranslate2.Translator``.

    Returns:
        The best hypothesis as plain text, or ``""`` when *sentence* is empty
        or whitespace only.
    """
    MAX_SEQ_LEN = 256

    text = sentence.strip()
    if not text:
        # Without this guard the model would be fed only the two language
        # tags and would return output unrelated to any input.
        return ""

    # Normalize and tokenize as Malayalam, then transliterate "ml" -> "hi".
    normalizer = IndicNormalizerFactory().get_normalizer("ml")
    xliterator = unicode_transliterate.UnicodeIndicTransliterator()
    tokens = indic_tokenize.trivial_tokenize(normalizer.normalize(text), "ml")
    transliterated = xliterator.transliterate(" ".join(tokens), "ml", "hi")
    # Re-attach a virama that tokenization left surrounded by spaces.
    transliterated = transliterated.replace(" ् ", "्")

    # Split into subword pieces and prepend the language tags.
    pieces = sp_src.encode(transliterated, out_type=str)
    tagged = add_token(" ".join(pieces).strip())

    # Cap over-long inputs and tell the user that it happened.
    words = tagged.split()
    if len(words) > MAX_SEQ_LEN:
        preview = " ".join(words[:5]) + " .... " + " ".join(words[-5:])
        tagged = " ".join(words[:MAX_SEQ_LEN])
        print(
            f"WARNING: Sentence {preview} truncated to {MAX_SEQ_LEN} tokens as it exceeds maximum length limit"
        )

    # NOTE(review): the translator is rebuilt on every call; callers that
    # translate many sentences may want to reuse one instance.
    translator = ctranslate2.Translator(model_dir, device="cpu")
    # NOTE(review): max_input_length=160 is lower than MAX_SEQ_LEN, so the
    # translator may shorten the input further - confirm this is intended.
    results = translator.translate_batch(
        [tagged.strip().split(" ")],
        max_batch_size=9216,
        batch_type="tokens",
        max_input_length=160,
        max_decoding_length=256,
        beam_size=5,
    )

    # Undo SentencePiece: drop the piece separators, then turn each "▁"
    # word-boundary marker back into a space.
    best_hypothesis = " ".join(results[0].hypotheses[0])
    return best_hypothesis.replace(" ", "").replace("▁", " ").strip()

def translate_sentence(sentence: str, model_dir: str):
    """Translate *sentence* with the model stored under *model_dir*.

    Loads the source SentencePiece model from ``<model_dir>/vocab/model.SRC``
    and passes it, with the sentence and model directory, to
    ``preprocess_sentence``, whose result is returned unchanged.
    """
    vocab_model_path = os.path.join(model_dir, "vocab", "model.SRC")
    source_tokenizer = spm.SentencePieceProcessor(model_file=vocab_model_path)
    return preprocess_sentence(sentence, source_tokenizer, model_dir)

# Translate the transcript produced by the speech-recognition cell above.
model_dir = "final_model"
malayalam_sentence = audio

# The recognition cell only prints a message when it fails, so `audio` may not
# be a text transcript. Translate only a non-blank string; anything else would
# crash inside the translation pipeline.
if isinstance(malayalam_sentence, str) and malayalam_sentence.strip():
    translated_sentence = translate_sentence(malayalam_sentence, model_dir)
    print("Translated sentence:", translated_sentence)
else:
    translated_sentence = None
    print("No recognised Malayalam text to translate.")