# Translation of English captions to Malayalam



This notebook facilitates the translation of image captions from English to Malayalam using the Google Cloud Translation API. The core steps encompass:
1. Configuring Google Cloud authentication.
2. Reading and parsing English captions.
3. Cleaning each caption (lowercasing, stripping punctuation) and translating it to Malayalam.
4. Storing the translated captions in an output file.


## Library Imports
Necessary libraries are imported for tasks such as file management, text processing, and accessing Google Cloud Translation services.


In [1]:
# Import necessary libraries
import os
from google.cloud import translate_v2 as gtranslate
import re

## Utility Functions
The following are utility functions designed for file reading, text cleaning, and text translation using the Google Cloud API.


In [None]:

def set_google_credentials(credentials_path):
    """
    Register the service account key used for Google Cloud authentication.

    The path is stored in the GOOGLE_APPLICATION_CREDENTIALS environment
    variable of the current process. The path itself is not validated here.

    Args:
    - credentials_path (str): Path to the Google Cloud service account key.
    """
    os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": credentials_path})

In [None]:

def read_file_content(filepath, encoding="utf-8"):
    """
    Read and return the content of a specified text file.

    The encoding is stated explicitly instead of relying on the platform
    default, so the same caption file decodes identically on every machine
    and matches the UTF-8 encoding used when results are written back out.

    Args:
    - filepath (str): Path to the file.
    - encoding (str, optional): Text encoding of the file. Defaults to 'utf-8'.

    Returns:
    - str: Content of the file.
    """
    with open(filepath, 'r', encoding=encoding) as f:
        return f.read()

In [None]:
def parse_image_descriptions(content):
    """Group caption lines by image ID.

    Each useful line has the form ``<image token> <caption words...>``. The
    image ID is the part of the image token before its first '.', and the
    caption is the remaining words re-joined with single spaces.

    Args:
        content (str): String content containing image IDs and descriptions.

    Returns:
        dict: A dictionary with image IDs as keys and a list of descriptions as values.
    """
    captions_by_image = {}

    for raw_line in content.strip().split('\n'):
        fields = raw_line.split()

        # Only lines carrying both an image token and at least one caption
        # word contribute; blank or token-only lines are ignored.
        if len(fields) >= 2:
            image_token, caption_words = fields[0], fields[1:]

            # Keep everything before the first '.', which drops the
            # file extension and anything following it.
            image_key = image_token.partition('.')[0]

            if image_key not in captions_by_image:
                captions_by_image[image_key] = []
            captions_by_image[image_key].append(' '.join(caption_words))

    return captions_by_image


In [None]:

def clean_text(text):
    """
    Normalise a caption before it is sent for translation.

    Args:
    - text (str): Text to clean.

    Returns:
    - str: Lowercased text in which every run of non-word characters has been
      replaced by a single space, with no leading or trailing whitespace.
    """
    lowered = text.lower()

    # \W matches anything that is not a letter, digit or underscore, so
    # punctuation disappears while underscores are kept.
    without_punctuation = re.sub(r'\W+', ' ', lowered)

    # Squeeze any remaining whitespace runs, then trim both ends.
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()


In [None]:

# Translation client shared by every get_translation() call. It is created
# lazily on first use so that credentials configured beforehand are picked up,
# and reused afterwards so a client is not rebuilt for every single caption.
_translation_client = None


def get_translation(text, language):
    """
    Translate the given text into the specified language using the Google Cloud Translation API.

    Args:
    - text (str): Text to translate.
    - language (str): Target language code (e.g., 'ml' for Malayalam).

    Returns:
    - str: Translated text. Returns None if an error occurs during translation.

    Note:
    - Creating the client happens outside the error handler, so a failure to
      build it (for example, missing credentials) propagates to the caller
      instead of being reported once per caption.
    """
    global _translation_client

    # Build the client only once; later calls reuse the same instance.
    if _translation_client is None:
        _translation_client = gtranslate.Client()

    try:
        # Request plain-text output: the API's default HTML format returns
        # HTML entities (e.g. &#39;) that would pollute the caption dataset.
        result = _translation_client.translate(
            text,
            target_language=language,
            format_="text",
        )

        # Return the translated text from the result
        return result['translatedText']
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this caption.
        print(f"Translation Error: {str(e)}")

        # Return None to indicate that the translation failed
        return None


In [None]:


def translate_all_descriptions(descriptions, target_lang, checkpoint_every=200):
    """
    Translate all descriptions in the given dictionary to the target language.

    Each description is cleaned with clean_text() and then translated with
    get_translation(). Descriptions whose translation fails (or comes back
    empty) are left out, so an image may end up with fewer captions than it
    started with.

    Args:
    - descriptions (dict): Dictionary of image descriptions.
    - target_lang (str): Target language code (e.g., 'ml' for Malayalam).
    - checkpoint_every (int, optional): Number of processed images between
      intermediate saves. Defaults to 200. A falsy value (0 or None) disables
      intermediate saves.

    Returns:
    - dict: Dictionary with image IDs as keys and lists of translated descriptions as values.
    """

    # Initialize an empty dictionary to store translated descriptions
    translated_dict = {}

    # Count images from 1 so a checkpoint is written after every full batch
    # of `checkpoint_every` images rather than right after the first image.
    for count, (img_id, desc_list) in enumerate(descriptions.items(), start=1):

        # Create a list to store translated descriptions for the current image
        translated_list = []

        # Loop through each description in desc_list
        for desc in desc_list:

            # Clean and translate the description
            translated_desc = get_translation(clean_text(desc), target_lang)

            # Keep only successful, non-empty translations
            if translated_desc:
                translated_list.append(translated_desc)

        # Store the translated descriptions in the translated_dict
        translated_dict[img_id] = translated_list

        # Save everything translated so far to a numbered file in the current
        # working directory, to limit data loss during long runs.
        if checkpoint_every and count % checkpoint_every == 0:
            write_to_file(translated_dict, f"Flickr30k.{target_lang}.temp.{count}.txt")

    # Return the dictionary with translated descriptions
    return translated_dict


In [None]:

def write_to_file(descriptions, filepath):
    """
    Save image descriptions to a UTF-8 text file, one caption per line.

    Every line has the form ``<image id>.jpg#<caption index> <caption>``,
    where the index is the caption's position in that image's list. Lines are
    separated by newlines, with no newline after the last one.

    Args:
    - descriptions (dict): Dictionary of image descriptions.
    - filepath (str): Path to the output file.
    """
    # Build every output line up front so the file is only opened once all
    # the formatting has succeeded.
    formatted = [
        f"{image_key}.jpg#{position} {caption}"
        for image_key, captions in descriptions.items()
        for position, caption in enumerate(captions)
    ]

    with open(filepath, 'w', encoding="utf8") as out_file:
        out_file.write('\n'.join(formatted))


## Main Execution Function
This function organizes the entire translation workflow by invoking the aforementioned utility functions in the appropriate sequence.


In [None]:

def main(english_dataset_path, credentials_path, output_path, target_lang="ml"):
    """
    Run the full caption translation pipeline from input file to output file.

    The steps are: configure credentials, load and parse the English caption
    file, translate every caption, and write the result to ``output_path``.

    Args:
    - english_dataset_path (str): Path to the file containing English image descriptions.
    - credentials_path (str): Path to the Google Cloud service account key.
    - output_path (str): Path to the output file for translated descriptions.
    - target_lang (str, optional): Target language code. Defaults to 'ml' (Malayalam).
    """
    # Credentials must be in place before any translation request is made.
    set_google_credentials(credentials_path)

    # Load the raw caption file and group its captions by image ID.
    captions_by_image = parse_image_descriptions(
        read_file_content(english_dataset_path)
    )

    # Translate every caption, then persist the final result.
    translated_captions = translate_all_descriptions(captions_by_image, target_lang)
    write_to_file(translated_captions, output_path)


## Execution
Paths to datasets, Google Cloud credentials, and the desired output file location are specified below. Following this setup, the main function is called to initiate the translation process.


In [None]:
# Paths used by the translation run. These are absolute paths on the author's
# machine; adjust them to your own environment before executing.

# Input: text file containing the English image captions.
path_to_english_dataset = "/Users/allumariababu/Documents/MSc_Project/Flickr30k_Dataset/30k_captions.txt"

# Google Cloud service account key (JSON) used for authentication.
path_to_google_credentials = "/Users/allumariababu/Documents/MSc_Project/Google Credentials/servicekey.json"

# Output: text file that receives the translated Malayalam captions.
path_to_output_file = "/Users/allumariababu/Documents/MSc_Project/Flickr30k_Dataset/Malayalam_Dataset/Flickr30k.malayalam.txt"


In [None]:

# Call the main function to translate English captions to Malayalam.
# The path variables themselves are passed, not their names as quoted
# strings, which would be treated as literal (non-existent) file paths.
main(path_to_english_dataset, path_to_google_credentials, path_to_output_file)