In [44]:
#importing all essential libs
import pdftotext
import os
import fasttext
import re
from collections import Counter
from operator import itemgetter

In [37]:
# Configuration: FOLDER_PATH is the directory that is scanned for PDF files,
# and MODEL_PATH is the fastText language-identification model file to load.
FOLDER_PATH = "./data/"
MODEL_PATH = "./model/lid218e.bin"

In [45]:
#get all the pdfs in a given folder
def get_pdf_names(folder_path):
    """Return the names of the PDF files located directly inside a folder.

    Args:
        folder_path: Path of the directory to scan (not searched recursively).

    Returns:
        A sorted list of entry names (not full paths) whose name ends with
        ``.pdf`` (case-insensitive) and that are regular files.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be read
            (for example, it does not exist).
    """
    pdf_files = [
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith('.pdf')
        # Skip directories that merely happen to be named "*.pdf"; callers
        # open every returned name as a file.
        and os.path.isfile(os.path.join(folder_path, entry))
    ]

    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs process (and report) the documents in the same order.
    return sorted(pdf_files)

def get_tokens(pdf):
    """Split the text of every page into word and punctuation tokens.

    Each page is tokenized with the regex ``\\w+|[^\\w\\s]+``: a token is
    either a run of word characters or a run of characters that are neither
    word characters nor whitespace. Whitespace (spaces, tabs, ``\\n``,
    ``\\r\\n``) never matches either alternative, so it only acts as a
    separator and no explicit newline clean-up is required beforehand.

    Args:
        pdf: An iterable of page texts (strings), one item per page.

    Returns:
        A flat list with the tokens of all pages, in page order.
    """
    tokens = []
    for page in pdf:
        tokens.extend(re.findall(r"\w+|[^\w\s]+", page))
    return tokens

def calculate_percentages(item_list):
    """Compute how often each distinct item occurs, as a percentage.

    Args:
        item_list: Any iterable of hashable items. Lists work as before;
            one-shot iterables such as generators or ``map`` objects are
            accepted as well.

    Returns:
        A dict mapping each distinct item to its share of all items, in
        percent, rounded to 2 decimal places. Empty input gives ``{}``.
    """
    item_counts = Counter(item_list)

    # Take the total from the counter rather than len(item_list): the input
    # may be an iterator, which has no len() and is already consumed here.
    total_items = sum(item_counts.values())

    return {
        item: round((count / total_items) * 100, 2)
        for item, count in item_counts.items()
    }

def get_top_key_by_value(dictionary):
    """Return the key that has the largest value in a dictionary.

    Args:
        dictionary: Mapping whose values can be compared with each other.

    Returns:
        The key with the highest value, or ``None`` when the dictionary is
        empty. If several keys share the highest value, the one that comes
        first in the dictionary's iteration order is returned.
    """
    # A single pass with max() replaces sorting every item just to read the
    # first one; default=None covers the empty-dictionary case.
    return max(dictionary, key=dictionary.get, default=None)

## Steps followed
We will follow a three-step procedure to identify the language of the document.

 1. Convert the document (expected as a PDF) to text using the `pdftotext` module. This module used to be fairly straightforward to install; it now gives some trouble, but I was able to install it using the instructions provided [here](https://github.com/jalan/pdftotext).
 2. Split the text into tokens using the regular expression `\w+|[^\w\s]+`. This is one of the simplest pre-tokenizers used by HuggingFace: it splits on whitespace (so newlines and empty lines are dropped) and separates runs of punctuation from words.
 3. Identify the language of each token using the `fasttext` model released by the [NLLB team](https://github.com/facebookresearch/fairseq/tree/nllb?tab=readme-ov-file), and take the majority language across all tokens. You can download the 1GB model [here](https://tinyurl.com/nllblid218e).

Finally, we report what the majority language is and its percentage.

In [29]:
# Discover the PDF files in the configured folder and build a path for each.
pdf_names = get_pdf_names(FOLDER_PATH)
pdf_paths = list(map(lambda pdf_name: os.path.join(FOLDER_PATH, pdf_name), pdf_names))

# Echo every resolved path so the reader can see which documents were found.
for path in pdf_paths:
    print(path)

# Load the fastText language-identification model from MODEL_PATH.
model = fasttext.load_model(MODEL_PATH)

./data/7-Mathematical-Foundations-Complete.pdf




In [48]:
# For each PDF: extract its text, tokenize it, predict a language for every
# token, and report the most frequent language together with its share.
for path in pdf_paths:
    print("#------------------------------------------------------------------------------------#")
    # os.path.basename instead of splitting on "/" so the file name is
    # extracted correctly whatever separator os.path.join produced.
    pdf_name = os.path.basename(path)
    with open(path, "rb") as f:
        pdf = pdftotext.PDF(f)
    print(f'{pdf_name} has {len(pdf)} pages.')

    # Word/punctuation tokens of the whole document, page by page.
    tokens = get_tokens(pdf)

    if not tokens:
        # No tokens means there is nothing to classify. Without this guard
        # get_top_key_by_value returns None and percentages[None] raises
        # KeyError, aborting the loop for all remaining documents.
        print(f'document {pdf_name} contains no extractable text, language not identified')
    else:
        # One prediction per token, evaluated sequentially; the top label is
        # kept with its "__label__" prefix removed.
        languages = [
            model.predict(token)[0][0].replace("__label__", "")
            for token in tokens
        ]
        # Share of each predicted language over all tokens, in percent.
        percentages = calculate_percentages(languages)
        # The language that was predicted for the most tokens.
        top_lang = get_top_key_by_value(percentages)
        print(f'document {pdf_name} contains {percentages[top_lang]}% {top_lang}')

    print("#------------------------------------------------------------------------------------#")
    

#------------------------------------------------------------------------------------#
7-Mathematical-Foundations-Complete.pdf has 154 pages.
document 7-Mathematical-Foundations-Complete.pdf contains 79.01% eng_Latn
#------------------------------------------------------------------------------------#
