<a href="https://colab.research.google.com/github/zenon10/POC-OCR/blob/main/text_translation_wrapper_rmv20230215.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation

In [None]:
# Download the small French spaCy model that is imported below as `fr_core_news_sm`.
!python3 -m spacy download fr_core_news_sm

# Install Hugging Face transformers together with its `sentencepiece` extra.
# NOTE(review): neither install is version-pinned, so behaviour may drift between runs;
# presumably the extra is required by the translation model's tokenizer — confirm.
!pip install transformers[sentencepiece]

In [7]:
import pandas as pd
import os
import spacy

import fr_core_news_sm

from transformers import pipeline

In [None]:
# Load the French spaCy pipeline; text_translation() uses it to split a document into sentences.
spacy_model = fr_core_news_sm.load()
# Hugging Face text2text pipeline for the Helsinki-NLP/opus-mt-fr-en checkpoint (French -> English
# per its name); text_translation() keeps the 'generated_text' of the first result per sentence.
translation_model = pipeline("text2text-generation", model='Helsinki-NLP/opus-mt-fr-en')

## Functions

In [9]:
def text_translation(input_file_path, output_folder_path):
    """Translate a UTF-8 text file sentence by sentence and save the result.

    The text is split into sentences with the module-level ``spacy_model``.
    Each sentence is passed to the module-level ``translation_model`` and the
    ``generated_text`` of its first result is kept. The translations are
    written, each preceded by a newline, to ``<output_folder_path>/<stem>.txt``
    where ``<stem>`` is the input file name without its final extension.

    Args:
        input_file_path: Path of the UTF-8 text file to translate.
        output_folder_path: Folder that receives the translated file. It is
            created when missing; a trailing path separator is optional.

    Returns:
        The path of the translated text file.

    Raises:
        ValueError: If ``input_file_path`` does not exist.
    """
    os.makedirs(output_folder_path, exist_ok=True)

    if not os.path.exists(input_file_path):
        raise ValueError("File does not exist!!")

    # splitext drops only the last extension, so "report.v1.txt" and
    # "report.v2.txt" no longer collapse onto the same output name.
    file_name = os.path.splitext(os.path.basename(input_file_path))[0]
    print('File_name: ', file_name)

    # os.path.join yields the same path as plain concatenation when the folder
    # ends with a separator, and a valid path when it does not.
    text_file_name = os.path.join(output_folder_path, file_name + ".txt")
    print('tex_file_name: ', text_file_name)

    with open(input_file_path, encoding="utf8") as source_file:
        txt = source_file.read()

    doc = spacy_model(txt)

    # Every translated sentence is prefixed with a newline, matching the
    # layout of previously generated files.
    translated_string = ''.join(
        '\n' + translation_model(sentence.text)[0]['generated_text']
        for sentence in doc.sents
    )

    # Write once, after all sentences are translated. This also guarantees the
    # returned path exists when the document contains no sentences.
    with open(text_file_name, 'w', encoding="utf8") as target_file:
        target_file.write(translated_string)

    return text_file_name

In [10]:
def text_translation_wrapper(metadata_file_path, output_folder_path):
    """Translate every not-yet-translated text file listed in a metadata CSV.

    The CSV is read, each row that has a ``text_file_path`` but no
    ``translated_text_file_path`` is passed to ``text_translation``, the
    returned path is stored in ``translated_text_file_path``, and the CSV is
    written back to the same location. Rows keep their original order; rows
    that are already translated or have no source text are left untouched.

    Args:
        metadata_file_path: Path of the metadata CSV. It must contain a
            ``text_file_path`` column and is overwritten in place.
        output_folder_path: Folder forwarded to ``text_translation`` for the
            translated files.

    Returns:
        None. The result is persisted to ``metadata_file_path``.
    """
    source_col = 'text_file_path'
    target_col = 'translated_text_file_path'

    metadata = pd.read_csv(metadata_file_path)
    print(metadata.columns)

    if target_col not in metadata.columns:
        # First run: no translations recorded yet, so every row with a source
        # text file is a candidate.
        print("translated_text_file_path not present in metadata...proceeding with all the files")
        metadata[target_col] = None
    else:
        # Later runs: only fill in the rows whose translation is still missing.
        print("translated_text_file_path present in metadata...proceeding with only those files for which translated_text_file_path is missing")

    # An all-empty column is read back as float64; cast to object so string
    # paths can be assigned without dtype conflicts.
    metadata[target_col] = metadata[target_col].astype(object)

    has_source = metadata[source_col].notnull() & (metadata[source_col] != '')
    untranslated = metadata[target_col].isnull() | (metadata[target_col] == '')
    pending = has_source & untranslated

    if pending.any():
        # Assign through a boolean mask on the original frame: no rows are
        # dropped or duplicated and no SettingWithCopy slice is involved.
        metadata.loc[pending, target_col] = metadata.loc[pending, source_col].apply(
            lambda path: text_translation(path, output_folder_path)
        )

    metadata.to_csv(metadata_file_path, index=False)


## Testing

In [11]:
# Run the wrapper on the project metadata CSV; the wrapper rewrites this CSV in place.
# NOTE(review): both paths are absolute Google Drive locations — assumes Drive is mounted
# at /content/drive before this cell runs; confirm.
metadata_file_path = r"/content/drive/MyDrive/Zenon_POC_OCR/metadata.csv"
# Keep the trailing slash so output files are formed as <folder>/<name>.txt.
output_folder_path = "/content/drive/MyDrive/Zenon_POC_OCR/2_output/"
text_translation_wrapper(metadata_file_path, output_folder_path)


Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'input_file_path',
       'text_file_path', 'confidence', 'translated_text_file_path'],
      dtype='object')
text_file_path present in metadata...proceeding with only those files for which text_file_path is missing


In [None]:
# #for matching 1 word
# def match(word,clause):
#     count = 0
#     if '&' in word:
#         words = word.split('&')
#         for j in words: 
#             #print(doc[0].lemma_)
#             if clause.__contains__(lem_word(j)):
#                 count = count + 1
#         flag = 1 if count == len(words) else 0
#     else:
#         flag = 1 if clause.__contains__(lem_word(word)) else 0
#     return flag

# #Matching a keywords list from clause 
# def match_keyword(string,clause):
#     a = string.lower().split(',')
#     keyword = [j.strip() for j in a]
#     key_match = []
#     for i in keyword:
#         key_match.append(match(i,clause))
#     print(key_match)

In [14]:
# a = """Dans le délai de paiement sont incluses les phases de réception de la commande et de certification du 
#                     service fait par le service gestionnaire, de visa de la dépense (appelé aussi le mandatement) et de 
#                     règlement par le comptable public. En cas de dépassement du délai de paiement, des intérêts moratoires 
#                     sont versés au titulaire, calculés par application de la formule suivante : IM = M x J/365 x Taux IM + F ; 
#                     Dans laquelle : IM : montant des intérêts moratoires - M : montant TTC de la demande de paiement - Taux IM :
#                     taux de la Banque Centrale Européenne en vigueur majoré de 8 points - J : nombre de jours calendaires entre 
#                     la date limite et la date réelle de paiement - F : forfait de 40 € de frais de recouvrement"""


In [17]:
# a=a.split(',')

In [None]:
# for j in a:
#     print(j)

In [4]:
# import pandas as pd

In [19]:
# df = pd.read_csv('/content/drive/MyDrive/Zenon_POC_OCR/std and non std classif.csv')

In [None]:
# NOTE(review): the only visible assignment of `df` (the pd.read_csv cell above) is commented
# out, so on a fresh kernel this cell presumably raises NameError — confirm whether to keep it.
df

In [21]:
# df=df.fillna('')

In [22]:
# df['std/non-std'][1]

'std'

In [28]:
# clause = """Dans le délai de paiement sont incluses les phases de réception de la commande et de certification du 
#                     service fait par le service gestionnaire, de visa de la dépense (appelé aussi le mandatement) et de 
#                     règlement par le comptable public. En cas de dépassement du délai de paiement, des intérêts moratoires 
#                     sont versés au titulaire, calculés par application de la formule suivante : IM = M x J/365 x Taux IM + F ; 
#                     Dans laquelle : IM : montant des intérêts moratoires - M : montant TTC de la demande de paiement - Taux IM :
#                     taux de la Banque Centrale Européenne en vigueur majoré de 8 points - J : nombre de jours calendaires entre 
#                     la date limite et la date réelle de paiement - F : forfait de 40 € de frais de recouvrement"""


In [None]:
# def std/non-std_classifier(path_of_excel_file, clause):
#     std = [clause]
#     df = pd.read_csv(path_of_excel_file)
#     for i in df['list of keywords for std/non-std']:




In [31]:
# std = []
# for i in df['list of keywords for std/non-std']:
#     i = (i.split(','))
#     if len(i) > 1:
#         if i[-1] == 'or':
#             count = 0
#             if clause.__contains__('intérêts moratoires'):
#                  count = count + 1
#                  if count >= 1:
#                      std.append(df['std/non-std'][1])

#         elif i[-1] == 'and':
#             count = 0
#             if clause.__contains__('intérêts moratoires'):
#                  count = count + 1
#                  if count == len(i[:-1]):
#                      std.append(df['std/non-std'][1])


#     #  print((i[-1]))