## Import Libraries

In [26]:
import json
import pandas as pd
import numpy as np
import os

In [27]:
from tqdm import tqdm
tqdm.pandas()

### function to find count of duplicate keys present

In [28]:
def chk_duplicates(path):
    duplicate = []  
    
    folders = os.listdir(path)
    
    for folder in folders:
        new_path = os.path.join(path, folder)
        files_in_folder = os.listdir(new_path)
        
        for index, file in enumerate(files_in_folder):
            
            file_path = os.path.join(new_path, file)
            
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                
            dupli = [value for k,v in data.items() for key, value in v.items()]
            duplicate.extend(dupli)
    count={}
    for word in duplicate:
        if word not in count:
            count[word]=duplicate.count(word)
    count={key:v for key, v in sorted(count.items(), key=lambda x: (x[1], x[0]), reverse=True)}
    
    return count

In [29]:
path = r"C:\Users\Palash Ashok Bhosale\Jupy\Projects\bhashini_integration\Json_tocsv\json_files"
ans=chk_duplicates(path)
with open("duplicate_words_count.json", "w") as f:
    
    json.dump(ans, f)

## Function for traversing through folder-files and creating dataframe and then generating csv out of it

### Added functionality to remove duplicate keys

In [30]:
def create_dataframe_from_json(path):
    dfs = []  
    
    folders = os.listdir(path)
    
#     for folder in folders:
#         new_path = os.path.join(path, folder)
#         files_in_folder = os.listdir(new_path)
        
    for index, file in enumerate(files_in_folder):
        file_name = file.split(".")
        file_path = os.path.join(new_path, file)

        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        dict_json = {f"{file_name[0]}.{k}.{key}": string for k, v in data.items() for key, string in v.items()}

        df = pd.DataFrame.from_dict(dict_json, orient='index').reset_index()
        df.rename(columns={"index": "languagekey", 0: "en_value (current)"}, inplace=True)

        additional_columns = ["hi_translated", "hi_transliterated", "hi_value(curated)",
                              "ta_translated", "ta_transliterated", "ta_value (curated)"]

        df[additional_columns] = pd.DataFrame([[pd.NA] * len(additional_columns)], index=df.index)
        dfs.append(df)
    
    
    result_df = pd.concat(dfs, ignore_index=True)
    result_df=result_df.groupby('en_value (current)').first()
    result_df.reset_index(inplace=True)
    result_df = result_df.reindex(columns=['languagekey', 'en_value (current)', 'hi_translated', 'hi_transliterated', 'hi_value(curated)', 'ta_translated', 'ta_transliterated', 'ta_value(curated)'])
    result_df.to_csv("Sample.csv", index=False)
    
#     return result_df
    return None

In [32]:
path = r"C:\Users\Palash Ashok Bhosale\Jupy\Projects\bhashini_integration\Json_tocsv\json_files"
result_df = create_dataframe_from_json(path)
# print(result_df.head())

# Bhashini Integration

## Get active api keys

In [33]:
import requests


In [34]:
userID='3b9427542f494332b6431cb28dd1ddb1'
ulcaApiKey='52194d3e6f-1d2c-4247-a665-bce41ef2eb4c'

In [204]:
def getactive_api(taskType, userID, ulcaApiKey):
        
    url = "https://meity-auth.ulcacontrib.org/ulca/apis/v0/model/getModelsPipeline"

    payload = json.dumps({
      "pipelineTasks": [
        {
          "taskType": taskType,
          "config": {
            "language": {
              "sourceLanguage": "en"
            }
          }
        }
      ],
      "pipelineRequestConfig": {
        "pipelineId": "64392f96daac500b55c543cd"
      }
    })
    headers = {
      'userID': userID,
      'ulcaApiKey': ulcaApiKey,
      'Content-Type': 'application/json'
    }

    response = requests.request("POST", url, headers=headers, data=payload)


    config_translation=response.text
    config_translation_data = json.loads(config_translation)
#     print(config_translation_data)
    target_languages = config_translation_data['languages'][0]['targetLanguageList']
    api_translation = config_translation_data['pipelineResponseConfig'][0]['config'][0]['serviceId']
#     target_lang=str(input(f"select language from this list {target_languages}: "))

    return api_translation
    

## Store active api for translation and transliteration in active _api list

In [206]:
active_api=[]

In [207]:
active_api.append(getactive_api("transliteration", userID, ulcaApiKey))

In [208]:
active_api.append(getactive_api("translation", userID, ulcaApiKey))

In [209]:
active_api

['ai4bharat/indicxlit--cpu-fsv2', 'ai4bharat/indictrans-v2-all-gpu--t4']

## Function to get translation and transliteration

In [40]:
def bhashini_api_call(task, target_lang, active_api, string):
    if task=="translation":
        api=active_api[1]
    else:
        api= active_api[0]


    url = "https://dhruva-api.bhashini.gov.in/services/inference/pipeline"
    # source="Hi my name is Palash"
    payload = json.dumps({
      "pipelineTasks": [
        {
          "taskType": task,
          "config": {
            "language": {
              "sourceLanguage": "en",
              "targetLanguage": target_lang
            },
            "serviceId": api
          }
        }
      ],
      "inputData": {
        "input": [
          {
            "source": string
    #         "source": "मेरा नाम विहिर है और मैं भाषाावर्ष यूज कर रहा हूँ"
          }
        ]
      }
    })
    headers = {
      'Accept': '*/*',
      'Authorization': '9uAUqhCxaept0FGxeOUkyJ1XQSZtp9GWHy5XLriwyBsS-sovl9RkTe2Gkthwrx2F',
      'Content-Type': 'application/json'
    }

    response = requests.request("POST", url, headers=headers, data=payload)

    translation_json= response.text

    translated_data = json.loads(translation_json)
#     print(api)
    if task=="translation":
        return translated_data['pipelineResponse'][0]['output'][0]['target']
    else:
        return translated_data['pipelineResponse'][0]['output'][0]['target'][0]

## Load CSV

In [41]:
df=pd.read_csv("Sample.csv")

In [42]:
# for index,row in df.iterrows():

#     row["hi_translated"]=bhashini_api_call("translation", "hi", active_api, row["en_value (current)"])

    
#     break

In [44]:
df["hi_translated"] = df["en_value (current)"].progress_apply(lambda x: bhashini_api_call("translation", "hi", active_api, x))


100%|████████████████████████████████████████████████████████████████████████████████| 861/861 [06:42<00:00,  2.14it/s]


In [45]:
df.sample(5)

Unnamed: 0,languagekey,en_value (current),hi_translated,hi_transliterated,hi_value(curated),ta_translated,ta_transliterated,ta_value(curated)
539,en.userProfile.professionalDetails,Professional Details,पेशेवर विवरण,,,,,
301,en.publicsignup.group,Group,समूह,,,,,
839,en.termsandcondition.p41,https://karmayogibharat.gov.in/,https://karmayogibharat.gov.in,,,,,
129,en.publicHome.competencyHub,Competency hub,योग्यता केंद्र,,,,,
582,en.profilehome.roleAndActivities,Role & Activities,भूमिका और गतिविधियाँ,,,,,


In [46]:
df["ta_translated"] = df["en_value (current)"].progress_apply(lambda x: bhashini_api_call("translation", "ta", active_api, x))


100%|████████████████████████████████████████████████████████████████████████████████| 861/861 [06:27<00:00,  2.22it/s]


In [48]:
df["hi_transliterated"] = df["en_value (current)"].progress_apply(lambda x: bhashini_api_call("transliteration", "hi", active_api, x))

100%|████████████████████████████████████████████████████████████████████████████████| 861/861 [10:07<00:00,  1.42it/s]


In [200]:
df["ta_transliterated"] = df["en_value (current)"].progress_apply(lambda x: bhashini_api_call("transliteration", "ta", active_api, x))

100%|████████████████████████████████████████████████████████████████████████████████| 861/861 [11:18<00:00,  1.27it/s]


In [49]:
df.to_csv("Updated_06_12.csv")

In [None]:
# df=pd.read_csv("Updated_06_12.csv")

In [101]:

#     df.head()

Unnamed: 0.1,Unnamed: 0,languagekey,en_value (current),hi_translated,hi_transliterated,hi_value(curated),ta_translated,ta_transliterated,ta_value (curated)
0,0,en.common.karmayogiBharat,Karmayogi Bharat,कर्मयोगी भारत,कर्मयोगी भारत,,கர்மயோகி பாரத்,கர்மயோகி பாரத்,
1,1,en.common.hubs,Hubs,हब,हब्स,,மையங்கள்,ஹப்ஸ்,
2,2,en.common.learn,Learn,सीखें।,लर्न,,கற்றுக் கொள்ளுங்கள்.,லர்ன்,
3,3,en.common.discuss,Discuss,चर्चा करें।,डिस्कस,,விவாதிக்கவும்.,டிஸ்கஸ்,
4,4,en.common.network,Network,नेटवर्क,नेटवर्क,,நெட்வொர்க்,நெட்வொர்க்,


In [None]:
path = r"C:\Users\Palash Ashok Bhosale\Jupy\Projects\bhashini_integration\Json_tocsv\en.json"

with open(path, 'r', encoding='utf-8') as file:
    json_data = file.read()
    
data = json.loads(json_data)

In [198]:
def create_Json(u_in, csv_path, files_path):
    
    files_in_folder = os.listdir(files_path)
       
    for index, file in enumerate(files_in_folder):
        final_dict={}
        file_name = file.split(".")
        file_name=file_name[0]
        file_path = os.path.join(files_path, file)

        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    
        for tag,value in data.items():
            temp_dict={}
            for keyword, v in value.items():
                df=pd.read_csv(csv_path)
                
                try:
                    
                    value_df=df[u_in][df["en_value (current)"]==v].values[0]

                    temp_dict[keyword]=value_df
                    
                except:
                    print(keyword)


            final_dict[tag]=temp_dict

        
        with open(f"output_{file_name}", 'w', encoding='utf-8') as json_file:
            json.dump(final_dict, json_file, indent=2, ensure_ascii=False)
            
    return None

In [199]:
create_Json("hi_translated", r"C:\Users\Palash Ashok Bhosale\Jupy\Projects\bhashini_integration\Json_tocsv\Updated_06_12.csv", r"C:\Users\Palash Ashok Bhosale\Jupy\Projects\bhashini_integration\Json_tocsv\data")