## ML Commons Data Source

In [1]:
import os
import pandas as pd
import glob
import numpy as np
import tqdm

In [2]:
# Remember where the notebook started, then switch into the folder holding
# the source CSV files and collect their file names.
current_directory = os.getcwd()
source_dir = current_directory + '/language_source'
os.chdir(source_dir)
lang_list = glob.glob('*.csv')

In [None]:
# Build a table with one column per source CSV, each holding that file's
# unique words. Columns have different lengths, so shorter ones end up
# padded with NaN at the bottom after the column-wise concat.
word_columns = []

for lang in tqdm.tqdm(lang_list):
    # Only the 'WORD' column is used, so skip parsing everything else.
    unique_words = pd.read_csv(lang, usecols=['WORD'])['WORD'].unique()
    word_columns.append(pd.Series(unique_words, name=lang))

# Concatenate once at the end; concatenating inside the loop re-copies the
# growing frame on every iteration.
word_corpus = pd.concat(word_columns, axis=1) if word_columns else pd.DataFrame()

word_corpus.head()

#### Rename Columns 

In [4]:
# Return to the directory the notebook was started from (stored in
# current_directory before changing into the CSV folder), so the relative
# paths used below resolve against it.
os.chdir(current_directory)

In [5]:
# The language code is the part of each file name before the first underscore.
lang_codes = [file_name.split('_')[0] for file_name in lang_list]

# One single-column frame for the raw file names, one for the derived codes.
lang_list_df = pd.DataFrame(np.array(lang_list)).rename(columns={0: 'raw_input'})
lang_codes_df = pd.DataFrame(np.array(lang_codes)).rename(columns={0: 'lang_code'})

# Side-by-side table: raw_input | lang_code, in the same order as lang_list.
lang_merged = pd.concat([lang_list_df, lang_codes_df], axis=1)

In [6]:
# Load the language-name mapping (the sheet has no header row), name its
# first two columns, and keep only the first row seen for each language code.
full_langs = (
    pd.read_excel('Language_mappings.xlsx', header=None)
    .rename(columns={0: 'language', 1: 'lang_code'})
    .drop_duplicates(subset='lang_code', keep='first')
)

In [7]:
# Attach the full language name to each source file via its language code.
# The left merge keeps one row per source file, in the original file order.
lang_name_list = lang_merged.merge(full_langs, how='left', on='lang_code')

# Codes missing from the mapping come back with a NaN 'language'; fall back
# to the code itself so no corpus column ends up with a NaN label.
col_names = lang_name_list['language'].fillna(lang_name_list['lang_code']).to_list()

In [8]:
# Relabel the corpus columns with the resolved language names (col_names is
# in the same order as the columns).
word_corpus = word_corpus.set_axis(col_names, axis=1)

# Persist the corpus; the utf_8_sig codec prefixes the file with a BOM.
corpus_path = 'word_corpus.csv'
word_corpus.to_csv(corpus_path, encoding='utf_8_sig')

#### Using the M2M100 Hugging Face model to translate words from other languages into English

In [None]:
# word_corpus.csv was written together with its row index, so the first CSV
# column is that index. Read it back as the index; otherwise it shows up as
# an 'Unnamed: 0' data column and is later treated as if it were a language.
df = pd.read_csv('word_corpus.csv', index_col=0)
m2m_language_mappings = pd.read_excel('m2m mapping.xlsx')
mlc_language_codes = pd.read_excel('language_code_mapping.xlsx')

# Columns are padded with NaN below their last word; use empty strings so
# the translation loop can skip blanks with a plain string comparison.
df = df.fillna('')
# English words need no translation.
df.drop(columns=['English'], inplace=True)

In [3]:
# Convert words from non-English languages to English

In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Both the model weights and the tokenizer come from the same checkpoint.
M2M_CHECKPOINT = "facebook/m2m100_418M"

model = M2M100ForConditionalGeneration.from_pretrained(M2M_CHECKPOINT)
tokenizer = M2M100Tokenizer.from_pretrained(M2M_CHECKPOINT)

In [None]:
def _translate_words_to_english(words, src_lang):
    """Translate each non-empty word from ``src_lang`` to English.

    Uses the module-level ``model`` and ``tokenizer``. Empty strings are
    skipped, so the returned list can be shorter than ``words``.
    """
    # The source language and the English target token are the same for
    # every word in the column, so set them up once.
    tokenizer.src_lang = src_lang
    english_token_id = tokenizer.get_lang_id("en")

    translations = []
    for word in tqdm.tqdm(words):
        if word == '':
            continue
        encoded_word = tokenizer(word, return_tensors="pt")
        generated_tokens = model.generate(**encoded_word, forced_bos_token_id=english_token_id)
        translations.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    return translations


# Translate every language column and write one CSV per column, named after
# the column. Columns whose language has no code, or whose code is not in the
# M2M mapping, still get a file, but it contains no words.
for column in df.columns:
    word_list = []

    # Exact comparison: a regex/substring test can match the wrong row
    # (e.g. code "as" inside "ast") or disagree with the exact lookup.
    code_matches = mlc_language_codes.loc[mlc_language_codes['Language'] == column, 'lang_code']
    if not code_matches.empty:
        lang_code = code_matches.iloc[0]
        if (m2m_language_mappings['lang_code'] == lang_code).any():
            word_list = _translate_words_to_english(df[column], lang_code)

    converted_words = pd.DataFrame({column: word_list})
    converted_words.to_csv(column)
    print(column)

In [None]:
def _compact_column(col):
    """Return the column's non-blank values packed at the top, in order."""
    # Blanks may be NaN or the empty strings that replaced NaN earlier;
    # dropna() alone would leave the empty strings in place.
    non_blank = col[col.notna() & (col != '')]
    return pd.Series(non_blank.values)


# Shift every column's words up so the i-th row holds the i-th real word,
# then pad the shorter columns with empty strings.
df = df.apply(_compact_column).fillna('')
df.to_csv("nan_shifted_up_word_corpus.csv")

In [4]:
# Merge translated files with original

In [None]:
# Load the compacted word corpus. Its first CSV column is the row index that
# to_csv wrote alongside the data, so keep everything except that column.
corpus_with_index = pd.read_csv('nan_shifted_up_word_corpus.csv')
df = corpus_with_index.iloc[:, 1:]

In [None]:
os.chdir(current_directory + '/lang_translated_files')

# Every entry in this folder is treated as a translated-words CSV.
translated_files = glob.glob('*')


def _read_english_words(path):
    """Read one translated file and return only its translated-word column."""
    raw = pd.read_csv(path, names=['index', 'english_words'])
    # Row 0 is the file's own header line and column 0 is its saved index.
    return raw.iloc[1:, 1:]


# Stack the English words from all files into a single column.
english_word_frames = [_read_english_words(file) for file in translated_files]
df_english_words = pd.concat([pd.DataFrame(), *english_word_frames], ignore_index=True)

In [None]:
# Keep each English word once, discard missing entries, and renumber the
# rows from zero.
df_english_words = (
    df_english_words
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [None]:
def _pair_original_with_english(file):
    """Return the translated file's columns next to the original words.

    The translated column is renamed to ``<file>_eng``; the original words
    come from the corpus column with the same name as the file.
    """
    original_words = df[[file]]
    english = pd.read_csv(file).rename(columns={file: file + '_eng'})
    return pd.concat([english, original_words], axis=1)


# Map each translated file name to its combined (English, original) frame.
d = {file: _pair_original_with_english(file) for file in translated_files}

In [None]:
# For every language, line up its original words against the shared list of
# English words. Original words that share one English translation are
# joined with '/'.
language_frames = []

# `tqdm` is imported as a module, so the progress bar is `tqdm.tqdm`.
for file in tqdm.tqdm(translated_files):
    temp_df = df_english_words.merge(d[file], how='left', left_on='english_words', right_on=(file + '_eng'))
    temp_df = temp_df.groupby(['english_words'])[file].apply(lambda x: '/'.join(x.astype(str))).reset_index()
    # Keep only the language column.
    # NOTE(review): this also drops 'english_words', so the final CSV has no
    # English column and rows are identified by position only (groupby
    # ordering of the English words) - confirm this is intended.
    temp_df = temp_df.iloc[:, 1:]
    language_frames.append(temp_df)

# Concatenate once instead of growing the frame inside the loop.
translated_words_df = pd.concat(language_frames, axis=1) if language_frames else pd.DataFrame()

os.chdir(current_directory)
# English words with no match in a language show up as the string 'nan'
# after the astype(str) join; blank those cells out.
translated_words_df.replace('nan', "", inplace=True)
translated_words_df.to_csv('mlcommons_word_corpus_final.csv')