In [None]:
import pandas as pd 
import numpy as np 
import json
import time
from google.colab import drive
drive.mount('/content/drive')
LIBRARY_PATH = '/content/drive/MyDrive/NLP PROJECT/Finals/'

# Import Google Translate
!pip install googletrans==4.0.0rc1
import googletrans
from googletrans import Translator

In [2]:
# Training pairs (the frame every translation cell below slices by language pair)
data_path = LIBRARY_PATH + 'data/processed/paired_Raw.csv'
df = pd.read_csv(data_path)
# Evaluation pairs; `data_path` is deliberately reused for the second file
data_path = LIBRARY_PATH + 'data/processed/paired_eval.csv'
df_eval = pd.read_csv(data_path)

## Current data languages proportions

In [None]:
# Training data set: number of rows per (url1_lang, url2_lang) combination
df.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,ar,ar,274
1,de,de,857
2,de,en,577
3,en,en,1800
4,es,es,570
5,fr,fr,72
6,pl,pl,349
7,tr,tr,465


In [None]:
# Evaluation data set: number of rows per (url1_lang, url2_lang) combination
df_eval.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,ar,ar,298
1,de,de,608
2,de,en,185
3,de,fr,116
4,de,pl,35
5,en,en,236
6,es,en,496
7,es,es,243
8,es,it,320
9,fr,fr,111


## Functions

In [None]:
def myTranslate(translator, text, src_lang, dest_lang):
  """Translate one cell value, falling back to the input when that fails.

  Args:
    translator: object exposing ``translate(text, dest=..., src=...)`` whose
      result carries the translation in a ``.text`` attribute.
    text: the value to translate. Missing values (``None``, NaN, or the
      strings ``'NaN'``/``'nan'``) are returned untouched.
    src_lang: source language code, forwarded as ``src``.
    dest_lang: target language code, forwarded as ``dest``.

  Returns:
    The translated string, or ``text`` unchanged when it is missing or when
    the translation call raises an ``Exception``.
  """
  try:
    # Nothing to translate for missing values, so skip the translator call.
    if text is None or str(text) in ('NaN', 'nan'):
      return text
    return translator.translate(text, dest=dest_lang, src=src_lang).text
  except Exception:
    # Best effort: keep the original text. Catching ``Exception`` instead of
    # using a bare ``except`` lets KeyboardInterrupt/SystemExit propagate, so
    # a long-running translation loop can still be stopped.
    return text


def _translateColumns(df, columns, src_lang, dest_lang):
  """Translate ``columns`` of ``df`` in place, printing each finished column."""
  translator = Translator()
  print('Translated: ', end='')
  for col in columns:
    df[col] = df[col].apply(lambda x: myTranslate(translator, x, src_lang, dest_lang))
    print(col, end=', ')
  print('')
  return df


def translateData(df, src_lang, dest_lang):
  """Translate the text/title/meta columns of both articles in every row.

  Args:
    df: DataFrame holding the ``*1`` and ``*2`` text columns; it is modified
      in place.
    src_lang: source language code passed to the translator.
    dest_lang: target language code passed to the translator.

  Returns:
    The same ``df`` object, with the eight columns overwritten.
  """
  columns_to_translate = ['text1', 'title1', 'meta_keywords1', 'meta_description1', 'text2', 'title2', 'meta_keywords2', 'meta_description2']
  return _translateColumns(df, columns_to_translate, src_lang, dest_lang)


def translateOnePair(df, src_lang, dest_lang, first):
  """Translate the columns of only one article of each pair.

  Args:
    df: DataFrame to modify in place.
    src_lang: source language code passed to the translator.
    dest_lang: target language code passed to the translator.
    first: truthy to translate the ``*1`` columns, falsy for the ``*2`` ones.

  Returns:
    The same ``df`` object, with the four selected columns overwritten.
  """
  columns_to_translate = ['text1', 'title1', 'meta_keywords1', 'meta_description1'] if first else ['text2', 'title2', 'meta_keywords2', 'meta_description2']
  return _translateColumns(df, columns_to_translate, src_lang, dest_lang)


def myDetect(translator, text):
  """Return the language code detected for ``text``.

  Args:
    translator: object exposing ``detect(text)`` whose result has a ``.lang``
      attribute.
    text: the value whose language should be detected.

  Returns:
    ``translator.detect(text).lang``, or the sentinel ``'skipped'`` when
    detection raises an ``Exception``.
  """
  try:
    return translator.detect(text).lang
  except Exception:
    # ``Exception`` rather than a bare ``except`` so that Ctrl-C still
    # interrupts a long detection pass.
    return 'skipped'

def detectDataLanguage(df):
  """Count the detected language of every cell in the eight text columns.

  Args:
    df: DataFrame holding the ``text``/``title``/``meta_keywords``/
      ``meta_description`` columns for both articles (suffixes 1 and 2).

  Returns:
    A dict mapping each value returned by ``myDetect`` to the number of cells
    it was returned for. The key ``'skipped'`` (failed detections) is always
    present, even when its count is 0.
  """
  columns_to_detect = ['text1', 'title1', 'meta_keywords1', 'meta_description1', 'text2', 'title2', 'meta_keywords2', 'meta_description2']
  translator = Translator()
  languages = {'skipped': 0}
  print('Detected: ', end='')
  for col in columns_to_detect:
    detected = df[col].apply(lambda x: myDetect(translator, x))
    # Iterate the values directly: Series.iteritems() no longer exists in
    # pandas 2.x, and the index labels were never used here anyway.
    for val in detected:
      languages[val] = languages.get(val, 0) + 1
    print(col, end=', ')
  print('')
  return languages


def filterUntranslated(df, text1_lang, text2_lang, verbose=False):
  """Drop, in place, rows whose text or title is not in the expected language.

  A row is removed when the language detected for ``text1``/``title1``
  differs from ``text1_lang`` or the one detected for ``text2``/``title2``
  differs from ``text2_lang``. Cells for which detection fails are reported
  as ``'skipped'`` by ``myDetect`` and therefore also count as mismatches.

  Args:
    df: DataFrame to filter; rows are dropped from this object in place.
    text1_lang: language code expected for the first article's columns.
    text2_lang: language code expected for the second article's columns.
    verbose: when true, also print the detected language of every cell and
      the mismatching row labels per column (debugging aid, off by default).

  Returns:
    None. ``df`` is modified in place.
  """
  text1_columns = ['text1', 'title1']
  text2_columns = ['text2', 'title2']
  # Expected language per checked column, in the order the columns are checked.
  expected_lang = {col: text1_lang for col in text1_columns}
  expected_lang.update({col: text2_lang for col in text2_columns})
  translator = Translator()
  rows_to_remove = set()

  # Find rows that have at least one column with a non-matching language
  print('Checked: ', end='')
  for col, expected in expected_lang.items():
    detected = df[col].apply(lambda x: myDetect(translator, x))
    # Series.iteritems() was removed in pandas 2.x; select the offending index
    # labels with a boolean mask instead of looping row by row.
    mismatched = detected[detected != expected].index.tolist()
    if verbose:
      for i, val in detected.items():
        print(col, " => ", val, ", ", i)
      print('mismatched rows: ', mismatched)
    rows_to_remove.update(mismatched)
    print(col, end=', ')
  print('')

  # Drop those rows
  print('Removed rows: ',list(rows_to_remove))
  df.drop(list(rows_to_remove), inplace=True)


## Back Translation

E.g. fr-fr is translated to en-en and then back to fr-fr.

#### Translations

In [None]:
# Augmenting fr-fr sets
start = time.time()
df_fr_fr = df.loc[(df['url1_lang'] == 'fr') & (df['url2_lang'] == 'fr')].copy()
# Back-translation fr -> en -> fr; translateData rewrites the frame in place
translateData(df_fr_fr, 'fr', 'en')
translateData(df_fr_fr, 'en', 'fr')
df_fr_fr['back_translation'] = 1
df_fr_fr['train_translation'] = 0
df_fr_fr.to_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_fr_fr.csv', index = False)
# The label previously said 'de-de' (copy-paste slip); this cell times fr-fr
print('elapsed time for fr-fr', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
elapsed time for de-de 672.8191566467285


In [None]:
# Tally the languages detected in the back-translated fr-fr text columns
detectDataLanguage(df_fr_fr)

Detected: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 


{'bm': 2,
 'en': 64,
 'es': 1,
 'fr': 292,
 'ht': 30,
 'nl': 1,
 'skipped': 185,
 'sv': 1}

In [None]:
# Augmenting ar-ar sets
start = time.time()
df_ar_ar = df.loc[(df['url1_lang'] == 'ar') & (df['url2_lang'] == 'ar')].copy()
# Back-translation ar -> en -> ar; translateData rewrites the frame in place
translateData(df_ar_ar, 'ar', 'en')
translateData(df_ar_ar, 'en', 'ar')
df_ar_ar['back_translation'] = 1
df_ar_ar['train_translation'] = 0
df_ar_ar.to_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_ar_ar.csv', index = False)
# The label previously said 'de-de' (copy-paste slip); this cell times ar-ar
print('elapsed time for ar-ar', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
elapsed time for de-de 2672.976161956787


In [None]:
# Tally the languages detected in the back-translated ar-ar text columns
detectDataLanguage(df_ar_ar)

Detected: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 


{'ar': 1956,
 'az': 1,
 'en': 60,
 'es': 1,
 'fa': 1,
 'fr': 1,
 'hi': 1,
 'ht': 127,
 'id': 1,
 'ilo': 1,
 'lb': 1,
 'om': 1,
 'skipped': 38,
 'ur': 2}

In [None]:
# Augmenting pl-pl sets
start = time.time()
df_pl_pl = df.loc[(df['url1_lang'] == 'pl') & (df['url2_lang'] == 'pl')].copy()
# Back-translation pl -> en -> pl; translateData rewrites the frame in place
translateData(df_pl_pl, 'pl', 'en')
translateData(df_pl_pl, 'en', 'pl')
df_pl_pl['back_translation'] = 1
df_pl_pl['train_translation'] = 0
df_pl_pl.to_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_pl_pl.csv', index = False)
# The label previously said 'de-de' (copy-paste slip); this cell times pl-pl
print('elapsed time for pl-pl', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
elapsed time for de-de 2989.758010864258


In [None]:
# Tally the languages detected in the back-translated pl-pl text columns
detectDataLanguage(df_pl_pl)

Detected: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 


{'da': 1,
 'de': 1,
 'el': 1,
 'en': 621,
 'ht': 180,
 'lb': 1,
 'lt': 1,
 'nl': 1,
 'pl': 1587,
 'skipped': 398}

In [None]:
# Back-translation augmentation for the tr-tr pairs: tr -> en -> tr
start = time.time()
df_tr_tr = df[df['url1_lang'].eq('tr') & df['url2_lang'].eq('tr')].copy()
translateData(df_tr_tr, src_lang='tr', dest_lang='en')
translateData(df_tr_tr, src_lang='en', dest_lang='tr')
# Flag the provenance of these rows before caching them
df_tr_tr = df_tr_tr.assign(back_translation=1, train_translation=0)
df_tr_tr.to_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_tr_tr.csv', index=False)
print('elapsed time for tr-tr', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
elapsed time for tr-tr 4838.9230308532715


In [None]:
# Tally the languages detected in the back-translated tr-tr text columns
detectDataLanguage(df_tr_tr)

Detected: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 


{'az': 1,
 'bn': 1,
 'en': 483,
 'et': 1,
 'ht': 137,
 'ku': 1,
 'mr': 1,
 'pl': 1,
 'pt': 1,
 'skipped': 198,
 'te': 3,
 'tk': 1,
 'tr': 2891}

In [None]:
# Back-translation augmentation for the es-es pairs: es -> en -> es
start = time.time()
df_es_es = df[df['url1_lang'].eq('es') & df['url2_lang'].eq('es')].copy()
translateData(df_es_es, src_lang='es', dest_lang='en')
translateData(df_es_es, src_lang='en', dest_lang='es')
# Flag the provenance of these rows before caching them
df_es_es = df_es_es.assign(back_translation=1, train_translation=0)
df_es_es.to_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_es_es.csv', index=False)
print('elapsed time for es-es', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
elapsed time for es-es 5857.637191057205


In [None]:
# Tally the languages detected in the back-translated es-es text columns
detectDataLanguage(df_es_es)

Detected: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 


{'bm': 1,
 'ca': 9,
 'ceb': 3,
 'co': 2,
 'en': 570,
 'eo': 1,
 'es': 3404,
 'eu': 2,
 'fi': 1,
 'gl': 5,
 'gn': 16,
 'ht': 297,
 'id': 1,
 'it': 3,
 'pt': 30,
 'qu': 1,
 'ro': 1,
 'skipped': 210,
 'te': 2,
 'zh-CN': 1}

In [None]:
# Back-translation augmentation for the de-de pairs: de -> en -> de
start = time.time()
df_de_de = df[df['url1_lang'].eq('de') & df['url2_lang'].eq('de')].copy()
translateData(df_de_de, src_lang='de', dest_lang='en')
translateData(df_de_de, src_lang='en', dest_lang='de')
# Flag the provenance of these rows before caching them
df_de_de = df_de_de.assign(back_translation=1, train_translation=0)
df_de_de.to_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_de_de.csv', index=False)
print('elapsed time for de-de', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
elapsed time for de-de 8875.553323745728


In [None]:
# Tally the languages detected in the back-translated de-de text columns
detectDataLanguage(df_de_de)

Detected: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 


{'az': 1,
 'bn': 1,
 'da': 1,
 'de': 5166,
 'en': 829,
 'es': 1,
 'et': 1,
 'fi': 1,
 'fr': 1,
 'ht': 184,
 'id': 1,
 'it': 1,
 'lb': 8,
 'nl': 1,
 'pl': 1,
 'pt': 2,
 'rw': 1,
 'skipped': 651,
 'sv': 3,
 'te': 1}

#### Filtering and Appending

In [None]:
# Reload the cached fr-fr back-translation and drop rows whose text/title is not detected as 'fr'
df_fr_fr = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_fr_fr.csv')
start = time.time()
filterUntranslated(df_fr_fr, 'fr', 'fr')  # removes rows from df_fr_fr in place
print('filter time for fr-fr', time.time() - start)
# First write: default mode creates/overwrites the combined file and writes the CSV header
df_fr_fr.to_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv', index = False)

Checked: text1, title1, text2, title2, 
Removed rows:  [2, 68, 15, 16, 50, 27, 30]
filter time for fr-fr 13.63216495513916


In [None]:
# Reload the cached ar-ar back-translation and drop rows whose text/title is not detected as 'ar'
df_ar_ar = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_ar_ar.csv')
start = time.time()
filterUntranslated(df_ar_ar, 'ar', 'ar')  # removes rows from df_ar_ar in place
print('filter time for ar-ar', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_ar_ar.to_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [130, 8, 136, 139, 15, 149, 24, 155, 34, 167, 42, 171, 172, 48, 182, 56, 197, 199, 72, 204, 79, 225, 105, 111, 244, 125, 116, 117, 118, 253]
filter time for ar-ar 836.9709875583649


In [None]:
# Reload the cached pl-pl back-translation and drop rows whose text/title is not detected as 'pl'
df_pl_pl = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_pl_pl.csv')
start = time.time()
filterUntranslated(df_pl_pl, 'pl', 'pl')  # removes rows from df_pl_pl in place
print('filter time for pl-pl', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_pl_pl.to_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [1, 4, 5, 10, 11, 12, 14, 16, 17, 18, 20, 21, 22, 24, 25, 31, 34, 36, 39, 47, 51, 52, 53, 61, 62, 63, 65, 66, 67, 69, 75, 76, 77, 80, 81, 84, 85, 86, 90, 91, 93, 94, 96, 99, 100, 105, 110, 112, 113, 114, 116, 117, 119, 120, 121, 123, 126, 128, 134, 135, 143, 145, 148, 151, 154, 155, 161, 167, 170, 172, 176, 187, 192, 195, 208, 212, 213, 214, 215, 218, 221, 222, 225, 230, 231, 234, 235, 236, 237, 238, 241, 242, 243, 244, 246, 249, 252, 254, 255, 263, 265, 277, 285, 289, 293, 294, 298, 302, 310, 316, 318, 328, 330, 331, 335, 345, 348]
filter time for pl-pl 991.4864027500153


In [None]:
# Reload the cached tr-tr back-translation and drop rows whose text/title is not detected as 'tr'
df_tr_tr = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_tr_tr.csv')
start = time.time()
filterUntranslated(df_tr_tr, 'tr', 'tr')  # removes rows from df_tr_tr in place
print('filter time for tr-tr', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_tr_tr.to_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [256, 2, 260, 388, 6, 7, 140, 268, 14, 269, 270, 16, 271, 275, 404, 272, 150, 151, 24, 409, 154, 284, 412, 33, 165, 167, 298, 171, 44, 45, 301, 47, 427, 177, 46, 435, 433, 181, 184, 57, 187, 59, 60, 61, 66, 195, 198, 454, 330, 202, 76, 333, 206, 207, 340, 341, 349, 98, 227, 356, 99, 102, 231, 104, 367, 245, 374, 377, 379, 254]
filter time for tr-tr 1596.9450829029083


In [None]:
# Reload the cached es-es back-translation and drop rows whose text/title is not detected as 'es'
df_es_es = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/aug_df_es_es.csv')
start = time.time()
filterUntranslated(df_es_es, 'es', 'es')  # removes rows from df_es_es in place
print('filter time for es-es', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_es_es.to_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [259, 260, 135, 9, 138, 266, 12, 140, 142, 143, 398, 17, 528, 531, 535, 281, 153, 412, 156, 286, 543, 291, 548, 42, 301, 46, 47, 51, 437, 441, 394, 68, 196, 453, 454, 73, 75, 76, 460, 206, 207, 213, 214, 475, 479, 361, 106, 490, 494, 367, 242, 116, 117, 505, 122]
filter time for es-es 1794.7882816791534


In [None]:
# Read back the combined back-translation file and count rows per language pair
df_aug = pd.read_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv')
df_aug.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,ar,ar,244
1,es,es,515
2,fr,fr,65
3,pl,pl,232
4,tr,tr,395


## Translation Train

e.g. en-en to zh-zh \\
e.g. en-en to zh-en

#### Translation

In [None]:
# Build de-fr pairs: start from the de-en rows and translate the *2 columns en -> fr
start = time.time()
df_de_fr = df[df['url1_lang'].eq('de') & df['url2_lang'].eq('en')].copy()
translateOnePair(df_de_fr, src_lang='en', dest_lang='fr', first=False)
# Provenance flags plus the new language label of the second article
df_de_fr = df_de_fr.assign(back_translation=0, train_translation=1, url2_lang='fr')
df_de_fr.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_de_fr.csv', index=False)
print('elapsed time for de-fr', time.time() - start)

Translated: text2, title2, meta_keywords2, meta_description2, 
elapsed time for de-fr 1246.6776025295258


In [None]:
# Build de-pl pairs: start from the de-en rows and translate the *2 columns en -> pl
start = time.time()
df_de_pl = df[df['url1_lang'].eq('de') & df['url2_lang'].eq('en')].copy()
translateOnePair(df_de_pl, src_lang='en', dest_lang='pl', first=False)
# Provenance flags plus the new language label of the second article
df_de_pl = df_de_pl.assign(back_translation=0, train_translation=2, url2_lang='pl')
df_de_pl.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_de_pl.csv', index=False)
print('elapsed time for de-pl', time.time() - start)

Translated: text2, title2, meta_keywords2, meta_description2, 
elapsed time for de-pl 1361.7799665927887


In [None]:
# Build es-en pairs: start from the es-es rows and translate the *2 columns es -> en
start = time.time()
df_es_en = df[df['url1_lang'].eq('es') & df['url2_lang'].eq('es')].copy()
translateOnePair(df_es_en, src_lang='es', dest_lang='en', first=False)
# Provenance flags plus the new language label of the second article
df_es_en = df_es_en.assign(back_translation=0, train_translation=1, url2_lang='en')
df_es_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_en.csv', index=False)
print('elapsed time for es-en', time.time() - start)

Translated: text2, title2, meta_keywords2, meta_description2, 
elapsed time for es-en 1446.1608302593231


In [None]:
# Build es-it pairs: start from the es-es rows and translate the *2 columns es -> it
start = time.time()
df_es_it = df[df['url1_lang'].eq('es') & df['url2_lang'].eq('es')].copy()
translateOnePair(df_es_it, src_lang='es', dest_lang='it', first=False)
# Provenance flags plus the new language label of the second article
df_es_it = df_es_it.assign(back_translation=0, train_translation=2, url2_lang='it')
df_es_it.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_it.csv', index=False)
print('elapsed time for es-it', time.time() - start)

Translated: text2, title2, meta_keywords2, meta_description2, 
elapsed time for es-it 2528.6507942676544


In [None]:
# Build it-it pairs: reuse the in-memory es-it frame and translate the *1 columns es -> it
start = time.time()
df_it_it = df_es_it.copy()
translateOnePair(df_it_it, src_lang='es', dest_lang='it', first=True)
# Provenance flags plus the new language label of the first article
df_it_it = df_it_it.assign(back_translation=0, train_translation=3, url1_lang='it')
df_it_it.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_it_it.csv', index=False)
print('elapsed time for it-it', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, 
elapsed time for it-it 2610.129888534546


In [None]:
# Build fr-pl pairs: start from the fr-fr rows and translate the *2 columns fr -> pl
start = time.time()
df_fr_pl = df[df['url1_lang'].eq('fr') & df['url2_lang'].eq('fr')].copy()
translateOnePair(df_fr_pl, src_lang='fr', dest_lang='pl', first=False)
# Provenance flags plus the new language label of the second article
df_fr_pl = df_fr_pl.assign(back_translation=0, train_translation=1, url2_lang='pl')
df_fr_pl.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_fr_pl.csv', index=False)
print('elapsed time for fr-pl', time.time() - start)

Translated: text2, title2, meta_keywords2, meta_description2, 
elapsed time for fr-pl 345.8586483001709


In [None]:
# Build pl-en pairs: start from the pl-pl rows and translate the *2 columns pl -> en
start = time.time()
df_pl_en = df[df['url1_lang'].eq('pl') & df['url2_lang'].eq('pl')].copy()
translateOnePair(df_pl_en, src_lang='pl', dest_lang='en', first=False)
# Provenance flags plus the new language label of the second article
df_pl_en = df_pl_en.assign(back_translation=0, train_translation=1, url2_lang='en')
df_pl_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_pl_en.csv', index=False)
print('elapsed time for pl-en', time.time() - start)

Translated: text2, title2, meta_keywords2, meta_description2, 
elapsed time for pl-en 748.289648771286


In [None]:
# All en-en rows; the ru/zh generation cells below take slices of this frame
df_en_en = df[df['url1_lang'].eq('en') & df['url2_lang'].eq('en')].copy()

In [None]:
# Build ru-ru pairs: translate both articles of the first 550 en-en rows en -> ru
start = time.time()
df_ru_ru = df_en_en.head(550).copy()
translateData(df_ru_ru, src_lang='en', dest_lang='ru')
# Provenance flags plus the new language labels of both articles
df_ru_ru = df_ru_ru.assign(back_translation=0, train_translation=1, url1_lang='ru', url2_lang='ru')
df_ru_ru.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_ru_ru.csv', index=False)
print('elapsed time for ru-ru', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, text2, title2, meta_keywords2, meta_description2, 
elapsed time for ru-ru 2758.0467212200165


In [None]:
# Build zh-en pairs: translate the *1 columns of en-en rows 550..1099 en -> zh-cn
start = time.time()
df_zh_en = df_en_en.iloc[550:1100].copy()
translateOnePair(df_zh_en, src_lang='en', dest_lang='zh-cn', first=True)
# Provenance flags plus the new language label of the first article
df_zh_en = df_zh_en.assign(back_translation=0, train_translation=2, url1_lang='zh')
df_zh_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_en.csv', index=False)
print('elapsed time for zh-en', time.time() - start)

Translated: text1, title1, meta_keywords1, meta_description1, 
elapsed time for zh-en 1417.3064422607422


In [None]:
# Build zh-zh pairs: reuse the in-memory zh-en frame and translate the *2 columns en -> zh-cn
start = time.time()
df_zh_zh = df_zh_en.copy()
translateOnePair(df_zh_zh, src_lang='en', dest_lang='zh-cn', first=False)
# Provenance flags plus the new language label of the second article
df_zh_zh = df_zh_zh.assign(back_translation=0, train_translation=3, url2_lang='zh')
df_zh_zh.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_zh.csv', index=False)
print('elapsed time for zh-zh', time.time() - start)

Translated: text2, title2, meta_keywords2, meta_description2, 
elapsed time for zh-zh 1365.1255292892456


#### Filtering and appending

In [None]:
# Reload the generated de-fr set and drop rows whose detected languages are not de / fr
df_de_fr = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_de_fr.csv')
start = time.time()
filterUntranslated(df_de_fr, 'de', 'fr')  # removes rows from df_de_fr in place
print('filter time for de-fr', time.time() - start)
# First write: default mode creates/overwrites the combined generated file and writes the CSV header
df_de_fr.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', index = False)

Checked: text1, title1, text2, title2, 
Removed rows:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 139, 140, 144, 145, 146, 147, 154, 156, 158, 159, 161, 166, 169, 174, 179, 180, 183, 186, 188, 189, 191, 192, 193, 194, 195, 196, 197, 203, 207, 209, 210, 211, 212, 215, 216, 218, 222, 227, 229, 232, 233, 234, 236, 238, 239, 241, 245, 250, 252, 260, 261, 262, 263, 266, 267, 269, 272, 273, 274, 278, 282, 285, 286, 287, 288, 289, 291, 294, 299, 301, 302, 304, 306, 307,

In [None]:
# Reload the generated de-pl set and drop rows whose detected languages are not de / pl
df_de_pl = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_de_pl.csv')
start = time.time()
filterUntranslated(df_de_pl, 'de', 'pl')  # removes rows from df_de_pl in place
print('filter time for de-pl', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_de_pl.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210,

In [None]:
# Reload the generated es-en set and drop rows whose detected languages are not es / en
df_es_en = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_en.csv')
start = time.time()
filterUntranslated(df_es_en, 'es', 'en')  # removes rows from df_es_en in place
print('filter time for es-en', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_es_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [512, 516, 9, 12, 15, 17, 531, 534, 535, 543, 548, 556, 46, 47, 51, 565, 68, 69, 73, 75, 76, 106, 107, 116, 117, 122, 125, 135, 138, 140, 143, 153, 155, 156, 158, 159, 160, 169, 176, 178, 183, 184, 187, 194, 196, 198, 199, 200, 201, 202, 203, 206, 207, 208, 209, 210, 211, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 34

In [None]:
# Reload the generated es-it set and drop rows whose detected languages are not es / it
df_es_it = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_it.csv')
start = time.time()
filterUntranslated(df_es_it, 'es', 'it')  # removes rows from df_es_it in place
print('filter time for es-it', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_es_it.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

In [None]:
# Reload the generated it-it set and drop rows whose detected languages are not it / it
df_it_it = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_it_it.csv')
start = time.time()
filterUntranslated(df_it_it, 'it', 'it')  # removes rows from df_it_it in place
print('filter time for it-it', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_it_it.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [9, 12, 17, 531, 535, 23, 543, 548, 39, 42, 44, 46, 47, 51, 62, 64, 65, 67, 68, 69, 73, 75, 76, 84, 91, 95, 106, 116, 117, 122, 125, 135, 138, 140, 142, 143, 153, 156, 160, 176, 178, 185, 187, 196, 206, 213, 214, 217, 242, 244, 245, 259, 260, 272, 277, 278, 279, 281, 283, 291, 301, 317, 334, 338, 342, 361, 364, 367, 372, 393, 394, 398, 412, 418, 436, 437, 441, 460, 481, 490, 494, 505]
filter time for it-it 1442.3392827510834


In [None]:
# Reload the generated fr-pl set and drop rows whose detected languages are not fr / pl
df_fr_pl = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_fr_pl.csv')
start = time.time()
filterUntranslated(df_fr_pl, 'fr', 'pl')  # removes rows from df_fr_pl in place
print('filter time for fr-pl', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_fr_pl.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [2, 68, 4, 5, 15, 16, 50, 25, 27, 30]
filter time for fr-pl 238.0404486656189


In [None]:
# Reload the generated pl-en set and drop rows whose detected languages are not pl / en
df_pl_en = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_pl_en.csv')
start = time.time()
filterUntranslated(df_pl_en, 'pl', 'en')  # removes rows from df_pl_en in place
print('filter time for pl-en', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_pl_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [1, 5, 10, 11, 12, 17, 18, 20, 21, 22, 23, 24, 25, 26, 31, 34, 36, 39, 47, 51, 53, 60, 61, 62, 63, 64, 66, 67, 69, 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 240, 241, 242, 243, 244

In [None]:
# Reload the generated ru-ru set and drop rows whose detected languages are not ru / ru
df_ru_ru = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_ru_ru.csv')
start = time.time()
filterUntranslated(df_ru_ru, 'ru', 'ru')  # removes rows from df_ru_ru in place
print('filter time for ru-ru', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_ru_ru.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

Checked: text1, title1, text2, title2, 
Removed rows:  [512, 520, 522, 523, 14, 527, 528, 529, 530, 531, 19, 22, 24, 541, 543, 32, 545, 31, 41, 42, 43, 46, 47, 48, 50, 52, 53, 59, 62, 63, 65, 67, 70, 72, 73, 76, 77, 78, 83, 90, 91, 96, 101, 103, 104, 105, 106, 107, 110, 112, 114, 117, 118, 121, 122, 126, 130, 131, 135, 138, 142, 151, 153, 154, 155, 162, 170, 178, 180, 181, 185, 189, 190, 191, 195, 196, 201, 205, 209, 210, 217, 218, 223, 229, 231, 232, 234, 235, 245, 247, 250, 251, 256, 259, 266, 269, 279, 280, 285, 288, 293, 294, 299, 300, 303, 305, 309, 310, 311, 318, 320, 322, 324, 328, 329, 333, 335, 336, 340, 342, 347, 352, 357, 363, 364, 368, 376, 379, 386, 392, 395, 396, 398, 401, 405, 406, 407, 409, 411, 414, 416, 419, 420, 425, 431, 434, 435, 436, 441, 442, 443, 448, 455, 457, 458, 460, 461, 462, 467, 469, 472, 474, 478, 480, 481, 491, 495, 497, 498, 503, 507, 509]
filter time for ru-ru 1765.4183378219604


In [None]:
# Reload the generated zh-en set and drop rows whose detected languages are not zh-CN / en
# (the filter compares against 'zh-CN' while the url1_lang column stores 'zh')
df_zh_en = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_en.csv')
start = time.time()
filterUntranslated(df_zh_en, 'zh-CN', 'en')  # removes rows from df_zh_en in place
print('filter time for zh-en', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_zh_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

In [None]:
# Reload the generated zh-zh set and drop rows whose detected languages are not zh-CN / zh-CN
df_zh_zh = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_zh.csv')
start = time.time()
filterUntranslated(df_zh_zh, 'zh-CN', 'zh-CN')  # removes rows from df_zh_zh in place
# The label previously said 'zh-en' (copy-paste slip); this cell times zh-zh
print('filter time for zh-zh', time.time() - start)
# mode='a' without header: re-running this cell appends the same rows a second time
df_zh_zh.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv', mode='a', index=False, header=False) # Append on existing file

In [None]:
# Read back the combined generated-pairs file and count rows per language pair
df_gen = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv')
df_gen.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,de,fr,103
1,de,pl,73
2,es,en,287
3,es,it,81
4,fr,pl,62
5,it,it,488
6,pl,en,76
7,ru,ru,378
8,zh,en,76
9,zh,zh,43


## Appending both sets

In [None]:
# NOTE(review): no visible cell writes 'temporal/aug_paired_Raw.csv'; the back-translation
# filter cells write 'data/processed/augmented_paired_Raw.csv' instead. Confirm this file
# was produced by hand (e.g. a copy/rename) or align the paths, otherwise a fresh
# Restart & Run All will fail here with a missing-file error.
df_aug = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/aug_paired_Raw.csv')
df_gen = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_paired_Raw.csv')
# Stack back-translated and generated rows; ignore_index renumbers the combined frame
df_augmented = pd.concat([df_aug, df_gen], ignore_index=True)

In [None]:
# Rows per language pair in the back-translated set
df_aug.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,ar,ar,244
1,es,es,515
2,fr,fr,65
3,pl,pl,232
4,tr,tr,395


In [None]:
# Rows per language pair in the generated (translation-train) set
df_gen.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,de,fr,103
1,de,pl,73
2,es,en,287
3,es,it,81
4,fr,pl,62
5,it,it,488
6,pl,en,76
7,ru,ru,378
8,zh,en,76
9,zh,zh,43


In [None]:
# Rows per language pair after concatenating both sets
df_augmented.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,ar,ar,244
1,de,fr,103
2,de,pl,73
3,es,en,287
4,es,es,515
5,es,it,81
6,fr,fr,65
7,fr,pl,62
8,it,it,488
9,pl,en,76


In [None]:
# Persist the combined set. This is the same path the back-translation filter cells
# wrote to, so their intermediate file is overwritten here.
df_augmented.to_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv', index = False)

## Additional translations

In [None]:
# Build zh-en pairs from ALL en-en rows: translate the *1 columns en -> zh-cn,
# then drop rows whose detected languages are not zh-CN / en
start = time.time()
df_zh_en = df[df['url1_lang'].eq('en') & df['url2_lang'].eq('en')].copy()
translateOnePair(df_zh_en, src_lang='en', dest_lang='zh-cn', first=True)
filterUntranslated(df_zh_en, text1_lang='zh-CN', text2_lang='en')
# Provenance flags plus the new language label of the first article
df_zh_en = df_zh_en.assign(back_translation=0, train_translation=2, url1_lang='zh')
df_zh_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_en_all.csv', index=False)
print('elapsed time for zh-en', time.time() - start)

In [None]:
# Build zh-zh pairs from the filtered zh-en frame: translate the *2 columns en -> zh-cn,
# then drop rows whose detected languages are not zh-CN / zh-CN
start = time.time()
df_zh_zh = df_zh_en.copy()
translateOnePair(df_zh_zh, src_lang='en', dest_lang='zh-cn', first=False)
filterUntranslated(df_zh_zh, text1_lang='zh-CN', text2_lang='zh-CN')
# Provenance flags plus the new language label of the second article
df_zh_zh = df_zh_zh.assign(back_translation=0, train_translation=3, url2_lang='zh')
df_zh_zh.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_zh_all.csv', index=False)
print('elapsed time for zh-zh', time.time() - start)

In [None]:
# Build es-en pairs from ALL en-en rows: translate the *1 columns en -> es,
# then drop rows whose detected languages are not es / en
start = time.time()
df_es_en = df[df['url1_lang'].eq('en') & df['url2_lang'].eq('en')].copy()
translateOnePair(df_es_en, src_lang='en', dest_lang='es', first=True)
filterUntranslated(df_es_en, text1_lang='es', text2_lang='en')
# Provenance flags plus the new language label of the first article
df_es_en = df_es_en.assign(back_translation=0, train_translation=1, url1_lang='es')
df_es_en.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_en_all.csv', index=False)
print('elapsed time for es-en', time.time() - start)

In [None]:
# Build es-it pairs from the filtered es-en frame: translate the *2 columns en -> it,
# then drop rows whose detected languages are not es / it
start = time.time()
df_es_it = df_es_en.copy()
translateOnePair(df_es_it, src_lang='en', dest_lang='it', first=False)
filterUntranslated(df_es_it, text1_lang='es', text2_lang='it')
# Provenance flags plus the new language label of the second article
df_es_it = df_es_it.assign(back_translation=0, train_translation=2, url2_lang='it')
df_es_it.to_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_it_all.csv', index=False)
print('elapsed time for es-it', time.time() - start)

In [None]:
# Reload the zh-en set built from all en-en rows and count rows per language pair
gen_df_zh_en_all = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_en_all.csv')
gen_df_zh_en_all.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,zh,en,632


In [None]:
# Reload the zh-zh set derived from it and count rows per language pair
gen_df_zh_zh_all = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_zh_zh_all.csv')
gen_df_zh_zh_all.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,zh,zh,192


In [None]:
# Reload the es-en set built from all en-en rows and count rows per language pair
gen_df_es_en_all = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_en_all.csv')
gen_df_es_en_all.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,es,en,509


In [None]:
# Reload the es-it set derived from it and count rows per language pair
gen_df_es_it_all = pd.read_csv(LIBRARY_PATH + 'data/processed/temporal/gen_df_es_it_all.csv')
gen_df_es_it_all.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,es,it,197


Unnamed: 0,url1_lang,url2_lang,0
0,ar,ar,244
1,de,fr,103
2,de,pl,73
3,es,en,287
4,es,es,515
5,es,it,81
6,fr,fr,65
7,fr,pl,62
8,it,it,488
9,pl,en,76


In [None]:
# Second augmented set: remove the earlier zh-en / zh-zh rows, then add the sets
# regenerated from all en-en rows (zh-en, zh-zh) plus the extra es-en / es-it sets.
df_augmented_2 = pd.read_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw.csv')
stale_zh_rows = df_augmented_2['url1_lang'].eq('zh') & df_augmented_2['url2_lang'].isin(['en', 'zh'])
df_augmented_2 = pd.concat(
    [df_augmented_2[~stale_zh_rows], gen_df_zh_en_all, gen_df_zh_zh_all, gen_df_es_en_all, gen_df_es_it_all],
    ignore_index=True,
)
df_augmented_2.to_csv(LIBRARY_PATH + 'data/processed/augmented_paired_Raw_2.csv', index=False)
# Final row counts per language pair
df_augmented_2.groupby(["url1_lang", "url2_lang"]).size().reset_index()

Unnamed: 0,url1_lang,url2_lang,0
0,ar,ar,244
1,de,fr,103
2,de,pl,73
3,es,en,796
4,es,es,515
5,es,it,278
6,fr,fr,65
7,fr,pl,62
8,it,it,488
9,pl,en,76
