In [None]:
# !pip install google_trans_new
!pip install googletrans==3.1.0a0

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from googletrans import Translator
# from google_trans_new import google_translator 
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Read the competition's training and test sets into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
)

# Preview the first rows of the training data.
train_df.head()

In [None]:
# Location of a previously saved train/validation split.
train_df_sp_path = '../input/train-val-split/train_df_sp.csv'
validation_df_sp_path = '../input/train-val-split/validation_df_sp.csv'

# Check that the files actually exist on disk. Testing the path strings
# themselves is always truthy, which would make the "create" branch below
# unreachable and crash in read_csv whenever the saved split is missing.
if os.path.exists(train_df_sp_path) and os.path.exists(validation_df_sp_path):
    # Use the same train-val split to back translate input training data
    print("Loading training and validation split csv files...")
    train_df_sp = pd.read_csv(train_df_sp_path)
    validation_df_sp = pd.read_csv(validation_df_sp_path)
else:
    # Create train-val split data and save as csv file
    print("Creating training and validation split csv files...")
    # Stratify ensures that each sub-set contains approximately the same
    # percentage of samples of each target class as the original set.
    # The fixed random_state makes the split reproducible across runs.
    train_df_sp, validation_df_sp = train_test_split(
        train_df,
        stratify=train_df.label.values,
        random_state=42,
        test_size=0.2,
        shuffle=True,
    )

    # Give both subsets a fresh 0..n-1 index (new frames, no in-place mutation).
    train_df_sp = train_df_sp.reset_index(drop=True)
    validation_df_sp = validation_df_sp.reset_index(drop=True)

    # Save the split to the current working directory.
    train_df_sp.to_csv('train_df_sp.csv', index=False)
    validation_df_sp.to_csv('validation_df_sp.csv', index=False)


# check the number of rows and columns in the subsets after split
print("Training data shape after split: {}".format(train_df_sp.shape))
print("Validation data shape after split: {}".format(validation_df_sp.shape))

In [None]:
def back_translate(train_df, target_lang='fr'):
    """Back translate the premise and hypothesis of every row via ``target_lang``.

    Each sentence is first translated into ``target_lang`` and the result is
    then translated back into the row's own language, which is read from the
    row's ``lang_abv`` value.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with ``premise``, ``hypothesis`` and ``lang_abv`` columns. Any
        index is accepted; the frame itself is not modified.
    target_lang : str, default 'fr'
        Language code used as the intermediate language. ``'zh'`` is mapped
        to ``'zh-cn'``, the same mapping that is applied to ``lang_abv``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``train_df`` with a fresh 0..n-1 index whose ``premise`` and
        ``hypothesis`` columns hold the back-translated text. All other
        columns are unchanged. An empty input yields an empty frame.
    """
    print("Back translating training input for target language: {}".format(target_lang))

    limit_before_timeout = 100  # number of rows processed between two pauses
    timeout = 5                 # length of each pause in seconds

    def normalise_lang(lang):
        # 'zh' not in googletrans.LANGUAGES; 'zh-cn' is used instead.
        return 'zh-cn' if lang == 'zh' else lang

    pivot_lang = normalise_lang(target_lang)
    translator = Translator()

    def round_trip(text, source_lang):
        # Translate into the pivot language, then back into the source language.
        forward = translator.translate(text, dest=pivot_lang)
        backward = translator.translate(forward.text, dest=source_lang)
        return backward.text

    # reset_index returns a new frame, so the caller's data is left untouched
    # and rows can be addressed by position whatever index the input carried.
    train_bt = train_df.reset_index(drop=True)
    n_rows = len(train_bt)

    premises = train_bt['premise'].tolist()
    hypotheses = train_bt['hypothesis'].tolist()
    source_langs = [normalise_lang(lang) for lang in train_bt['lang_abv'].tolist()]

    bt_premises = []
    bt_hypotheses = []
    for i in tqdm(range(n_rows)):
        if (i != 0) and (i % limit_before_timeout == 0):
            # apply timeout after every `limit_before_timeout` iterations
            print('Iteration {} of {}'.format(i, n_rows))
            time.sleep(timeout)

        bt_premises.append(round_trip(premises[i], source_langs[i]))
        bt_hypotheses.append(round_trip(hypotheses[i], source_langs[i]))

    # Write each column once instead of building one DataFrame per row.
    train_bt['premise'] = bt_premises
    train_bt['hypothesis'] = bt_hypotheses
    return train_bt

In [None]:
%%time
# Language codes present in the training split; 'zh' is replaced by 'zh-cn',
# the code that is passed to the translator for Chinese.
lang_abvs = train_df_sp['lang_abv'].unique().tolist()
lang_abvs = ['zh-cn' if lang == 'zh' else lang for lang in lang_abvs]

MAX_ROWS_PER_LANGUAGE = 1000  # cap on sampled rows per value of `language`
SAMPLING_SEED = 42            # base seed so the sampled rows are reproducible

for lang_idx, lang in enumerate(lang_abvs):
    # Sample input training data to back translate: shuffle the whole split,
    # then keep the first MAX_ROWS_PER_LANGUAGE rows of every language. This
    # is a random sample of at most that many rows per language, already in
    # shuffled order. The seed differs per target language, so each language
    # still receives its own sample, but the same one on every run.
    train_sampled = (
        train_df_sp
        .sample(frac=1, random_state=SAMPLING_SEED + lang_idx)
        .groupby('language', sort=False)
        .head(MAX_ROWS_PER_LANGUAGE)
        .reset_index(drop=True)
    )
    back_trans_df = back_translate(train_sampled, target_lang=lang)
    print("Shape of input data with back translation for {}: {}".format(lang, back_trans_df.shape))
    back_trans_df.to_csv('back_translations_{}.csv'.format(lang), index=False)