# 1 Introduction

## 1.0 Package imports

In [3]:
import numpy as np
import pandas as pd
from googletrans import Translator
import time
import warnings
from pandas_profiling import ProfileReport

# 2 Definitions

## 2.0 Parameter definitions

In [4]:
data_location = '../data/indonesia/groundtruth-indonesia.csv'
data_location_en = '../data/indonesia/groundtruth-indonesia-en.csv'

## 2.1 Function definitions

In [34]:
translator = Translator()

def translate_worker(string: str, translator: 'Translator' = translator) -> str:
    if string == 'nan' or pd.isna(string):
        return string
    
    language = translator.detect(string)
    if language.lang != 'en':
        time.sleep(3)
        string_trans = translator.translate(string)
        if string_trans.src in ['id', 'ms', 'jw', 'su', 'gu']:
        #assert (string.src in ['id', 'ms', 'jw', 'su', 'gu']), f'Incorrect input language of {string.src}'
            string = string_trans.text
        else:
            warnings.warn(f'Incorrect language of {string_trans.src}')
    return string
    

# 3 Execution

## 3.1 Translate Indonesian to English

In [4]:
data = pd.read_csv(data_location)
cols_to_drop = [x for x in data.columns if 'Unnamed' in x]
data = data.drop(cols_to_drop, axis = 1)
data.columns = ['Publication date', 'Month', 'Year', 'Headline',
                'Media', 'Tone', 'Province',  'Disputing Parties',
                'Issue', 'Similar', 'Summary']

In [6]:
party_one = []
party_two = []
for row in range(0, len(data)):
    src = data['Disputing Parties'][row]
    if not pd.isna(src):
        if '\n' in src:
            parties = src.split('\n')
        elif '2' in src:
            parties = src.split('2')
        if len(parties) == 2:
            party_one.append(parties[0].replace("1. ", ""))
            party_two.append(parties[1].replace("2. ", ""))
        else:
            party_one.append(parties[0].replace("1. ", ""))
            party_two.append(np.nan)
    else:
        party_one.append(np.nan)
        party_two.append(np.nan)

data['Party One'] = party_one
data['Party Two'] = party_two

data = data.drop(['Disputing Parties'], axis = 1)
data.head(3)

Unnamed: 0,Publication date,Month,Year,Headline,Media,Tone,Province,Issue,Similar,Summary,Party One,Party Two
0,13 Apr 2017,Apr,2017,FSC conditionally approves plan to end suspens...,Eco-business.com,Negatif,Sumatera Utara,Perambahan hutan,APP,Satu dekade setelah Forest Stewardship Council...,PT APP,Aktivis lingkungan
1,09 Jun 2017,Jun,2017,'Give us back our land': paper giants struggle...,Mongabay.com,Negatif,Jambi,Alih fungsi lahan,APP,Rainforest Action Network (RAN) merilis platfo...,Asia Pulp and Paper,. Toba Pulp Lestari
2,10 Oct 2017,Oct,2017,Indonesia dijual : Mengungkap relasi tersembun...,Mongabay.co.id,Negatif,Indonesia,Lahan sawit,"Investigasi ""Indonesia Dijual""","Dalam seri investigasi “Indonesia Dijual”, “Mo...",Perusahaan kelapa sawit,Masyarakat


In [27]:
data_eng = pd.read_csv(data_location_en)
cols_to_translate = ['Headline', 'Issue', 'Similar', 'Summary', 'Party One', 'Party Two']

In [None]:
for col in cols_to_translate[5:6]:
    print(col)
    for row in range(0, len(data_eng)):
        print(row)
        text = translate_worker(data_eng[col][row])
        print(f'Translated {data_eng[col][row]} to {text} for column {col}, row {row}')
        print('\n')
        data_eng[col][row] = text

In [49]:
data_eng.to_csv("../data/indonesia/groundtruth-indonesia-en.csv", index = False)

# Visualize results

In [5]:
data_eng = pd.read_csv(data_location_en)
profile = ProfileReport(data_eng)

In [13]:
#profile.to_widgets()