# 1 Introduction

## 1.0 Package imports

In [1]:
import numpy as np
import pandas as pd
from googletrans import Translator
import time
import warnings
import tabula

# 2 Definitions

## 2.0 Parameter definitions

In [2]:
data_location = '../data/brazil/groundtruth-brazil.csv'
data_location_en = '../data/brazil/groundtruth-brazil-en.csv'
data_location_pdf = '../data/brazil/groundtruth-brazil.pdf'

## 2.1 Function definitions

In [35]:
translator = Translator()

def translate_worker(string: str, translator: 'Translator' = translator) -> str:
    if string == 'nan' or pd.isna(string):
        return string
    
    language = translator.detect(string)
    if language.lang != 'en':
        time.sleep(3)
        string_trans = translator.translate(string)
        if string_trans.src in ['pt', 'es']:
        #assert (string.src in ['id', 'ms', 'jw', 'su', 'gu']), f'Incorrect input language of {string.src}'
            string = string_trans.text
        else:
            warnings.warn(f'Incorrect language of {string_trans.src}')
    return string
    

# 3 Execution

## 3.0 Convert PDF to CSV

In [36]:
df = tabula.read_pdf(data_location_pdf, pages='all', encoding='utf-8')
df_colnames = ['State', 'City', 'Conflict Name', 'Area', 'Date',
               'Families Involved', 'Property Type', 'Jurisdiction', 
               'Families displaced', 'Attempt Threat Expulsion', 'Eviction',
               'Eviction threats', 'Houses Destroyed', 'Land destroyed', 'Belongings Destroyed', 
               'Guns', 'Invasion', 'Result', 'Cause', 'Type of Violence']

cols_to_keep = ['State', 'City', 'Conflict Name', 'Area', 'Date', 'Families Involved', 'Property Type',
                'Jurisdiction', 'Result', 'Cause', 'Type of Violence']

for x in range(len(df)):
    df[x].columns = df_colnames

df = pd.concat(df)
df = df.drop([x for x in df.columns if x not in cols_to_keep], axis = 1)
df = df[df['State'] != 'Estado']
df = df.dropna(thresh = df.shape[1] -3)
df = df.reset_index()
df = df.drop('index', axis = 1)

for column in df.columns:
    for row in range(0, len(df)):
        if isinstance(df[column][row], str):
            df[column][row] = df[column][row].replace('\r', ' ')
            df[column][row] = df[column][row].replace('/', ' ')
        

In [37]:
df.tail(5)

Unnamed: 0,State,City,Conflict Name,Area,Date,Families Involved,Property Type,Jurisdiction,Result,Cause,Type of Violence
903,TO,Porto Nacional,P. A. Retiro Acamp. D. Celso Pereira de Almeida,,21 09 2018,40,Área de assentamento,Assentamen to,Sem Terra,Governo federal,
904,TO,Porto Nacional,Faz. Chianini Acamp. Marielle Vive,,31 07 2018,20,Pública Grilad a,Litígio,Sem Terra,Grileiro,Ameaça de Morte
905,TO,Santa Tereza do Tocantins,Comunidade Quilombola Barra do Aroeira,,31 12 2018,174,Área quilombola,Certificada,Quilombolas,Fazendeiro,Ameaça de Expropriação
906,TO,São Félix do Tocantins,Comunidade Quilombola do Rio do Prata,,31 12 2018,78,Área quilombola,Em fase de Reconhecim ento,Quilombolas,Fazendeiro,Impedimento de ir e vir
907,TO,Tocantinópolis Maurilândia do Tocantins,T. I. Apinajé Apinayés UHE Serra Quebrada PAC,,14 02 2018,227,Área indígena,Homologada,Indígenas,Fazendeiro,Danos


In [38]:
df.to_csv(data_location, index = False)

## 3.1 Translate Portuguese to English

In [62]:
data = pd.read_csv(data_location_en)
cols_to_translate = ['Property Type', 'Jurisdiction', 
                     'Result', 'Cause', 'Type of Violence'] # 0-1 already done
data.tail(5)

Unnamed: 0,State,City,Conflict Name,Area,Date,Families Involved,Property Type,Jurisdiction,Result,Cause,Type of Violence
903,TO,Porto Nacional,P. A. Retiro Acamp. D. Celso Pereira de Almeida,,21 09 2018,40.0,Nesting Area,Assentamen to,Sem Terra,Governo federal,
904,TO,Porto Nacional,Faz. Chianini Acamp. Marielle Vive,,31 07 2018,20.0,Public Grilad to,Litígio,Sem Terra,Grileiro,Ameaça de Morte
905,TO,Santa Tereza do Tocantins,Comunidade Quilombola Barra do Aroeira,,31 12 2018,174.0,quilombo area,Certificada,Quilombolas,Fazendeiro,Ameaça de Expropriação
906,TO,São Félix do Tocantins,Comunidade Quilombola do Rio do Prata,,31 12 2018,78.0,quilombo area,Em fase de Reconhecim ento,Quilombolas,Fazendeiro,Impedimento de ir e vir
907,TO,Tocantinópolis Maurilândia do Tocantins,T. I. Apinajé Apinayés UHE Serra Quebrada PAC,,14 02 2018,227.0,indigenous area,Homologada,Indígenas,Fazendeiro,Danos


In [59]:
for i in range(len(data)):
    data['Jurisdiction'][i] = data['Jurisdiction'][i].replace(" ento", "ento")
    if data['Jurisdiction'][i].count(" ") == 1:
        data['Jurisdiction'][i] = data['Jurisdiction'][i].replace(" ", "")
    data['Jurisdiction'][i] = data['Jurisdiction'][i].replace("  ç", "ç")
    data['Jurisdiction'][i] = data['Jurisdiction'][i].replace("aT", "a T")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [63]:
for col in cols_to_translate[1:2]:
    print(col)
    for row in range(0, len(data)):
        print(row)
        text = translate_worker(data[col][row])
        print(f'Translated {data[col][row]} to {text} for column {col}, row {row}')
        print('\n')
        data[col][row] = text

Jurisdiction
0
Translated Litígio to litigation for column Jurisdiction, row 0


1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Translated Litígio to litigation for column Jurisdiction, row 1


2
Translated Litígio to litigation for column Jurisdiction, row 2


3
Translated Litígio to litigation for column Jurisdiction, row 3


4
Translated Litígio to litigation for column Jurisdiction, row 4


5
Translated Litígio to litigation for column Jurisdiction, row 5


6
Translated Litígio to litigation for column Jurisdiction, row 6


7
Translated Litígio to litigation for column Jurisdiction, row 7


8
Translated Litígio to litigation for column Jurisdiction, row 8


9
Translated Litígio to litigation for column Jurisdiction, row 9


10
Translated Litígio to litigation for column Jurisdiction, row 10


11
Translated Litígio to litigation for column Jurisdiction, row 11


12
Translated Litígio to litigation for column Jurisdiction, row 12


13
Translated Litígio to litigation for column Jurisdiction, row 13


14
Translated Litígio to litigation for column Jurisdiction, row 14


15
Translated Litígio to litigation for 

Translated Litígio to litigation for column Jurisdiction, row 114


115
Translated Homologada to homologated for column Jurisdiction, row 115


116
Translated Homologada to homologated for column Jurisdiction, row 116


117
Translated Demarcada to demarcated for column Jurisdiction, row 117


118
Translated Homologada to homologated for column Jurisdiction, row 118


119
Translated Homologada to homologated for column Jurisdiction, row 119


120
Translated Litígio to litigation for column Jurisdiction, row 120


121
Translated Litígio to litigation for column Jurisdiction, row 121


122
Translated Homologada to homologated for column Jurisdiction, row 122


123
Translated Sem informação to No information for column Jurisdiction, row 123


124
Translated Litígio to litigation for column Jurisdiction, row 124


125
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 125


126
Translated Em fase de Reconhecim ento to In recognition phase for column J

Translated Litígio to litigation for column Jurisdiction, row 223


224
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 224


225
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 225


226
Translated Litígio to litigation for column Jurisdiction, row 226


227
Translated Litígio to litigation for column Jurisdiction, row 227


228
Translated Litígio to litigation for column Jurisdiction, row 228


229
Translated Litígio to litigation for column Jurisdiction, row 229


230
Translated Litígio to litigation for column Jurisdiction, row 230


231
Translated Delimitada to bounded for column Jurisdiction, row 231


232
Translated Litígio to litigation for column Jurisdiction, row 232


233
Translated Litígio to litigation for column Jurisdiction, row 233


234
Translated Litígio to litigation for column Jurisdiction, row 234


235
Translated Litígio to litigation for column Jurisdiction, row 235


236
Transl

Translated Litígio to litigation for column Jurisdiction, row 332


333
Translated Desapropria da to expropriate the for column Jurisdiction, row 333


334
Translated Assentamen to to Assentamen to for column Jurisdiction, row 334


335
Translated Termo de Autorização de Uso to Terms of Service Authorization for column Jurisdiction, row 335


336
Translated Certificada to certified for column Jurisdiction, row 336


337
Translated Reconhecid a to Reconhecid the for column Jurisdiction, row 337


338
Translated Litígio to litigation for column Jurisdiction, row 338


339
Translated Litígio to litigation for column Jurisdiction, row 339


340
Translated Litígio to litigation for column Jurisdiction, row 340


341
Translated Litígio to litigation for column Jurisdiction, row 341


342
Translated Litígio to litigation for column Jurisdiction, row 342


343
Translated Litígio to litigation for column Jurisdiction, row 343


344
Translated Em fase de Reconhecim ento to In recognition phase f

Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 441


442
Translated Assentamen to to Assentamen to for column Jurisdiction, row 442


443
Translated Homologada to homologated for column Jurisdiction, row 443


444
Translated Homologada to homologated for column Jurisdiction, row 444


445
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 445


446
Translated Assentamen to to Assentamen to for column Jurisdiction, row 446


447
Translated Litígio to litigation for column Jurisdiction, row 447


448
Translated Litígio to litigation for column Jurisdiction, row 448


449
Translated Litígio to litigation for column Jurisdiction, row 449


450
Translated Litígio to litigation for column Jurisdiction, row 450


451
Translated Homologada to homologated for column Jurisdiction, row 451


452
Translated Litígio to litigation for column Jurisdiction, row 452


453
Translated Litígio to litigation for column Juri

Translated Litígio to litigation for column Jurisdiction, row 548


549
Translated Litígio to litigation for column Jurisdiction, row 549


550
Translated Homologada to homologated for column Jurisdiction, row 550


551
Translated Área Titulada to Entitled area for column Jurisdiction, row 551


552
Translated Litígio to litigation for column Jurisdiction, row 552


553
Translated Litígio to litigation for column Jurisdiction, row 553


554
Translated Litígio to litigation for column Jurisdiction, row 554


555
Translated Litígio to litigation for column Jurisdiction, row 555


556
Translated Litígio to litigation for column Jurisdiction, row 556


557
Translated Litígio to litigation for column Jurisdiction, row 557


558
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 558


559
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 559


560
Translated Área Titulada to Entitled area for column Jurisdiction

Translated Litígio to litigation for column Jurisdiction, row 657


658
Translated Litígio to litigation for column Jurisdiction, row 658


659
Translated Litígio to litigation for column Jurisdiction, row 659


660
Translated Litígio to litigation for column Jurisdiction, row 660


661
Translated Litígio to litigation for column Jurisdiction, row 661


662
Translated Litígio to litigation for column Jurisdiction, row 662


663
Translated Litígio to litigation for column Jurisdiction, row 663


664
Translated Litígio to litigation for column Jurisdiction, row 664


665
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 665


666
Translated Litígio to litigation for column Jurisdiction, row 666


667
Translated Litígio to litigation for column Jurisdiction, row 667


668
Translated Litígio to litigation for column Jurisdiction, row 668


669
Translated Litígio to litigation for column Jurisdiction, row 669


670
Translated Litígio to litigation fo

Translated Litígio to litigation for column Jurisdiction, row 767


768
Translated Litígio to litigation for column Jurisdiction, row 768


769
Translated Homologada to homologated for column Jurisdiction, row 769


770
Translated Homologada to homologated for column Jurisdiction, row 770


771
Translated Litígio to litigation for column Jurisdiction, row 771


772
Translated Litígio to litigation for column Jurisdiction, row 772


773
Translated Assentamen to to Assentamen to for column Jurisdiction, row 773


774
Translated Em fase de Reconhecim ento to In recognition phase for column Jurisdiction, row 774


775
Translated Reconhecid a to Reconhecid the for column Jurisdiction, row 775


776
Translated Litígio to litigation for column Jurisdiction, row 776


777
Translated Homologada to homologated for column Jurisdiction, row 777


778
Translated Homologada to homologated for column Jurisdiction, row 778


779
Translated Homologada to homologated for column Jurisdiction, row 779


7

Translated Litígio to litigation for column Jurisdiction, row 877


878
Translated Litígio to litigation for column Jurisdiction, row 878


879
Translated Litígio to litigation for column Jurisdiction, row 879


880
Translated Delimitada to bounded for column Jurisdiction, row 880


881
Translated Certificada to certified for column Jurisdiction, row 881


882
Translated Litígio to litigation for column Jurisdiction, row 882


883
Translated Litígio to litigation for column Jurisdiction, row 883


884
Translated Litígio to litigation for column Jurisdiction, row 884


885
Translated Litígio to litigation for column Jurisdiction, row 885


886
Translated Litígio to litigation for column Jurisdiction, row 886


887
Translated Litígio to litigation for column Jurisdiction, row 887


888
Translated Assentamen to to Assentamen to for column Jurisdiction, row 888


889
Translated Litígio to litigation for column Jurisdiction, row 889


890
Translated Delimitada to bounded for column Jurisdic

In [None]:
data.to_csv(data_location_en, index = False)