# Sentence data preprocessing

In [None]:
# !pip install pandas

In [2]:
import pandas as pd

## 1. Select sentences in English that have audio

This section of the data preprocessing focuses on selecting English sentences that have an existing audio file.

In [3]:
# Column names supplied explicitly for the two tab-separated input files
english_columns = ['id', 'lang', 'text', 'date_last_modified']
audio_columns = ['id', 'audio_id', 'username', 'license', 'attribution_url']

# Sentences in English
english_sen = pd.read_csv('datasets/eng_sentences_CC0.tsv', sep='\t', names=english_columns)

# Sentences in all languages that have audio
audio_sen = pd.read_csv('datasets/sentences_with_audio.csv', sep='\t', names=audio_columns)

In [33]:
# Preview the first three rows of the English sentences
english_sen.head(3)

Unnamed: 0,id,lang,text,date_last_modified
0,330998,eng,Children who spend more time outdoors have a l...,2019-01-12 19:39:42
1,331000,eng,The idea that reading makes you short-sighted ...,2019-01-12 19:39:42
2,331259,eng,Most people think computers will never be able...,2019-01-12 19:39:42


In [39]:
# Print every cell of the English sentences as "<column> <value>" lines,
# one row after another.
# NOTE: this walks the whole frame; slice it first (e.g. english_sen.head(3))
# when only a short preview is wanted.
columns = english_sen.columns.tolist()
for row in english_sen.values.tolist():
    # Keep the row and cell names distinct so the inner loop never rebinds
    # the outer loop variable.
    for key, cell in zip(columns, row):
        print(key, cell)

id 330998
lang eng
text Children who spend more time outdoors have a lower risk of myopia.
date_last_modified 2019-01-12 19:39:42
id 331000
lang eng
text The idea that reading makes you short-sighted has been popular for a couple of hundred years.
date_last_modified 2019-01-12 19:39:42
id 331259
lang eng
text Most people think computers will never be able to think.
date_last_modified 2019-01-12 19:39:42
id 332331
lang eng
text It is more time-efficient to do several tasks sequentially than attempt to do them simultaneously.
date_last_modified 2019-01-12 19:39:42
id 334553
lang eng
text I should stop procrastinating.
date_last_modified 2019-01-12 19:39:43
id 336509
lang eng
text I know you think you understood what you thought I said, but I'm not sure you realized that what you heard is not what I meant.
date_last_modified 2019-01-12 19:39:43
id 337215
lang eng
text How can video games inspire us to make better applications for e-learning?
date_last_modified 2019-01-12 19:39:43
id 38401

In [31]:
# Preview the first three rows of the sentences with audio
audio_sen.head(3)

Unnamed: 0,id,audio_id,username,license,attribution_url
0,61,1,fucongcong,,
1,68,2,fucongcong,,
2,78,754915,mramosch,,


In [32]:
# Keep only the audio entries whose license field is filled in
has_license = audio_sen['license'].notna()
audio_sen = audio_sen.loc[has_license]

In [33]:
# Collect the ids of the English sentences into a plain list
id_english = english_sen['id'].tolist()
print(id_english[0:5])

[330998, 331000, 331259, 332331, 334553]


In [34]:
# Restrict the audio entries to those whose sentence id is an English sentence id
is_english = audio_sen['id'].isin(id_english)
eng_audio_sen = audio_sen.loc[is_english]

In [35]:
# Preview the first three English sentences that have audio
eng_audio_sen.head(3)

Unnamed: 0,id,audio_id,username,license,attribution_url
59840,331259,800678,CK,CC BY-NC-ND 3.0,http://www.manythings.org/tatoeba
61685,334553,27179,CK,CC BY-NC-ND 3.0,http://www.manythings.org/tatoeba
72813,403859,1123747,Them,CC BY 4.0,


## 2. Find Spanish translations of English sentences

Once only the English sentences with valid audio files have been selected, the goal of this step is to filter the file of English sentences and their Spanish translations, keeping only the sentences that have audio files.

In [36]:
# Load the English sentences together with their Spanish translations;
# only the first four columns of the file are read.
filename = 'Sentence pairs in English-Spanish.tsv'
pair_columns = ['eng_id', 'eng_text', 'spa_id', 'spa_text']
engspa_trans = pd.read_csv(f'datasets/{filename}', sep='\t', usecols=range(4), names=pair_columns)

In [37]:
# Ids of the English sentences that have audio, as a plain list
eng_audio_ids = eng_audio_sen['id'].tolist()

In [38]:
# Keep only the translation pairs whose English sentence also has audio
has_audio = engspa_trans['eng_id'].isin(eng_audio_ids)
engspa_trans = engspa_trans.loc[has_audio]

In [39]:
# Preview the first three translation pairs that have audio
engspa_trans.head(3)

Unnamed: 0,eng_id,eng_text,spa_id,spa_text
60223,403859,"If I could rearrange the alphabet, I would put...",690143,"Si pudiera reordenar el alfabeto, pondría la T..."
60224,403860,I'm not good at multitasking.,1612871,No soy bueno para hacer varias cosas a la vez.
60786,414272,Any teacher that can be replaced by a machine ...,627877,Cualquier profesor que pueda ser reemplazado p...


In [40]:
# Drop entries with a duplicated English id, keeping the first occurrence of each
engspa_trans = engspa_trans.drop_duplicates(subset='eng_id', keep='first')

In [41]:
# Order the translation pairs by English sentence id
engspa_trans = engspa_trans.sort_values('eng_id')

In [42]:
# English ids of the remaining translation pairs, and how many there are
engspa_trans_ids = engspa_trans['eng_id'].tolist()
print(len(engspa_trans_ids))

632


In [43]:
# Audio entries of the English sentences that have a translation
is_translated = eng_audio_sen['id'].isin(engspa_trans_ids)
engspa_audios = eng_audio_sen.loc[is_translated]

In [44]:
# Drop audio entries with a duplicated English id, keeping the first
# occurrence of each, then show how many entries remain
engspa_audios = engspa_audios.drop_duplicates(subset='id', keep='first')
len(engspa_audios)

632

In [45]:
# Order the audio entries by English sentence id
engspa_audios = engspa_audios.sort_values('id')

In [46]:
# Create a new dataframe with sentence, translation and audio id.
# The audio rows are joined to the translations on the English sentence id,
# so each audio_id is matched to its sentence by key rather than by row position.
clean_sentences = (
    engspa_trans[['eng_id', 'eng_text', 'spa_id', 'spa_text']]
    .merge(
        engspa_audios[['id', 'audio_id']],
        left_on='eng_id',
        right_on='id',
        how='inner',
        validate='one_to_one',  # raises MergeError if either side repeats an id
    )
    .drop(columns='id')  # same value as eng_id after the join
    .rename(columns={'eng_text': 'eng_sentence', 'spa_text': 'spa_sen'})
)

# An inner join silently drops unmatched rows, so check that every
# translated sentence found its audio entry.
if len(clean_sentences) != len(engspa_trans):
    raise ValueError(
        f'{len(engspa_trans) - len(clean_sentences)} translated sentences have no audio entry'
    )

In [49]:
clean_sentences.head()

Unnamed: 0,eng_id,eng_sentence,spa_id,spa_sen,audio_id
0,403859,"If I could rearrange the alphabet, I would put...",690143,"Si pudiera reordenar el alfabeto, pondría la T...",1123747
1,403860,I'm not good at multitasking.,1612871,No soy bueno para hacer varias cosas a la vez.,32210
2,414272,Any teacher that can be replaced by a machine ...,627877,Cualquier profesor que pueda ser reemplazado p...,911934
3,618394,No words can express how amazing you are.,1011397,No existen palabras para expresar lo increíble...,906756
4,618396,It's rare to meet nice people like you.,1011395,Es difícil conocer a gente tan agradable como tú.,906757


In [48]:
# Write the final table to disk without the index column
output_path = 'datasets/eng_spa_audio_sentences.csv'
clean_sentences.to_csv(output_path, index=False)