The best dataset I've found so far contains conversational sentences from film and series subtitles with translations for multiple languages:

https://opus.nlpl.eu/OpenSubtitles-v2018.php
https://github.com/PolyAI-LDN/conversational-datasets

In [1]:
import os
import constants
import pandas as pd

In [2]:
constants.language_code = 'th'

In [4]:
filepath_en = f"../input_files/{constants.language_code}/open_subtitles/OpenSubtitles_en-{constants.language_code}.en"
filepath_lang = f"../input_files/{constants.language_code}/open_subtitles/OpenSubtitles_en-{constants.language_code}.{constants.language_code}"

en_series = pd.read_csv(filepath_en, sep='\\t')
lang_series = pd.read_csv(filepath_lang, sep='\\t')

  en_series = pd.read_csv(filepath_en, sep='\\t')
  lang_series = pd.read_csv(filepath_lang, sep='\\t')


In [5]:
lang_series.head()

Unnamed: 0,"ทาสในกระจกวิเศษ, มาจากพื้นที่ที่ไกลที่สุด"
0,ผ่านลมและความมืดฉันเรียกเจ้า
1,พูด!
2,ให้ฉันเห็นพระพักตร์ของ พระองค์
3,สิ่งที่เจ้าจะรู้ว่าสมเด็จพระราชินี ของฉันได้อย...
4,กระจกวิเศษบนผนัง ผู้ที่เป็นสังขารหนึ่งทั้งหมด ...


In [7]:
print(len(lang_series) == len(en_series))
print(f'Number of entries: {len(lang_series)}')

True
Number of entries: 3281532


In [8]:
# Combine into a dataframe and randomly sample n rows
n = 3_000_000

lang_series = lang_series.reset_index(drop=True)
en_series = en_series.reset_index(drop=True)

lang_series_list = lang_series.values.tolist()
en_series_list = en_series.values.tolist()

# Flatten the lists
lang_series_list = [item[0] for item in lang_series_list]
en_series_list = [item[0] for item in en_series_list]


df = pd.DataFrame({
    'sentence': lang_series_list,
    'translation': en_series_list
})

# First 100 or so lines are in English for some reason
#df = df[df.index > 100]

df_sample = df.sample(n, random_state=1)
df_sample.to_csv(f'../input_files/{constants.language_code}/open_subtitles_uncleaned_sentences.csv', sep='\t')

In [9]:
# OLD DATASET
"""
filepath = os.path.join(f"../input_files/{constants.language_code}", "uncleaned_sentences.csv")
df = pd.read_csv(filepath, delimiter='\t', header=None)
df.columns = ["id", "sentence"]
"""

filepath = os.path.join(f"../input_files/{constants.language_code}", "open_subtitles_uncleaned_sentences.csv")
df = pd.read_csv(filepath, delimiter='\t', header=None)
df.columns = ["id", "sentence", "translated_sentence"]

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000001 entries, 0 to 3000000
Data columns (total 3 columns):
 #   Column               Dtype  
---  ------               -----  
 0   id                   float64
 1   sentence             object 
 2   translated_sentence  object 
dtypes: float64(1), object(2)
memory usage: 68.7+ MB


In [11]:
df.head()

Unnamed: 0,id,sentence,translated_sentence
0,,sentence,translation
1,1116037.0,อะไร?,What?
2,220465.0,นี่ค่ะที่รัก ให้หลับได้ง่ายบนเครื่องค่ะ,Here. Sweetie. Just take one of these to help ...
3,848749.0,- ว่า...,- And?
4,2248740.0,ช่างเป็นสุภาพสตรีจริงๆ,You are a classy lady.


In [12]:
# Let's see if there are any duplicates in the dataset
df[df["sentence"].duplicated(keep=False)].sort_values("sentence").head(8)

Unnamed: 0,id,sentence,translated_sentence
144251,2819307.0,!,!
2050728,3028639.0,!,You want to contaminate the whole damn sample?
1073639,352802.0,!,!
763706,2224500.0,!,!
1662933,352471.0,!,!
1269072,2327854.0,!,Please just talk to me.
2046295,352695.0,!,!
2044669,970718.0,!,!


In [13]:
df.dtypes

id                     float64
sentence                object
translated_sentence     object
dtype: object

In [14]:
# Remove all duplicates from the dataframe
df = df.drop_duplicates("sentence")

In [15]:
lengths: pd.Series = df['sentence'].str.len()
max_characters: int = lengths.max()
max_index = lengths.idxmax()

# Find the sentence with the most characters to see if there are any delimitation issues.
print(f'Longest sentence: {max_characters} characters')
print(df[df.index == max_index]['sentence'].values[0][:600]) # Print out first 600 characters

Longest sentence: 436 characters
เธกเธตเธญเธธเธเธฑเธเธดเนเธซเธเธธเนเธฅเนเธเนเธเนเธญเธขเน เธเธญเธเธเธตเนเนเธฃเธฒเนเธเธชเธญเธเนเธเธกเธเธฃเธฑเนเธเธฅเนเธฒเธชเธธเ เธเธกเนเธกเนเธญเธขเธฒเธเธเธฐเนเธฃเธตเธขเธเธกเธฑเธเธงเนเธฒ เธเนเธงเธเนเธงเธฅเธฒเธเธตเนเธกเธตเธเธงเธฒเธกเนเธเนเธเธเนเธงเธขเธเธฒเ เธญเธขเนเธฒเธเธเธฑเธเธเธเธเธตเนเธเธนเธเธชเธฐเธเธเธเธดเธ เนเธซเนเนเธเธญเธขเธนเนเนเธเธฅเนเธเธญเธเธซเธเนเธฒเธเธฒ


In [16]:
# Cut off any sentences longer than 200 or 
# shorter than 30 characters
df = df[
    (df['sentence'].str.len() < 200)
    & (df['sentence'].str.len() > 30)
     ]


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1085042 entries, 2 to 3000000
Data columns (total 3 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   id                   1085042 non-null  float64
 1   sentence             1085042 non-null  object 
 2   translated_sentence  1085042 non-null  object 
dtypes: float64(1), object(2)
memory usage: 33.1+ MB


In [16]:
# Randomly sample n rows to get a reduced dataset for easier training while testing out this method. Set a seed for reproducability.
#n_rows = 30000

#reduced_df = df.sample(n=n_rows, random_state=1)

In [18]:
# Remove id column and save dataframes as csv
df.to_csv(f"../output_files/{constants.language_code}/step0_sentences.csv", sep='\t', index=False)
#reduced_df.to_csv("./french_sentences_reduced.csv", sep='\t', index=False)