The best dataset I've found so far contains conversational sentences from film and series subtitles with translations for multiple languages:

https://opus.nlpl.eu/OpenSubtitles-v2018.php
https://github.com/PolyAI-LDN/conversational-datasets

In [1]:
import os
import constants
import pandas as pd

In [2]:
constants.language_code = 'ru'

In [3]:
filepath_en = f"../input_files/{constants.language_code}/open_subtitles/OpenSubtitles_en-{constants.language_code}.en"
filepath_lang = f"../input_files/{constants.language_code}/open_subtitles/OpenSubtitles_en-{constants.language_code}.{constants.language_code}"

en_series = pd.read_csv(filepath_en, sep='\\t')
lang_series = pd.read_csv(filepath_lang, sep='\\t')

  en_series = pd.read_csv(filepath_en, sep='\\t')
  lang_series = pd.read_csv(filepath_lang, sep='\\t')


In [4]:
lang_series.head()

Unnamed: 0,"Дети могут достать во дворе почти всё что угодно до тех пор, пока могут себе это позволить."
0,Всё имеет свою цену.
1,"Эй, Ник."
2,У кого-нибудь есть клубничное молоко?
3,"Мы знаем, у кого есть клубничное молоко?"
4,"Да, думаю у Донни немножко есть."


In [5]:
print(len(lang_series) == len(en_series))

True


In [6]:
# Combine into a dataframe and randomly sample n rows
n = 3_000_000

lang_series = lang_series.reset_index(drop=True)
en_series = en_series.reset_index(drop=True)

lang_series_list = lang_series.values.tolist()
en_series_list = en_series.values.tolist()

# Flatten the lists
lang_series_list = [item[0] for item in lang_series_list]
en_series_list = [item[0] for item in en_series_list]


df = pd.DataFrame({
    'sentence': lang_series_list,
    'translation': en_series_list
})

# First 100 or so lines are in English for some reason
#df = df[df.index > 100]

df_sample = df.sample(n, random_state=1)
df_sample.to_csv(f'../input_files/{constants.language_code}/open_subtitles_uncleaned_sentences.csv', sep='\t')

In [7]:
# OLD DATASET
"""
filepath = os.path.join(f"../input_files/{constants.language_code}", "uncleaned_sentences.csv")
df = pd.read_csv(filepath, delimiter='\t', header=None)
df.columns = ["id", "sentence"]
"""

filepath = os.path.join(f"../input_files/{constants.language_code}", "open_subtitles_uncleaned_sentences.csv")
df = pd.read_csv(filepath, delimiter='\t', header=None)
df.columns = ["id", "sentence", "translated_sentence"]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000001 entries, 0 to 3000000
Data columns (total 3 columns):
 #   Column               Dtype  
---  ------               -----  
 0   id                   float64
 1   sentence             object 
 2   translated_sentence  object 
dtypes: float64(1), object(2)
memory usage: 68.7+ MB


In [9]:
df.head()

Unnamed: 0,id,sentence,translated_sentence
0,,sentence,translation
1,24166483.0,- Ты мне тоже не доверяешь.,"Makes two of us, 'cause you don't trust me."
2,3434427.0,В пятом классе она уже курила.,She was already smoking in fifth grade.
3,23423900.0,"Ну, знаешь, как это обычно с ней бывает.",You know how it is with her.
4,12658818.0,Так что вам срочно нужно начать химиотерапию. ...,"So, yeah, we're gonna wanna start a round of c..."


In [10]:
# Let's see if there are any duplicates in the dataset
df[df["sentence"].duplicated(keep=False)].sort_values("sentence").head(8)

Unnamed: 0,id,sentence,translated_sentence
1557786,21923185.0,!,!
2956786,7699210.0,!,!
269284,7636048.0,!,What?
1661043,22867940.0,!,!
1932077,17262288.0,!,!
2762898,6410406.0,!,!
2762930,19865253.0,!,!
1159202,3772628.0,!,!


In [11]:
df.dtypes

id                     float64
sentence                object
translated_sentence     object
dtype: object

In [12]:
# Remove all duplicates from the dataframe
df = df.drop_duplicates("sentence")

In [13]:
lengths: pd.Series = df['sentence'].str.len()
max_characters: int = lengths.max()
max_index = lengths.idxmax()

# Find the sentence with the most characters to see if there are any delimitation issues.
print(f'Longest sentence: {max_characters} characters')
print(df[df.index == max_index]['sentence'].values[0][:600]) # Print out first 600 characters

Longest sentence: 2443 characters
Comment: 0,0:02:15.95,0:02:17.85,Gold Jive-Silver Ocean,0,0,0,gradient @739 0,{\alphaHFF\t(0,160,\alphaH00)\t(1450,1610,\alphaHFF)}1{\alphaHFF\t(17,177,\alphaH00)\t(1467,1627,\alphaHFF)},{\alphaHFF\t(34,194,\alphaH00)\t(1484,1644,\alphaHFF)}2{\alphaHFF\t(51,211,\alphaH00)\t(1501,1661,\alphaHFF)},{\alphaHFF\t(68,228,\alphaH00)\t(1518,1678,\alphaHFF)}3 {\alphaHFF\t(85,245,\alphaH00)\t(1535,1695,\alphaHFF)}— {\alphaHFF\t(102,262,\alphaH00)\t(1552,1712,\alphaHFF)}н{\alphaHFF\t(119,279,\alphaH00)\t(1569,1729,\alphaHFF)}а{\alphaHFF\t(136,296,\alphaH00)\t(1586,1746,\alphaHFF)}ч{\alphaHFF\t(153,313,\a


In [14]:
# Cut off any sentences longer than 200 or 
# shorter than 30 characters
df = df[
    (df['sentence'].str.len() < 200)
    & (df['sentence'].str.len() > 30)
     ]


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1125846 entries, 3 to 2999994
Data columns (total 3 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   id                   1125846 non-null  float64
 1   sentence             1125846 non-null  object 
 2   translated_sentence  1125846 non-null  object 
dtypes: float64(1), object(2)
memory usage: 34.4+ MB


In [16]:
# Randomly sample n rows to get a reduced dataset for easier training while testing out this method. Set a seed for reproducability.
#n_rows = 30000

#reduced_df = df.sample(n=n_rows, random_state=1)

In [18]:
# Remove id column and save dataframes as csv
df.to_csv(f"../output_files/{constants.language_code}/step0_sentences.csv", sep='\t', index=False)
#reduced_df.to_csv("./french_sentences_reduced.csv", sep='\t', index=False)