In [1]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.0/770.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.10.31


In [1]:
import pandas as pd
import seaborn as sns
import glob
import re
import math
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [4]:
# Collect the raw corpus files of each language. Both lists are sorted so that
# the i-th German file and the i-th French file belong to the same corpus.
de_files = sorted(glob.glob("de-fr/raw/*.de"))
fr_files = sorted(glob.glob("de-fr/raw/*.fr"))

# Fail early when a corpus is missing one of its two language files: the lines
# are later paired purely by position, so an unpaired file could misalign them.
de_stems = [path.rsplit(".", 1)[0] for path in de_files]
fr_stems = [path.rsplit(".", 1)[0] for path in fr_files]
assert de_stems == fr_stems, (
    f"unpaired corpus files: {sorted(set(de_stems) ^ set(fr_stems))}"
)

In [5]:
# Show the discovered file lists, German first, one list per output line.
print(de_files, fr_files, sep="\n")

['de-fr/raw/Tatoeba.de-fr.de', 'de-fr/raw/wikimedia.de-fr.de']
['de-fr/raw/Tatoeba.de-fr.fr', 'de-fr/raw/wikimedia.de-fr.fr']


In [6]:
# read files and split into lines


def _read_corpus_lines(paths):
    """Return every line of the given files as one flat list.

    Files are read in the order given. Trailing spaces and the newline are
    stripped from each line; blank lines are kept as empty strings so line
    positions stay comparable between the two languages.

    Args:
        paths: Iterable of text file paths.

    Returns:
        List of str, one entry per line across all files.
    """
    lines = []
    for path in paths:
        # Explicit UTF-8: the text contains umlauts/accents, and the default
        # encoding of open() depends on the platform locale.
        with open(path, encoding="utf-8") as f:
            # Iterating the handle streams the file instead of loading it
            # twice (once via readlines(), once into the result list).
            lines.extend(line.rstrip(" \n") for line in f)
    return lines


de_lines = _read_corpus_lines(de_files)
fr_lines = _read_corpus_lines(fr_files)

In [7]:
# Line counts per language (German first); the two numbers should be equal.
print(len(de_lines), len(fr_lines), sep="\n")

185577
185577


In [8]:
# Spot-check one arbitrary position: the German and French lines at the same
# index should be translations of each other.
sample_index = 42208
for lines in (de_lines, fr_lines):
    print(lines[sample_index])

Erzähl meinem jungen Freund doch eine schöne Geschichte.
Raconte donc une belle histoire à mon jeune ami.


In [9]:
# Build the parallel frame in one step: one column per language, rows paired
# by line position.
de_fr_df = pd.DataFrame({"de": de_lines, "fr": fr_lines})
de_fr_df.head()

Unnamed: 0,de,fr
0,Lass uns etwas versuchen!,Essayons quelque chose !
1,Lass uns etwas versuchen!,Tentons quelque chose !
2,Ich muss schlafen gehen.,Je dois aller dormir.
3,Was ist das?,Qu'est-ce que c'est ?
4,Was ist das?,C'est quoi ?


In [10]:
def preprocess(text):
    """Normalize one raw sentence before truncation and tokenization.

    Steps, in order: lowercase; drop parenthesized and bracketed spans; drop
    quotation marks and angle brackets; drop http(s) URLs; drop the symbols
    ``* + @ # : ;``; drop any leftover unmatched parentheses; collapse runs of
    spaces into a single space and strip spaces from both ends.

    Args:
        text: Value to clean. It is passed through ``str()`` first, so a
            non-string value is cleaned as its string form.

    Returns:
        The cleaned, lowercased string.
    """
    text = str(text).lower()

    # remove parenthesized texts (non-greedy: each pair is removed on its own)
    text = re.sub(r"\(.*?\)", "", text)

    # remove bracketed texts
    text = re.sub(r"\[.*?\]", "", text)

    # remove quotation marks and angle brackets
    text = re.sub(r'[<>"“”„»«]', "", text)

    # remove http websites: everything up to the next whitespace, so digits,
    # hyphens, '&' and '%' inside the URL do not leave fragments behind
    text = re.sub(r"https?://\S*", "", text)

    # remove other symbols
    text = re.sub(r"[*+@#:;]", "", text)

    # remove parentheses that had no matching partner
    text = text.replace("(", "").replace(")", "")

    # collapse runs of spaces left by the removals into ONE space; replacing
    # them with "" would glue the neighbouring words together
    text = re.sub(r" {2,}", " ", text).strip(" ")

    return text

In [11]:
# Run the same normalization over both language columns.
for lang in ("de", "fr"):
    de_fr_df[lang] = de_fr_df[lang].apply(preprocess)

In [12]:
def smart_truncate(content, length = 90, suffix = '.'):
    """Shorten ``content`` to about ``length`` characters at a word boundary.

    Strings of at most ``length`` characters are returned unchanged. Longer
    strings are cut at the last space within the first ``length + 1``
    characters and ``suffix`` is appended. If that window contains no usable
    word boundary (one very long word, or only leading spaces), the text is
    cut hard at ``length`` characters instead of collapsing to just the suffix.

    Args:
        content: String to shorten.
        length: Maximum number of characters kept before the suffix.
        suffix: String appended to a shortened result.

    Returns:
        ``content`` itself, or the shortened text followed by ``suffix``.
    """
    if len(content) <= length:
        return content

    # One extra character so a word ending exactly at `length` is kept whole.
    window = content[:length + 1]
    # Everything before the last space; trailing spaces are dropped so the
    # suffix attaches directly to the last kept word.
    head = window.rsplit(' ', 1)[0].rstrip(' ') if ' ' in window else ''
    if not head:
        head = content[:length]
    return head + suffix

In [13]:
# Shorten over-long sentences in both language columns (default limit of
# smart_truncate).
for lang in ("de", "fr"):
    de_fr_df[lang] = de_fr_df[lang].apply(smart_truncate)

In [13]:
# Drop rows with missing values, then keep only the first row for each
# distinct German sentence.
de_fr_df = de_fr_df.dropna().drop_duplicates(subset = ["de"])

# remove sequences too short: fewer than 3 characters on either side
too_short = (de_fr_df['de'].map(len) < 3) | (de_fr_df['fr'].map(len) < 3)
de_fr_df = de_fr_df.drop(de_fr_df[too_short].index)

In [28]:
# Number of sentence pairs left after filtering.
print(de_fr_df.shape[0])

165778


In [26]:
def nltk_tokenize(text, language = "english"):
    """Tokenize ``text`` with NLTK and return the tokens joined by spaces.

    A "." token is appended when the last token still contains a letter or
    digit, i.e. when the sentence does not already end in punctuation. Checking
    the token's characters (rather than its length) means a final one-letter
    word or digit still gets a period, and a final "..." does not get a second
    one.

    Args:
        text: Sentence to tokenize.
        language: Language name passed to ``word_tokenize``. The default keeps
            the behaviour of calling ``word_tokenize(text)`` with no language.

    Returns:
        Space-separated tokens, or "" when the text yields no tokens.
    """
    tokens = word_tokenize(text, language = language)
    # Empty / whitespace-only input yields no tokens; tokens[-1] would raise.
    if not tokens:
        return ""
    if any(ch.isalnum() for ch in tokens[-1]):
        tokens.append(".")
    return " ".join(tokens)

In [29]:
# Tokenize both language columns; each cell becomes a space-separated token string.
for lang in ("de", "fr"):
    de_fr_df[lang] = de_fr_df[lang].apply(nltk_tokenize)

In [30]:
# train test split
# 70 % train / 30 % test. The fixed random_state makes the shuffle repeatable,
# so re-running the notebook writes the same train and test files.

de_fr_train, de_fr_test = train_test_split(de_fr_df, test_size = 0.3, random_state = 42)

In [31]:
# write files
# One sentence per line. Both language files of a split are written from the
# same frame in the same row order, so line i of the .de file and line i of the
# .fr file are the same sentence pair.
# UTF-8 is set explicitly: the text contains umlauts/accents, and the default
# encoding of open() depends on the platform locale.

for split_name, split_df in (("train", de_fr_train), ("test", de_fr_test)):
    for lang in ("de", "fr"):
        with open(f"de-fr/parallel/{split_name}.{lang}", "w", encoding="utf-8") as file:
            file.writelines(line + "\n" for line in split_df[lang])