Now I have a huge database of sentences (All under 200 chars) categorised into 1400 clusters. I now need to filter down this dataset to a much smaller dataset of cleaned sentences appropriate for use within the app.

I want to filter things like:

Websites and email addresses

Long runs of the same character, such as "Awwwwwwww". Filtering runs of three would discard some Roman numerals (e.g. "III"), so I chose to filter runs of four or more.



In [1]:
import os
import re
import pandas as pd
import constants

In [2]:
# Language to process; the file paths and per-language settings in later cells are looked up by this code.
constants.language_code = 'fr'

In [3]:
# Load the step-2 sentences for the selected language (tab-separated file).
filepath = f"../output_files/{constants.language_code}/step2_sentences.csv"
df = pd.read_csv(filepath, sep='\t')

In [4]:
# Non-null value count per column, i.e. the size of the dataset before any filtering.
df.count()

id             1276184
sentence       1276184
translation    1276184
dtype: int64

In [5]:
# First get all sentences with four or more of the same roman letter in a row
# (case-insensitive), so they can be inspected before being removed.
# TODO: this needs to include special characters used in each language
# The back-reference needs a capture group, and `str.contains` warns about patterns
# that contain groups, so the compiled pattern is applied to each sentence instead.
repeated_letter = re.compile(r'([a-z])\1{3}', flags=re.IGNORECASE)
has_repeated_letter = df['sentence'].map(
    # Non-string cells (e.g. NaN) count as "no match", like na=False did.
    lambda s: isinstance(s, str) and repeated_letter.search(s) is not None
).astype(bool)
df_filtered = df[has_repeated_letter]

  df_filtered = df[df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [6]:
# Number of sentences flagged by the repeated-letter check.
df_filtered.count()

id             220
sentence       220
translation    220
dtype: int64

In [7]:
# Peek at the first few flagged sentences to sanity-check the pattern.
df_filtered.head()

Unnamed: 0,id,sentence,translation
3787,6594150.0,{\fs72\b0\cHFFFFFF}Tu mens et tu le sais.,That's a lie and you know it.
54961,39221626.0,{\1cH00ffff}Une vraie dent de dinosaure.,That is a genuine dinosaur tooth.
57332,17786412.0,Je voudrais finir mon morse à la crème ! - Mmm...,Can't I have a moment of peace and quiet to ea...
59945,16253612.0,{\q2\a2\cHffffff}{\cH00ffff} Après toi.,After you.
69054,16253226.0,{\q2\a2\cHffffff}{\cH00ffff} J'ai du travail.,I've got work to do.


In [8]:
# Remove sentences with four or more of the same roman letter in a row (case-insensitive).
# The back-reference needs a capture group, and `str.contains` warns about patterns
# that contain groups, so the compiled pattern is applied to each sentence instead.
repeated_letter = re.compile(r'([a-z])\1{3}', flags=re.IGNORECASE)
has_repeated_letter = df['sentence'].map(
    # Non-string cells (e.g. NaN) count as "no match", so those rows are kept as before.
    lambda s: isinstance(s, str) and repeated_letter.search(s) is not None
).astype(bool)
df = df[~has_repeated_letter]

  df = df[~df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [9]:
# Drop sentences that look like they contain a web address or an email:
# "www.", "@", "http://", a double backslash, or a double forward slash.
looks_like_url_or_email = df['sentence'].str.contains(
    r'www\.|@|http://|\\\\|//', regex=True, na=False
)
df = df[~looks_like_url_or_email]

In [10]:
# Rows remaining after the repeated-letter and web/email filters.
df.count()

id             1275910
sentence       1275910
translation    1275910
dtype: int64

In [11]:
# Apart from anticonstitutionnellement and a few others, there are effectively no French
# words of 20 or more letters, so any sentence containing an unbroken run of word
# characters that long should probably be removed.
# This doesn't account for hyphens and apostrophes, but words this long might cause
# wrapping issues in the app anyway.
long_word_patterns = {
    'fr': r'\b\w{20,}\b',
    # Longest commonly-used word in German seems to be around 40 characters
    'de': r'\b\w{40,}\b',
}
regex = long_word_patterns[constants.language_code]

has_overlong_word = df['sentence'].str.contains(regex, regex=True, na=False)
df = df[~has_overlong_word]

In [12]:
# Remove any grammatical weirdness: drop every sentence that contains one of the
# fragments below (doubled punctuation, maths/markup symbols, stray currency signs, ...).
#
# Each entry is a regular-expression fragment. Entries that need a backslash are raw
# strings so the escape reaches the regex engine unchanged; a plain '\.' is an invalid
# string escape and triggers a warning on current Python versions.
strings_to_remove = [
    r'\.\.', "''", '""', '--', '=',
    '>', '<', r'\?\?', r'\(\(', r'\)\)', '#', r'\+\+\+', r'\^',
    r'\|', r'\$\$', '££', '€€', '¥¥', '%%', '&&', r'\*', '!!',
    '~', ';', ':', '¬', '`',
    # NOTE(review): 'œ' is a regular French letter (cœur, sœur) - confirm that dropping
    # every sentence containing it is intended.
    'œ',
]

# Create the regex pattern: a sentence is removed if any single fragment matches
regex = '|'.join(strings_to_remove)

df = df[~df['sentence'].str.contains(regex, regex=True, na=False)]

In [13]:
# Rows remaining after the long-word and punctuation/symbol filters.
df.count()

id             1162550
sentence       1162550
translation    1162550
dtype: int64

In [14]:
# Remove any sentences that contain three or more separate strings of digits. This is
# fairly restrictive but the dataset is large so I can afford to be picky

def count_numbers(sentence):
    """Return how many standalone digit runs appear in *sentence*.

    A run only counts when it is delimited by word boundaries, so digits glued to
    letters ("abc123", "3e") are not counted. Repeated numbers are each counted;
    the result is a total, not a count of distinct values.

    Non-string input (for example NaN from a missing cell) yields 0 instead of
    raising, so the function can be applied to a column that still has gaps.
    """
    if not isinstance(sentence, str):
        return 0
    return len(re.findall(r'\b\d+\b', sentence))

# Count the numbers in every sentence and keep only the rows with fewer than three.
# The counts are held in a temporary Series, so no helper column has to be added to
# the frame and dropped again afterwards.
numbers_per_sentence = df['sentence'].apply(count_numbers)
df = df[numbers_per_sentence < 3]

In [15]:
# Rows remaining after removing sentences with three or more numbers.
df.count()

id             1160914
sentence       1160914
translation    1160914
dtype: int64

In [16]:
# Remove any sentences that don't start with an allowed roman character

# Character class of allowed starting characters for each language
allowed_chars = {
    'fr': '[a-zA-ZéèêëÉÈÊËàâäÀÂÄôöÔÖûüÛÜçÇîÎïÏ]',
    'de': '[a-zA-ZäöüÄÖÜß]'
    }[constants.language_code]

# `allowed_chars` already carries its own square brackets, so wrapping it in a second
# `[^...]` nests the class: the regex engine reads that as a negated class followed by a
# literal `]`, which matches almost no sentence. Match the class positively instead;
# `str.match` anchors at the start of the string.
pattern = f'^{allowed_chars}'

# Keep the sentences whose first character is allowed. Empty or missing sentences have
# no allowed first character and are dropped as well.
df = df[df['sentence'].str.match(pattern, na=False)]

In [19]:
# Drop sentences whose first character is a space or a hyphen.
starts_with_space_or_dash = df['sentence'].str.startswith((' ', '-'))
df = df[~starts_with_space_or_dash]

In [17]:
# Remove any sentences where there's an uneven number of quotation marks

# Quotation marks grouped into families. Marks in one family open and close each other,
# and which one opens depends on the language (“…” in English, „…“ in German, «…» in
# French), so balance is checked on the family's combined count instead of on every
# character separately. Counting each character on its own flags a correct « … » pair
# as uneven, because it contains exactly one of each mark.
quotation_mark_families = [
    '"',
    '“”„‟',
    "'‘’‚‛",
    '«»',
    '‹›',
    '`',
    '「」',
    '『』',
    '《》',
    '〈〉',
    '｢｣',
    '｟｠',
    '′',
    '″',
    # NOTE(review): '｡' is a halfwidth full stop rather than a quotation mark; it is
    # kept only because the original list contained it - confirm whether it belongs.
    '｡',
]

# Flat, de-duplicated list of every mark that is checked.
quotation_marks = [mark for family in quotation_mark_families for mark in family]

# An apostrophe with a word character on both sides (J'ai, l’homme, don't) is an
# elision/contraction, not a quotation mark, and must not count towards the balance.
_WORD_INTERNAL_APOSTROPHE = re.compile(r"(?<=\w)['’](?=\w)")


def has_uneven_quotation_marks(sentence):
    """Return True when some family of quotation marks occurs an odd number of times.

    Word-internal apostrophes are ignored before counting. A sentence without any
    quotation marks returns False.
    """
    text = _WORD_INTERNAL_APOSTROPHE.sub('', sentence)
    return any(
        sum(text.count(mark) for mark in family) % 2 != 0
        for family in quotation_mark_families
    )

# Flag every sentence with unbalanced quotation marks, then keep the rest.
has_uneven_quotes = df['sentence'].apply(has_uneven_quotation_marks)
df = df[~has_uneven_quotes]


In [20]:
# Save the filtered dataframe as a tab-separated csv, without the pandas index.
# NOTE(review): the original comment said the id column is removed here, but no column
# is dropped - every column, including id, is written. Confirm which is intended.
output_path = f"../output_files/{constants.language_code}/step3_sentences.csv"
df.to_csv(output_path, sep='\t', index=False)