I now have a huge database of French sentences (all under 200 characters) categorised into 1400 clusters. I need to filter it down to a much smaller dataset of clean sentences appropriate for use within the app.

I want to filter things like:

Websites and email addresses

Long runs of the same character, such as "Awwwwwwww". Filtering on three in a row would discard some Roman numerals (e.g. "III"), so I chose four or more.



In [76]:
import os
import re
import pandas as pd
import constants

In [77]:
# Language code for this run; it is used below to build the input/output paths
# and to pick the per-language regexes.
constants.language_code = 'de'

In [78]:
# Load the tab-separated step2 sentences file for the selected language
filepath = f"../output_files/{constants.language_code}/step2_sentences.csv"
df = pd.read_csv(filepath, sep='\t')

In [79]:
# Non-null count per column before any filtering
df.count()

id          2355301
sentence    2355301
dtype: int64

In [80]:
# Preview the sentences that contain four or more of the same basic Latin letter
# in a row (case-insensitive). Three in a row is deliberately allowed.
# TODO: this needs to include special characters used in each language
repeated_letter_hits = df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)
df_filtered = df[repeated_letter_hits]

  df_filtered = df[df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [81]:
# How many sentences matched the repeated-letter pattern
df_filtered.count()

id          282
sentence    282
dtype: int64

In [82]:
# Show the first few matching sentences as a sanity check
df_filtered.head()

Unnamed: 0,id,sentence
2039,2620,1.13 ist doch eine Mod( und zwar eine seeeeeee...
5026,6404,17.37 Uhr: Neumayer könnte indes noch über die...
9603,12299,"26. Minute: Eeeeey Schiri, das war Handspiel!"
12191,15597,39. Minute (2. Drittel) Toooooooooor für die L...
13140,16697,3. Oleeee Oleeee Olee Brot und Spiele mal ganz...


In [83]:
# Drop every sentence containing a run of four or more identical basic Latin letters
has_letter_run = df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)
df = df[~has_letter_run]

  df = df[~df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [84]:
# Drop sentences that look like they contain a web address or e-mail address:
# "www.", "@", "http://", a double backslash or a double forward slash
url_or_email_regex = '|'.join([r'www\.', '@', 'http://', r'\\\\', '//'])
df = df[~df['sentence'].str.contains(url_or_email_regex, regex=True, na=False)]

In [85]:
# Non-null count per column after the repeated-letter and URL/e-mail filters
df.count()

id          2349588
sentence    2349588
dtype: int64

In [86]:
# Drop sentences containing an implausibly long "word" (an unbroken run of \w characters).
# fr: apart from anticonstitutionnellement and a few others, there are effectively no
#     words over 20 letters in French; the pattern matches runs of 20 or more.
# de: the longest commonly-used word in German seems to be around 40 characters.
# This doesn't account for hyphens and apostrophes, but words this long might cause
# wrapping issues in the app anyway.
long_word_patterns = {
    'fr': r'\b\w{20,}\b',
    'de': r'\b\w{40,}\b',
}
regex = long_word_patterns[constants.language_code]

contains_long_word = df['sentence'].str.contains(regex, regex=True, na=False)
df = df[~contains_long_word]

In [87]:
# Remove any grammatical weirdness
"""
..
''
""
--
=
>
<
??
((
))
#
+++
^
|
$$
££
€€
¥¥
%%
&&
*
!!
~
;
:
¬
"""

# Regex fragments for punctuation/symbols that mark a sentence as unsuitable.
# Raw strings are required here: in a normal string literal '\.', '\?', '\(' etc.
# are invalid escape sequences, which newer Python versions warn about.
strings_to_remove = [
    r'\.\.', "''", '""', '--', '=',
    '>', '<', r'\?\?', r'\(\(', r'\)\)', '#', r'\+\+\+', r'\^',
    r'\|', r'\$\$', '££', '€€', '¥¥', '%%', '&&', r'\*', '!!',
    '~', ';', ':', '¬', '`',
]

# Create the regex pattern: a sentence is dropped if it contains any of the fragments
regex = '|'.join(strings_to_remove)

df = df[~df['sentence'].str.contains(regex, regex=True, na=False)]

In [88]:
# Non-null count per column after the long-word and punctuation filters
df.count()

id          2057956
sentence    2057956
dtype: int64

In [89]:
# Remove any sentences that contain three or more separate strings of digits (repeats of
# the same number each count). This is fairly restrictive but the dataset is large so I
# can afford to be picky

def count_numbers(sentence):
    """Return how many stand-alone digit strings appear in *sentence*.

    A digit string is a run of digits delimited by word boundaries, so
    "17.37" counts as two while the digits in "abc123" are not counted.
    Non-string values (e.g. the NaN pandas uses for an empty cell) count
    as 0 instead of raising a TypeError.
    """
    if not isinstance(sentence, str):
        return 0
    return len(re.findall(r'\b\d+\b', sentence))

# Count the digit strings in every sentence and keep only the rows with fewer
# than three. The counts are used purely as a filter key, so the dataframe keeps
# the columns it had before.
number_counts = df['sentence'].apply(count_numbers)
df = df[number_counts < 3]

In [90]:
# Non-null count per column after the number filter
df.count()

id          1985158
sentence    1985158
dtype: int64

In [91]:
# Remove any sentences that don't start with an allowed roman character

# Character class of allowed starting characters for the selected language
allowed_chars = {
    'fr': '[a-zA-ZéèêëÉÈÊËàâäÀÂÄôöÔÖûüÛÜçÇîÎïÏ]',
    'de': '[a-zA-ZäöüÄÖÜß]'
    }[constants.language_code]

# allowed_chars is already a bracketed class, so it must not be wrapped in another
# [^...]: '^[^[a-z...]]' parses as "one character outside the set, followed by a
# literal ']'" and matches almost nothing. Match the allowed start positively instead.
pattern = f'^{allowed_chars}'

# Keep only sentences whose first character is allowed; missing values are dropped
df = df[df['sentence'].str.match(pattern, na=False)]

In [92]:
# Remove any sentences where the quotation marks are unbalanced

# Quotation marks grouped by family. Marks in the same group can open/close each
# other (e.g. „ … “ in German, « … » in French), so balance is checked on the
# group's combined count rather than on each character separately.
quotation_mark_groups = (
    "'",
    '"',
    '`',
    '«»',
    '‹›',
    '“”„‟',
    '‘’‚‛',
    '「」',
    '『』',
    '《》',
    '〈〉',
    '｢｣',
    '｟｠',
    '｡',
    '′',
    '″',
)

# Flat list of every character treated as a quotation mark (duplicates removed)
quotation_marks = [mark for group in quotation_mark_groups for mark in group]

def has_uneven_quotation_marks(sentence):
    """Return True if any quotation-mark family occurs an odd number of times.

    A correctly quoted sentence such as „Hallo“ contains each mark once, so
    counting every character on its own would reject it. Counting per family
    accepts balanced pairs while still flagging a lone opening or closing mark.
    A single apostrophe (' or ’) is still reported as uneven, as before.
    This is a parity check only: it does not verify that openers precede closers.
    """
    return any(
        sum(sentence.count(mark) for mark in group) % 2 != 0
        for group in quotation_mark_groups
    )

# Flag the sentences with unbalanced quotation marks, then keep the rest
uneven_quotes = df['sentence'].apply(has_uneven_quotation_marks)
df = df[~uneven_quotes]


In [93]:
# Save the filtered dataframe as a tab-separated file, without the index.
# NOTE(review): the previous comment said the id column is removed here, but no
# column is dropped before saving — confirm which is intended.
output_path = f"../output_files/{constants.language_code}/step3_sentences.csv"
df.to_csv(output_path, sep='\t', index=False)