I now have a huge database of French sentences (all under 200 characters) categorised into 1400 clusters. The next step is to filter this down to a much smaller dataset of clean sentences that are appropriate for use within the app.

I want to filter things like:

Websites and email addresses

Long runs of the same character, such as "Awwwwwwww". Filtering on three in a row would also discard some Roman numerals (e.g. "III"), so I chose to filter on four or more.



In [24]:
import os
import re
import pandas as pd
import constants

In [25]:
# Read the tab-separated step-2 sentence file for the configured language.
filepath = f"../output_files/{constants.language_code}/step2_sentences.csv"
df = pd.read_csv(filepath, sep='\t')

In [26]:
# Show the number of non-null values per column right after loading.
df.count()

id          826973
sentence    826973
dtype: int64

In [27]:
# First get all sentences with four or more of the same roman letter (a-z,
# ignoring case) in a row, e.g. "Awwww". Three in a row is tolerated so that
# roman numerals such as "III" are not caught.
# TODO: this needs to include special characters used in each language
#
# The back-reference requires a capture group, and Series.str.contains emits a
# UserWarning for any pattern with groups, so the compiled pattern is applied
# directly instead. Non-string values (e.g. NaN) count as "no match".
repeated_letter = re.compile(r'([a-z])\1\1\1', re.IGNORECASE)
has_repeated_letter = df['sentence'].map(
    lambda s: isinstance(s, str) and repeated_letter.search(s) is not None
).astype(bool)  # force bool dtype so this is always treated as a row mask
df_filtered = df[has_repeated_letter]

  df_filtered = df[df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [28]:
# Show how many sentences matched the repeated-letter pattern.
df_filtered.count()

id          95
sentence    95
dtype: int64

In [29]:
# Preview the first few sentences that matched the repeated-letter pattern.
df_filtered.head()

Unnamed: 0,id,sentence
2731,3338,"Cher M XXXXXX, CANALSAT lance aujourd'hui une ..."
3424,4178,Rougui est passé par là: il a fait un gros tra...
16987,20460,ZZZzzz C'est vraiment le dernier?
22584,27182,"Attention : ""La BCD"" nécessite que le format d..."
28052,33661,Mon espérence sur ce peut-être nouveau zelda s...


In [30]:
# Remove sentences with four or more of the same roman letter (a-z, ignoring
# case) in a row from the dataset.
#
# The back-reference requires a capture group, and Series.str.contains emits a
# UserWarning for any pattern with groups, so the compiled pattern is applied
# directly instead. Non-string values (e.g. NaN) count as "no match" and are
# therefore kept, as they were with na=False.
repeated_letter = re.compile(r'([a-z])\1\1\1', re.IGNORECASE)
has_repeated_letter = df['sentence'].map(
    lambda s: isinstance(s, str) and repeated_letter.search(s) is not None
).astype(bool)  # force bool dtype so this is always treated as a row mask
df = df[~has_repeated_letter]

  df = df[~df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [31]:
# Drop sentences that look like they contain a web address, an email address
# or a path: "www.", "@", "http://", a double backslash or a double slash.
web_or_email = r'www\.|@|http://|\\\\|//'
looks_like_address = df['sentence'].str.contains(web_or_email, regex=True, na=False)
df = df[~looks_like_address]

In [32]:
# Show the remaining non-null counts after the repeated-letter and web/email filters.
df.count()

id          825083
sentence    825083
dtype: int64

In [33]:
# Drop sentences containing an implausibly long run of word characters.
# Apart from anticonstitutionnellement and a few others, French effectively has
# no words this long (threshold: 20 characters); for German the longest
# commonly-used words seem to be around 40 characters.
# \w does not match hyphens or apostrophes, so those split a run - but words
# this long might cause wrapping issues in the app anyway.
long_word_patterns = {
    'fr': r'\b\w{20,}\b',
    'de': r'\b\w{40,}\b',
}
regex = long_word_patterns[constants.language_code]

contains_long_word = df['sentence'].str.contains(regex, regex=True, na=False)
df = df[~contains_long_word]

In [34]:
# Remove any grammatical weirdness: a sentence is dropped if it contains any of
# the literal character sequences below (doubled punctuation, maths/markup
# symbols, colons and semicolons, ...).
#
# The entries are plain text, not regex fragments; re.escape() turns each one
# into a safe pattern, so there are no hand-written backslash escapes (which
# are invalid string escapes such as '\.' unless the literal is raw).
strings_to_remove = [
    '..', "''", '""', '--', '=',
    '>', '<', '??', '((', '))', '#', '+++', '^',
    '|', '$$', '££', '€€', '¥¥', '%%', '&&', '*', '!!',
    '~', ';', ':', '¬', '`',
]

# Create the regex pattern: one alternation matching any of the sequences
regex = '|'.join(re.escape(s) for s in strings_to_remove)

df = df[~df['sentence'].str.contains(regex, regex=True, na=False)]

In [35]:
# Show the remaining non-null counts after the long-word and punctuation filters.
df.count()

id          737905
sentence    737905
dtype: int64

In [36]:
# Remove any sentences that contain three or more separate numbers (every
# occurrence counts, even if the same number is repeated). This is fairly
# restrictive but the dataset is large so I can afford to be picky

# A "number" is a run of digits with a word boundary on both sides, so digits
# glued to letters (e.g. "abc123", "12h30") are not counted.
_NUMBER_PATTERN = re.compile(r'\b\d+\b')


def count_numbers(sentence):
    """Return how many standalone digit runs occur in ``sentence``.

    Args:
        sentence: The text to inspect. Non-string values (such as the NaN
            pandas uses for a missing cell) are treated as containing no
            numbers.

    Returns:
        The number of non-overlapping matches of ``\\b\\d+\\b``.
    """
    if not isinstance(sentence, str):
        return 0
    return len(_NUMBER_PATTERN.findall(sentence))

# Count the numbers in each sentence and keep only rows with fewer than three.
# The counts are used purely as a mask instead of being stored in a temporary
# column: df is already a filtered selection, and assigning a new column into
# it can trigger pandas' SettingWithCopyWarning. It also means there is no
# helper column to drop afterwards.
number_counts = df['sentence'].apply(count_numbers)
df = df[number_counts < 3]

In [37]:
# Show the remaining non-null counts after the number filter.
df.count()

id          698449
sentence    698449
dtype: int64

In [38]:
# Remove any sentences that don't start with an allowed roman character

# Character class of allowed starting characters, per language
allowed_chars = {
    'fr': '[a-zA-ZéèêëÉÈÊËàâäÀÂÄôöÔÖûüÛÜçÇîÎïÏ]',
    'de': '[a-zA-ZäöüÄÖÜß]'
    }

# Each entry is already a complete [...] class, so it is only anchored to the
# start of the string; the class for the configured language is selected
# explicitly rather than formatting the whole dict into the pattern.
pattern = f'^{allowed_chars[constants.language_code]}'

# Keep only the sentences whose first character is in the allowed class.
# Missing values and empty strings have no first character and are dropped.
df = df[df['sentence'].str.contains(pattern, regex=True, na=False)]

In [39]:
# Remove any sentences where the quotation marks don't pair up

# Quotation marks grouped by family. The marks inside one family open and close
# each other, and which one opens depends on the language (« ... » in French,
# „ ... “ in German, “ ... ” in English), so parity is checked on the total for
# the family rather than on each mark separately.
quotation_mark_groups = [
    ('"',),
    ('`',),
    ("'",),
    ('«', '»'),
    ('‹', '›'),
    ('“', '”', '„', '‟'),
    ('‘', '’', '‚', '‛'),
    ('「', '」'),
    ('『', '』'),
    ('《', '》'),
    ('〈', '〉'),
    ('｢', '｣'),
    ('｟', '｠'),
    ('｡',),
    ('′',),
    ('″',),
]

# Flat list of every mark, kept under its original name
quotation_marks = [mark for group in quotation_mark_groups for mark in group]

# An apostrophe with a word character on both sides (C'est, aujourd’hui) is an
# elision, not a quotation mark, and must not take part in the parity check.
_IN_WORD_APOSTROPHE = re.compile(r"(?<=\w)['’](?=\w)")


def has_uneven_quotation_marks(sentence):
    """Return True if any family of quotation marks occurs an odd number of times.

    Args:
        sentence: The text to inspect. Non-string values (such as the NaN
            pandas uses for a missing cell) are reported as not uneven.

    Returns:
        True when, after ignoring in-word apostrophes, the combined count of
        the marks in at least one family of ``quotation_mark_groups`` is odd;
        False otherwise.
    """
    if not isinstance(sentence, str):
        return False
    text = _IN_WORD_APOSTROPHE.sub('', sentence)
    return any(
        sum(text.count(mark) for mark in group) % 2 != 0
        for group in quotation_mark_groups
    )

# Flag each sentence with has_uneven_quotation_marks and keep the unflagged rows.
uneven_quotes = df['sentence'].apply(has_uneven_quotation_marks)
df = df[~uneven_quotes]


In [40]:
# Save the filtered sentences as a tab-separated file, without the row index.
# NOTE(review): the 'id' column is not dropped before saving, so it is written
# too - confirm whether that is intended.
df.to_csv(f"../output_files/{constants.language_code}/step3_sentences.csv", sep='\t', index=False)