I now have a huge database of French sentences (all under 200 characters) categorised into 1400 clusters. The next step is to filter this down to a much smaller dataset of clean sentences appropriate for use within the app.

I want to filter things like:

Websites and email addresses

Long runs of the same character, such as "Awwwwwwww". Filtering on three in a row would discard some Roman numerals (e.g. "III"), so I chose four or more.



In [31]:
import os
import re
import pandas as pd

In [32]:
# Language whose sentence file is processed by this notebook
language_code = 'de'

# Output of the previous step: a tab-separated file with a header row
filepath = f'../output_files/{language_code}/step2_sentences.csv'
df = pd.read_csv(filepath, sep='\t')

In [33]:
# Non-null count per column of the freshly loaded data (baseline before any filtering)
df.count()

id          2355301
sentence    2355301
dtype: int64

In [34]:
# Same check repeated; nothing has been filtered yet, so the counts are unchanged
df.count()

id          2355301
sentence    2355301
dtype: int64

In [35]:
# Collect every sentence that contains the same letter four or more times in a row
# (e.g. "Toooor"): the pattern is one captured letter followed by three backreferences.
# Only the unaccented letters a-z are covered, matched case-insensitively.
# TODO: Change this to include specific characters for each language
repeated_letter_mask = df['sentence'].str.contains(
    r'([a-z])\1\1\1', regex=True, na=False, case=False
)
df_filtered = df[repeated_letter_mask]

  df_filtered = df[df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [36]:
# How many sentences matched the repeated-letter pattern
df_filtered.count()

id          282
sentence    282
dtype: int64

In [37]:
# Preview a few matched sentences to sanity-check the pattern
df_filtered.head()

Unnamed: 0,id,sentence
2039,2620,1.13 ist doch eine Mod( und zwar eine seeeeeee...
5026,6404,17.37 Uhr: Neumayer könnte indes noch über die...
9603,12299,"26. Minute: Eeeeey Schiri, das war Handspiel!"
12191,15597,39. Minute (2. Drittel) Toooooooooor für die L...
13140,16697,3. Oleeee Oleeee Olee Brot und Spiele mal ganz...


In [38]:
# Drop the sentences containing a run of four or more identical letters from the dataset
has_letter_run = df['sentence'].str.contains(
    r'([a-z])\1\1\1', regex=True, na=False, case=False
)
df = df[~has_letter_run]

  df = df[~df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [39]:
# Drop sentences that look like they contain a web address or an e-mail address:
# anything with "www.", "@", "http://", a double backslash or a double forward slash.
url_or_email_mask = df['sentence'].str.contains(
    r'www\.|@|http://|\\\\|//', regex=True, na=False
)
df = df[~url_or_email_mask]

In [40]:
# Rows remaining after the repeated-letter and web-address/e-mail filters
df.count()

id          2349588
sentence    2349588
dtype: int64

In [41]:
# Drop sentences containing an implausibly long "word". The cut-off is chosen per language:
#   fr: 20+ characters. Very few French words are that long; anticonstitutionnellement
#       (25 letters) is one of the rare exceptions and is filtered out as well.
#   de: 40+ characters, one more than Rechtsschutzversicherungsgesellschaften (39).
# Hyphens and apostrophes are not word characters, so they split a word for this check,
# but words this long might cause wrapping issues in the app anyway.
# NOTE: for Python str patterns \w is Unicode-aware. It matches accented letters as well
# as digits and the underscore, so long digit runs are caught by this filter too.
max_word_regex = {
    'fr': r'\b\w{20,}\b',
    'de': r'\b\w{40,}\b',
}
overlong_word_mask = df['sentence'].str.contains(
    max_word_regex[language_code], regex=True, na=False
)
df = df[~overlong_word_mask]

In [42]:
# Remove any grammatical weirdness: sentences containing punctuation or symbols that
# usually signal markup, code, emoticons or otherwise noisy text.
#
# Every entry is a regular-expression fragment. Regex metacharacters are escaped, and the
# escaped entries are raw strings so the backslash reaches the regex engine unchanged
# (a non-raw '\.' is an invalid string escape that newer Python versions warn about).
#
# Not listed because a shorter entry already covers them: "->" and "=>" (contain ">"),
# ":)", ":-)" and ":(" (contain ":").
unwanted_substrings = [
    r'\.\.',            # two consecutive dots
    "''", '""',         # doubled straight quotes
    '--', '==',
    '>', '<', '›', '‹',
    r'\?\?',
    r'\(\(', r'\)\)',
    '#',
    r'\+\+\+',
    r'\^',
    r'\|',
    r'\$\$', '££', '€€', '¥¥',
    '%%', '&&',
    r'\*',
    '!!',
    '~', ';', ':', '¬', '`',
]

# One alternation matching any of the fragments above
regex = '|'.join(unwanted_substrings)

# na=False keeps this consistent with the other filters: a missing sentence yields False
# instead of NaN, so the boolean negation below cannot fail on a non-boolean mask.
df = df[~df['sentence'].str.contains(regex, regex=True, na=False)]

In [43]:
# Rows remaining after the long-word and unwanted-punctuation filters
df.count()

id          2057956
sentence    2057956
dtype: int64

In [46]:
# Sentences containing five or more separate numbers are removed below. This is fairly
# restrictive, but the dataset is large enough to be picky, and sentences that only
# mention a few numbers (a year, for example) are still kept.

def count_numbers(sentence: str) -> int:
    """Return how many standalone runs of digits occur in ``sentence``.

    A run only counts when it has word boundaries on both sides, so digits glued to
    letters ("abc123") are ignored, while "1.13" counts as two numbers. Repeated
    numbers are counted every time they appear.
    """
    return sum(1 for _ in re.finditer(r'\b\d+\b', sentence))

# Count the numbers in each sentence and keep only the rows with at most four of them.
# A boolean mask is used directly, so no helper column has to be added and dropped again.
few_numbers_mask = df['sentence'].apply(count_numbers) <= 4
df = df[few_numbers_mask]

In [47]:
# Rows remaining after the number-count filter
df.count()

id          2044233
sentence    2044233
dtype: int64

In [48]:
# Quote characters grouped by family. Opening and closing glyphs of the same style are
# counted together, so one correctly paired quotation contributes two marks (even).
_QUOTE_FAMILIES = (
    "'",
    '"',
    '“”\u201e',  # curly double quotes, plus the low-9 opener „ used in German
    '‘’\u201a',  # curly single quotes, plus the low-9 opener ‚ used in German
    '«»',
    '「」',
)


def has_odd_quotes(s: str) -> bool:
    """Return True if ``s`` contains an odd number of quote marks of any one family.

    Counting each glyph on its own would report a correctly paired «…», “…” or „…“ as
    odd (one opener plus one closer), so the marks are totalled per family instead.

    ``'`` and ``’`` also serve as apostrophes; a sentence containing a single
    apostrophe is therefore still reported as odd.

    Args:
        s: The sentence to inspect.

    Returns:
        True when at least one quote family occurs an odd number of times.
    """
    return any(
        sum(s.count(mark) for mark in family) % 2 != 0
        for family in _QUOTE_FAMILIES
    )

# Drop every sentence that has_odd_quotes flags
odd_quote_mask = df['sentence'].apply(has_odd_quotes)
df = df[~odd_quote_mask]

In [49]:
# Remove all sentences that start with a digit.
# The slice s[:1] is used instead of s[0]: for an empty sentence it yields '' and
# ''.isdigit() is False, so the row is kept rather than raising IndexError.
starts_with_digit = df['sentence'].apply(lambda s: s[:1].isdigit())
df = df[~starts_with_digit]

# TODO: Remove all sentences that start with any non-valid character

In [50]:
# Save the filtered dataframe as a tab-separated file, without the pandas index.
# NOTE(review): the previous comment said the id column is removed, but nothing here
# drops it, so `id` is written alongside `sentence`. Confirm which is intended.
output_path = f"../output_files/{language_code}/step3_sentences.csv"
df.to_csv(output_path, sep='\t', index=False)

#reduced_df.to_csv("./reduced_french_sentences_cleaned_with_cluster_labels.csv", sep='\t', index=False)