Now I have a huge database of sentences (All under 200 chars) categorised into 1400 clusters. I now need to filter down this dataset to a much smaller dataset of cleaned sentences appropriate for use within the app.

I want to filter things like:

Websites and email addresses

Long strings of the same character, such as Awwwwwwww. Filtering three will discard some roman numerals so four or more is what I chose



In [40]:
import os
import re
import pandas as pd
import constants

In [41]:
constants.language_code = 'fr'

In [42]:
filepath = f"../output_files/{constants.language_code}/step2_sentences.csv"

df = pd.read_csv(filepath, delimiter='\t')

In [43]:
df.count()

sentence               215109
translated_sentence    215109
dtype: int64

In [44]:
regex = {
    'fr': '[a-zA-ZéèêëÉÈÊËàâäÀÂÄôöÔÖûüÛÜçÇîÎïÏ]',
    'de': '[a-zA-ZäöüÄÖÜß]',
    'ru': '[А-Яа-яЁё]',
    'th': '[]'
}[constants.language_code]

# First get all sentences with three or more of the same character in a row
# TODO: this needs to include special characters used in each language
if constants.language_code != 'th':
    df_filtered = df[df['sentence'].str.contains(fr'({regex})\1\1\1', regex=True, na=False, case=False)]

  df_filtered = df[df['sentence'].str.contains(fr'({regex})\1\1\1', regex=True, na=False, case=False)]


In [45]:
# Remove these from the dataset
df = df[~df['sentence'].str.contains(fr'({regex})\1\1\1', regex=True, na=False, case=False)]

  df = df[~df['sentence'].str.contains(fr'({regex})\1\1\1', regex=True, na=False, case=False)]


In [46]:
# Remove web addresses and emails
df = df[~df['sentence'].str.contains(r'www\.|@|http://|\\\\|//', regex=True, na=False)]

In [47]:
df.count()

sentence               215102
translated_sentence    215102
dtype: int64

In [48]:
# Apart from anticonstitutionnellement and a few others, there are effectively no words over 20 letters in french, so any sentences with a string of characters longer than this should probably be removed.
# This doesn't account for hyphens and apostrophes but words this long might cause wrapping issues in the app anyway.
# 19

# NOTE: Shorthand w is only standard alphanumeric so have to change

regex = {
    'fr': r'\b\w{20,}\b',
    'de': r'\b\w{40,}\b', # Longest commonly-used word in german seems to be around 40 characters,
    'ru': r'\b\w{32,}\b', # рентгеноэлектрокардиографический
    'th': ''
}[constants.language_code]

if constants.language_code not in ['th']: # Not applicable for thai
    df = df[~df['sentence'].str.contains(regex, regex=True, na=False)]

In [49]:
# Remove any grammatical weirdness

strings_to_remove = ['\\', '\.\.', "''", '""', '--', '=',
    '>', '<', '\?\?', '\(\(', '\)\)', '#', '\+\+\+', '\^',
    '\|', '\$\$', '££', '€€', '¥¥', '%%', '&&', '\*', '!!',
    '~', ';', ':', '¬', '`', 'œ', '♪', '╞', '╢', '╪', '╟', 
    '▌', '┴', '╩', '│', '']

# Create the regex pattern
regex = '|'.join(strings_to_remove)

df = df[~df['sentence'].str.contains(fr'{regex}', regex=True, na=False)]

In [50]:
df.count()

sentence               212090
translated_sentence    212090
dtype: int64

In [51]:
# Remove any sentences that contain three or more unique strings of numbers. This is fairly
# restrictive

def count_numbers(sentence):
    return len(re.findall(r'\b\d+\b', sentence))

# Apply the function to the 'sentence' column
df['number_count'] = df['sentence'].apply(count_numbers)

# Remove rows where 'number_count' is three or more
df = df[df['number_count'] < 3]

# Optionally, remove the 'number_count' column
df = df.drop(columns=['number_count'])

In [52]:
df.count()

sentence               212024
translated_sentence    212024
dtype: int64

In [53]:
# Remove any sentences that don't start with an allowed character

# Define the list of allowed starting characters
allowed_chars = {
    'fr': '[a-zA-ZéèêëÉÈÊËàâäÀÂÄôöÔÖûüÛÜçÇîÎïÏ]',
    'de': '[a-zA-ZäöüÄÖÜß]',
    'ru': '[А-Яа-яЁё]',
    'th': ''
    }[constants.language_code]

if constants.language_code not in ['th']: # Not applicable for thai

    pattern = f'^[^{allowed_chars}]'

    # Use pattern to remove sentences that don't start with the allowed characters
    df = df[~df['sentence'].str.match(pattern)]

In [54]:
# Remove sentences starting with a space or -. Not sure why these weren't removed before
df = df[~df['sentence'].str.startswith((' ', '-', '—'))]
df = df[~df['translated_sentence'].str.startswith((' ', '-', '—'))]

In [55]:
# In russian and thai, remove any sentences that contain roman characters
# This will miss any accented characters, which is a potential issue
if constants.language_code in ['ru', 'th']:
    df = df[~df['sentence'].str.contains('[a-zA-Z]', regex=True)]

In [56]:
# Remove any sentences where there's an uneven number of quotation marks

# Define the quotation marks
quotation_marks = [
    "'", '"', '`', '«', '»', '‘', '’', '“', '”', '„', '‚', '「',
    '」', '『', '』', '《', '》', '〈', '〉', '‚', '‛', '„', '‟',
    '‘', '’', '“', '”', '‹', '›', '«', '»', '｢', '｣', '｡', '｠',
    '｟', '｠', '′', '″'
    ]

def has_uneven_quotation_marks(sentence):
    for mark in quotation_marks:
        if sentence.count(mark) % 2 != 0:
            return True
    return False

df = df[~df['sentence'].apply(has_uneven_quotation_marks)]


In [57]:
# Remove id column and save dataframes as csv
df.to_csv(f"../output_files/{constants.language_code}/step3_sentences.csv", sep='\t', index=False)