I now have a huge database of French sentences (all under 200 characters) categorised into 1,400 clusters. I need to filter this dataset down to a much smaller set of cleaned sentences appropriate for use within the app.

I want to filter things like:

Websites and email addresses

Long strings of the same character, such as Awwwwwwww. Filtering on three in a row would also discard some Roman numerals (e.g. III), so I chose four or more.



In [34]:
import os
import re
import pandas as pd

In [35]:
# Load the clustered French sentences; the resulting frame has the columns
# id, sentence and cluster.
filepath = "./french_sentences_with_cluster_labels.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

In [36]:
df.count()

id          850171
sentence    850171
cluster     850171
dtype: int64

In [37]:
# Keep only sentences that are at least 70 characters long; the upper bound
# of 200 characters was already enforced upstream.
sentence_lengths = df['sentence'].str.len()
df = df.loc[sentence_lengths >= 70]

In [40]:
df.count()

id          640377
sentence    640377
cluster     640377
dtype: int64

In [41]:
# Collect every sentence that contains the same Latin letter (a-z, either
# case) four or more times in a row, so the matches can be inspected first.
repeated_letter = df['sentence'].str.contains(
    r'([a-z])\1\1\1', case=False, na=False, regex=True
)
df_filtered = df[repeated_letter]

  df_filtered = df[df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [42]:
df_filtered.count()

id          70
sentence    70
cluster     70
dtype: int64

In [43]:
df_filtered.head()

Unnamed: 0,id,sentence,cluster
2807,3338,"Cher M XXXXXX, CANALSAT lance aujourd'hui une ...",1122
3521,4178,Rougui est passé par là: il a fait un gros tra...,1215
23173,27182,"Attention : ""La BCD"" nécessite que le format d...",126
28796,33661,Mon espérence sur ce peut-être nouveau zelda s...,893
40245,47148,Sachant que « XXXX » est le nom de l'album et ...,156


In [44]:
# Drop every sentence that contains four or more identical letters in a row.
has_letter_run = df['sentence'].str.contains(
    r'([a-z])\1\1\1', case=False, na=False, regex=True
)
df = df.loc[~has_letter_run]

  df = df[~df['sentence'].str.contains(r'([a-z])\1\1\1', regex=True, na=False, case=False)]


In [45]:
# Drop sentences that look like they contain a web address or an email:
# "www.", "@", "http://", a double backslash, or a double forward slash.
looks_like_link = df['sentence'].str.contains(
    r'www\.|@|http://|\\\\|//', na=False, regex=True
)
df = df.loc[~looks_like_link]

In [46]:
df.count()

id          638773
sentence    638773
cluster     638773
dtype: int64

In [47]:
# Apart from anticonstitutionnellement and a handful of others, French has
# effectively no words of 20 letters or more, so any sentence containing a run
# of 20+ word characters is dropped. Hyphens and apostrophes break a run, so
# long compounds are not caught; runs this long might cause wrapping issues
# in the app anyway.
has_overlong_token = df['sentence'].str.contains(
    r'\b\w{20,}\b', na=False, regex=True
)
df = df.loc[~has_overlong_token]

In [48]:
# Drop sentences containing any of the punctuation sequences listed in the
# string below; they usually indicate markup, emoticons or otherwise messy text.
"""
..
''
""
--
==
->
=>
:)
:-)
:(
??
((
))
&#
"""
has_odd_punctuation = df['sentence'].str.contains(
    r'\.\.|\'\'|\"\"|--|==|->|=>|:\)|:-\)|:\(|\?\?|\(\(|\)\)|&#',
    na=False,
    regex=True,
)
df = df.loc[~has_odd_punctuation]

In [49]:
df.count()

id          624015
sentence    624015
cluster     624015
dtype: int64

In [50]:
# Sentences with three or more standalone numbers get dropped. Every digit
# run is counted, repeats included. This is fairly strict, but the dataset is
# large enough to afford being picky.

def count_numbers(sentence):
    """Return how many standalone runs of digits appear in ``sentence``.

    A run only counts when it has a word boundary on both sides, so digits
    glued to letters (for example "3ème") are not counted.
    """
    return sum(1 for _ in re.finditer(r'\b\d+\b', sentence))

# Count the numbers in each sentence, keep only the rows with fewer than
# three, then discard the temporary helper column again.
df = (
    df.assign(number_count=df['sentence'].apply(count_numbers))
    .loc[lambda frame: frame['number_count'] < 3]
    .drop(columns=['number_count'])
)

In [51]:
df.count()

id          583057
sentence    583057
cluster     583057
dtype: int64

In [52]:
# Filter to 100,000 randomly selected sentences with seed
#n_rows = 100_000

#reduced_df = df.sample(n=n_rows, random_state=1)

In [53]:
# Space to try new matches
#df_filtered = df[df['sentence'].str.contains(r'&#', regex=True, na=False)]

In [54]:
#df_filtered.head(50)

In [55]:
#reduced_df.head(50)

In [56]:
# Save the cleaned dataframe as a tab-separated file, without the index.
# NOTE(review): the id column is not dropped in any of the visible cells, so
# it is written to the file as well — confirm whether it should be removed.
df.to_csv("./french_sentences_cleaned_with_cluster_labels.csv", sep='\t', index=False)

#reduced_df.to_csv("./reduced_french_sentences_cleaned_with_cluster_labels.csv", sep='\t', index=False)