In [8]:
import pandas as pd

In [9]:
def load_stop_words(file_path):
    """
    Load stop words from a file and return them as a set.

    The file is expected to hold one stop word (or multi-word phrase) per
    line. Surrounding whitespace is stripped from every line and blank
    lines are skipped, so the returned set never contains empty or
    whitespace-padded entries.

    :param file_path: Path to the stop words file.
    :return: A set of stop words; an empty set if the file cannot be
        opened or decoded (the error is printed, not raised).
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 too, but also drops a leading BOM
        # that would otherwise be glued onto the first stop word.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            lines = file.read().splitlines()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: covers
        # UnicodeDecodeError for files that are not valid UTF-8.
        print(f"Error loading stop words: {e}")
        return set()

    stripped = (line.strip() for line in lines)
    return {word for word in stripped if word}

In [10]:
# Read the Vietnamese stop-word list once; later cells reuse `stop_words`.
print("Loading stop words...")
stop_words = load_stop_words('../data/vietnamese-stopwords.txt')

# An empty result means the file was missing, unreadable, or had no entries.
stop_word_count = len(stop_words)
if stop_word_count == 0:
    print("No stop words loaded. Please check the file path and content.")
print(f"Loaded {stop_word_count} stop words.")

Loading stop words...
Loaded 1909 stop words.


In [11]:
import re
from functools import lru_cache


@lru_cache(maxsize=8)
def _compile_stopword_pattern(stop_words_key):
    """Compile one regex matching any stop word in the given frozenset, or None if it has no usable entry."""
    # Strip stray whitespace and drop blank entries: a stop word such as " "
    # would match the space between two words and glue them together.
    usable = {word.strip() for word in stop_words_key if word.strip()}
    if not usable:
        return None

    # Longest first, so a multi-word stop word wins over its shorter prefix.
    ordered = sorted(usable, key=len, reverse=True)
    return re.compile(r'\b(?:' + '|'.join(map(re.escape, ordered)) + r')\b')


def remove_stopwords(text, stop_words):
    """
    Remove stop words from a given text, prioritizing longer stop words first.

    Matching is case-sensitive and restricted to whole words (regex word
    boundaries). After removal, runs of whitespace are collapsed to a single
    space and the result is stripped.

    The compiled pattern is cached per distinct stop word collection, so
    applying this function row by row does not rebuild the regex each time.

    :param text: The input text from which to remove stop words.
        Non-string values are returned unchanged.
    :param stop_words: A set or list of stop words to remove. Blank entries
        are ignored and surrounding whitespace on entries is stripped.
    :return: The text with stop words removed. If nothing would be left,
        the original text is returned instead.
    """
    if not isinstance(text, str):
        return text

    # frozenset gives a hashable cache key regardless of the input container.
    pattern = _compile_stopword_pattern(frozenset(stop_words))

    # Replace matched stopwords with empty string
    cleaned = pattern.sub('', text) if pattern is not None else text

    # Remove extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # If the cleaned text is empty, return the original text
    return cleaned if cleaned else text


In [12]:
# Remove stop words from a sample text
# Sanity check: run the cleaner on one hand-written sentence and show
# the input next to the output.
sample_text = "đây là một ví dụ về việc loại bỏ các từ dừng trong văn bản bước khỏi đèo hải vân bên cạnh đó là bán đảo sơn trà"
cleaned_text = remove_stopwords(sample_text, stop_words)
print(
    f"Original text: {sample_text}",
    f"Cleaned text: {cleaned_text}",
    sep="\n",
)

Original text: đây là một ví dụ về việc loại bỏ các từ dừng trong văn bản bước khỏi đèo hải vân bên cạnh đó là bán đảo sơn trà
Cleaned text: ví dụ dừng văn bản đèo hải vân đảo sơn trà


In [13]:
from tqdm import tqdm

def add_nonsw_column(input_csv, output_csv, stop_words, text_column='comment', target_column='nonsw'):
    """
    Read a CSV, add a stop-word-free copy of one text column, and save it.

    :param input_csv: Path of the CSV file to read.
    :param output_csv: Path the augmented CSV is written to (without the index).
    :param stop_words: Stop words handed to ``remove_stopwords``.
    :param text_column: Name of the column holding the raw text.
    :param target_column: Name of the column that receives the cleaned text.
    :return: The DataFrame including the new column.
    """
    frame = pd.read_csv(input_csv)
    # remove_stopwords returns non-string values unchanged, so no extra
    # type guard is needed here. progress_apply shows a tqdm progress bar.
    frame[target_column] = frame[text_column].progress_apply(
        lambda comment: remove_stopwords(comment, stop_words)
    )
    frame.to_csv(output_csv, index=False)
    return frame


# Register `progress_apply` on pandas objects; once per session is enough.
tqdm.pandas()

# Same processing for both splits, test first and then train; `df` ends up
# holding the train frame, as before.
for source_csv, target_csv in (
    ('../data/naive-bayes-dataset/test.csv', '../data/naive-bayes-dataset/test-nonsw.csv'),
    ('../data/naive-bayes-dataset/train.csv', '../data/naive-bayes-dataset/train-nonsw.csv'),
):
    df = add_nonsw_column(source_csv, target_csv, stop_words)

100%|██████████| 800/800 [00:01<00:00, 570.74it/s]
100%|██████████| 5199/5199 [00:09<00:00, 546.00it/s]


In [14]:
def report_empty_nonsw(csv_path, column='nonsw'):
    """
    Read a CSV and print how many rows have a null value in ``column``.

    Prints the count followed by the list of affected row indexes.

    :param csv_path: Path of the CSV file to inspect.
    :param column: Name of the column checked for nulls.
    :return: Tuple of (the loaded DataFrame, the number of null rows).
    """
    frame = pd.read_csv(csv_path)
    empty_mask = frame[column].isnull()
    empty_count = empty_mask.sum()

    # Number of empty nonsw comments
    print(f"Number of empty nonsw comments: {empty_count}")
    # Row indexes of those comments
    print("Empty nonsw comments row indexes:")
    print(frame[empty_mask].index.tolist())
    return frame, empty_count


# Same check for both saved splits, test first and then train; the
# module-level names end up holding the train values, as before.
for nonsw_csv in (
    '../data/naive-bayes-dataset/test-nonsw.csv',
    '../data/naive-bayes-dataset/train-nonsw.csv',
):
    nonswdf, empty_nonsw_count = report_empty_nonsw(nonsw_csv)

Number of empty nonsw comments: 0
Empty nonsw comments row indexes:
[]
Number of empty nonsw comments: 0
Empty nonsw comments row indexes:
[]
