In [1]:
import pandas as pd
import os
import re
import logging
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from indoNLP.preprocessing import pipeline, remove_stopwords, replace_slang, replace_word_elongation

# NLP Pipeline
# Bundles three indoNLP preprocessing callables into a single callable;
# preprocess_message invokes it as nlppipe(message).
# NOTE(review): presumably the steps run in the listed order (slang
# replacement, word-elongation replacement, stop-word removal) — confirm
# against the indoNLP documentation.
nlppipe = pipeline([replace_slang, replace_word_elongation, remove_stopwords])

# Set up logging
# The root logger defaults to WARNING; INFO is needed so that the
# logging.info progress messages emitted by the helpers are shown.
logging.basicConfig(level=logging.INFO)

In [2]:
# Function to remove stop words
def remove_stop_words(text, stopword_remover):
    """Strip stop words from ``text`` by delegating to ``stopword_remover``.

    Args:
        text: Value handed unchanged to ``stopword_remover.remove``.
        stopword_remover: Object exposing a ``remove(text)`` method.

    Returns:
        The result of ``stopword_remover.remove(text)``, or ``None`` when
        any exception is raised (the error is logged at ERROR level).
    """
    try:
        cleaned = stopword_remover.remove(text)
        logging.info("Stop words removed successfully.")
        return cleaned
    except Exception as err:
        logging.error(f"An error occurred during stop word removal: {str(err)}")
        return None

# Function to perform stemming
def stem_text(text, stemmer):
    """Stem ``text`` by delegating to ``stemmer``.

    Args:
        text: Value handed unchanged to ``stemmer.stem``.
        stemmer: Object exposing a ``stem(text)`` method.

    Returns:
        The result of ``stemmer.stem(text)``, or ``None`` when any
        exception is raised (the error is logged at ERROR level).
    """
    try:
        stemmed = stemmer.stem(text)
        logging.info("Text stemmed successfully.")
        return stemmed
    except Exception as err:
        logging.error(f"An error occurred during stemming: {str(err)}")
        return None

# Function to remove noise
def remove_noise(text):
    """Delete every character that is neither a regex word character nor whitespace.

    Letters, digits and underscores are kept; punctuation, symbols and
    emojis are removed without inserting a replacement.

    Args:
        text: String to clean.

    Returns:
        The cleaned string, or ``None`` when the substitution raises
        (for example on non-string input); the error is logged.
    """
    try:
        stripped = re.sub(r'[^\w\s]', '', text)
        logging.info("Noise removed successfully.")
        return stripped
    except Exception as err:
        logging.error(f"An error occurred during noise removal: {str(err)}")
        return None

# Function to lower case
def lower_case(text):
    """Return ``text`` converted to lower case.

    Args:
        text: Object exposing a ``lower()`` method (normally a ``str``).

    Returns:
        ``text.lower()``, or ``None`` when the call raises (for example
        when ``text`` has no ``lower`` method); the error is logged.
    """
    try:
        lowered = text.lower()
        logging.info("Text converted to lower case successfully.")
        return lowered
    except Exception as err:
        logging.error(f"An error occurred during lower case conversion: {str(err)}")
        return None

# Function to remove extra spaces
def remove_extra_spaces(text):
    """Collapse each run of space characters in ``text`` into a single space.

    Only the space character is affected; tabs and newlines are left
    alone, and leading/trailing spaces are reduced to one rather than
    stripped.

    Args:
        text: String to normalise.

    Returns:
        The normalised string, or ``None`` when the substitution raises
        (for example on non-string input); the error is logged.
    """
    try:
        collapsed = re.sub(' +', ' ', text)
        logging.info("Extra spaces removed successfully.")
        return collapsed
    except Exception as err:
        logging.error(f"An error occurred during extra space removal: {str(err)}")
        return None

# Function to delete repeated characters
# Collapse any non-digit character that occurs 2 or more times in a row down
# to exactly 2 occurrences (so runs longer than 2 are shortened).
def delete_repeated_characters(text):
    """Shorten runs of a repeated character to two occurrences.

    Digits are deliberately excluded: collapsing them would change numeric
    values (for example "1000" would become "100"). Newlines are never
    matched, so line breaks are left untouched.

    Args:
        text: String to normalise.

    Returns:
        The normalised string, or ``None`` when the substitution raises
        (for example on non-string input); the error is logged.
    """
    try:
        # [^\d\n] = any character except a digit or a newline; \1+ requires
        # at least one immediate repetition of that same character.
        text = re.sub(r'([^\d\n])\1+', r'\1\1', text)
        logging.info("Repeated characters deleted successfully.")
    except Exception as e:
        logging.error(f"An error occurred during repeated character deletion: {str(e)}")
        return None
    return text

# Function to anonymize PII
# Known names (lower-case) mapped to the neutral role that replaces them.
_PII_REPLACEMENTS = {
    "rengga": "assistant",
    "yerry soepriyanto": "lecturer",
    "yerry": "lecturer",
}


def _build_pii_pattern(names):
    """Compile one case-insensitive alternation over ``names``, longest name first."""
    # Longest first so a full name wins over the first name it starts with.
    ordered = sorted(names, key=len, reverse=True)
    # Allow any whitespace run between the parts of a multi-word name so
    # "yerry  soepriyanto" does not leave the surname behind.
    alternatives = [
        r"\s+".join(re.escape(part) for part in name.split())
        for name in ordered
    ]
    return re.compile("|".join(alternatives), flags=re.IGNORECASE)


_PII_PATTERN = _build_pii_pattern(_PII_REPLACEMENTS)


def anonymize_pii(text):
    """Replace known personal names in ``text`` with neutral role labels.

    Matching is case-insensitive and tolerates extra whitespace inside a
    multi-word name. Matching is by substring (no word boundaries), as
    before, so a name with an attached suffix is still anonymized.

    Args:
        text: String to anonymize.

    Returns:
        The anonymized string, or ``None`` when the substitution raises
        (for example on non-string input); the error is logged.
    """
    def _role_for(match):
        # Normalise case and inner whitespace to recover the dictionary key.
        key = " ".join(match.group(0).lower().split())
        return _PII_REPLACEMENTS[key]

    try:
        text = _PII_PATTERN.sub(_role_for, text)
        logging.info("Anonymization completed successfully.")
    except Exception as e:
        logging.error(f"An error occurred during anonymization: {str(e)}")
        return None

    # Add other PII anonymization processes here if needed
    return text

# Create a stop word remover and stemmer for Indonesian language
# Both objects are built once at module level and passed into
# preprocess_message by the caller.
# NOTE(review): preprocess_message accepts `stopword_remover` and `stemmer`
# but does not call either of them; stop-word removal appears to be handled
# by nlppipe instead — confirm whether `stopword_remover` is still needed.
stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

# Function to preprocess a message
def preprocess_message(message, stopword_remover, stemmer):
    """Run one raw message through the cleaning steps, in order.

    Steps: remove_noise -> lower_case -> delete_repeated_characters ->
    anonymize_pii -> nlppipe -> remove_extra_spaces.

    Args:
        message: Raw message text.
        stopword_remover: Accepted for interface compatibility; not used by
            this function.
        stemmer: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The cleaned message, or ``None`` when any step raises or returns
        ``None``. Failures are reported through ``logging`` like the rest
        of the helpers.
    """
    steps = (
        remove_noise,
        lower_case,
        delete_repeated_characters,
        anonymize_pii,
        nlppipe,
        remove_extra_spaces,
    )
    try:
        for step in steps:
            message = step(message)
            if message is None:
                # The helpers signal failure by returning None after logging
                # the cause; stop here instead of feeding None into the
                # remaining steps and producing a cascade of follow-up errors.
                step_name = getattr(step, "__name__", repr(step))
                logging.error(f"Preprocessing aborted: step '{step_name}' returned None.")
                return None
    except Exception as e:
        logging.error(f"Error occurred during preprocessing: {str(e)}")
        return None
    return message

In [3]:
# Load the survey CSV and coerce the 'message' column to string type.
# If loading works, show the head of the data.
file_path = './data/survey_raw.csv'

if not os.path.exists(file_path):
    # Nothing to read: report the missing path and leave survey_data unset.
    print(f"Error: The file '{file_path}' does not exist.")
else:
    try:
        survey_data = pd.read_csv(file_path)

        # The rest of the notebook works on 'message'; report when it is absent.
        if 'message' not in survey_data.columns:
            print("Error: 'message' column not found in the data.")
        else:
            survey_data['message'] = survey_data['message'].astype(str)

        # Preview the first few rows (printed in both branches above).
        print(survey_data.head())
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: The file could not be parsed as a CSV.")
    except Exception as err:
        print(f"Error: An unexpected error occurred. {str(err)}")

   created_at                                            message  user_id  \
0  2023-10-24  Sedikit cerita mengenai apa yang saya rasakan ...        1   
1  2023-10-24  Secara umum, saya sangat senang mempelajari ma...        2   
2  2023-10-24  Secara umum, saya sangat senang mempelajari ma...        2   
3  2023-10-24  Dalam mempelajari materi ini terlihat seru, me...        3   
4  2023-10-24  Pusing belajar blender, Ya Allah ... bismillah...        4   

   topic_id  
0         0  
1         1  
2         0  
3         0  
4         1  


In [4]:
import sys

# Minimum number of characters a processed message must have to be kept.
MIN_PROCESSED_LENGTH = 70

try:
    # Apply preprocessing
    logging.info("Applying preprocessing to 'message' column...")
    survey_data["processed_message"] = survey_data["message"].apply(
        lambda x: preprocess_message(x, stopword_remover, stemmer)
    )
except Exception as e:
    logging.error(f"An error occurred during preprocessing: {str(e)}")
    sys.exit(1)

try:
    # Check if 'message' column exists in the DataFrame
    if "message" in survey_data.columns:
        # Drop the raw 'message' column; only the processed text is kept.
        logging.info("Dropping 'message' column...")
        survey_data = survey_data.drop(columns=["message"])
    else:
        logging.warning("'message' column not found in the DataFrame.")
except Exception as e:
    logging.error(f"An error occurred while dropping 'message' column: {str(e)}")
    sys.exit(1)

try:
    # Keep only rows whose processed_message has at least
    # MIN_PROCESSED_LENGTH characters; shorter rows are discarded.
    # .str.len() yields NaN for missing values (preprocess_message returns
    # None on failure) and NaN >= n is False, so failed rows are dropped
    # instead of len(None) raising TypeError and aborting the run.
    logging.info("Filtering rows based on 'processed_message' length...")
    survey_data = survey_data[
        survey_data["processed_message"].str.len() >= MIN_PROCESSED_LENGTH
    ]
except Exception as e:
    logging.error(f"An error occurred during filtering: {str(e)}")
    sys.exit(1)

try:
    # Remove rows whose processed text is identical to an earlier row.
    survey_data = survey_data.drop_duplicates(subset=["processed_message"])
except Exception as e:
    logging.error(f"An error occurred while dropping duplicates: {str(e)}")
    sys.exit(1)

try:
    # Save the processed data to new csv file
    logging.info("Saving the processed data to a new CSV file...")
    survey_data.to_csv("./data/survey_clean.csv", index=False)
    logging.info("Data saved successfully.")
except Exception as e:
    logging.error(f"An error occurred while saving data: {str(e)}")
    sys.exit(1)

INFO:root:Applying preprocessing to 'message' column...
INFO:root:Noise removed successfully.
INFO:root:Text converted to lower case successfully.
INFO:root:Repeated characters deleted successfully.
INFO:root:Anonymization completed successfully.
INFO:root:Extra spaces removed successfully.
INFO:root:Noise removed successfully.
INFO:root:Text converted to lower case successfully.
INFO:root:Repeated characters deleted successfully.
INFO:root:Anonymization completed successfully.
INFO:root:Extra spaces removed successfully.
INFO:root:Noise removed successfully.
INFO:root:Text converted to lower case successfully.
INFO:root:Repeated characters deleted successfully.
INFO:root:Anonymization completed successfully.
INFO:root:Extra spaces removed successfully.
INFO:root:Noise removed successfully.
INFO:root:Text converted to lower case successfully.
INFO:root:Repeated characters deleted successfully.
INFO:root:Anonymization completed successfully.
INFO:root:Extra spaces removed successfully.


In [5]:
# Stem every processed message into a new 'stemmed_message' column, drop the
# intermediate 'processed_message' column, and write the result to a new CSV.
try:
    logging.info("Applying stemming to 'processed_message' column...")
    survey_data["stemmed_message"] = survey_data["processed_message"].apply(
        lambda processed: stem_text(processed, stemmer)
    )
except Exception as err:
    logging.error(f"An error occurred during stemming: {str(err)}")
    sys.exit(1)

# The stemmed text supersedes the processed text, so drop the latter.
try:
    logging.info("Dropping 'processed_message' column...")
    survey_data = survey_data.drop(columns=["processed_message"])
except Exception as err:
    logging.error(
        f"An error occurred while dropping 'processed_message' column: {str(err)}"
    )
    sys.exit(1)

# Persist the stemmed data set.
try:
    logging.info("Saving the processed data to a new CSV file...")
    survey_data.to_csv("./data/survey_clean_stemmed.csv", index=False)
    logging.info("Data saved successfully.")
except Exception as err:
    logging.error(f"An error occurred while saving data: {str(err)}")
    sys.exit(1)


# Add a 'char_count' column holding the length of each stemmed message.
# This happens after the CSV is written, so the column is not in that file.
try:
    logging.info("Counting the total characters in the stemmed messages...")
    survey_data["char_count"] = survey_data["stemmed_message"].apply(len)
except Exception as err:
    logging.error(f"An error occurred while counting characters: {str(err)}")
    sys.exit(1)


INFO:root:Applying stemming to 'processed_message' column...
INFO:root:Text stemmed successfully.


INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
INFO:root:Text stemmed successfully.
I

In [7]:
# Add up the per-message character counts into one total and log it.
try:
    logging.info("Summing the char counts...")
    total_chars = survey_data["char_count"].sum()
    logging.info(f"Total characters: {total_chars}")
except Exception as err:
    # Any failure (e.g. the column is missing) is logged and ends the cell.
    logging.error(f"An error occurred while summing char counts: {str(err)}")
    sys.exit(1)


INFO:root:Summing the char counts...
INFO:root:Total characters: 68326
