In [1]:
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from indoNLP.preprocessing import pipeline, remove_stopwords, replace_slang, replace_word_elongation

# indoNLP preprocessing pipeline built from three steps: slang replacement,
# word-elongation replacement and stop word removal.
# NOTE(review): presumably the steps are applied in the listed order — confirm
# against the indoNLP documentation.
nlppipe = pipeline(
    [
        replace_slang,
        replace_word_elongation,
        remove_stopwords,
    ]
)

# Strip stop words from a text with the supplied remover
def remove_stop_words(text, stopword_remover):
    """Return ``text`` after passing it through ``stopword_remover.remove``.

    Args:
        text: The text to filter.
        stopword_remover: Any object exposing a ``remove(text)`` method.

    Returns:
        Whatever ``stopword_remover.remove(text)`` returns.
    """
    filtered = stopword_remover.remove(text)
    return filtered

# Reduce the words of a text to their stems with the supplied stemmer
def stem_text(text, stemmer):
    """Return ``text`` after passing it through ``stemmer.stem``.

    Args:
        text: The text to stem.
        stemmer: Any object exposing a ``stem(text)`` method.

    Returns:
        Whatever ``stemmer.stem(text)`` returns.
    """
    stemmed = stemmer.stem(text)
    return stemmed

# Strip punctuation, emojis and other symbols from a text
def remove_noise(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters (letters, digits and the underscore) and whitespace are
    kept; everything else is removed without inserting a replacement, so
    the text on either side of a removed character is joined directly.

    Args:
        text: The text to clean.

    Returns:
        The text without the removed characters.
    """
    noise_pattern = re.compile(r'[^\w\s]')  # anything outside \w and \s
    # Add other noise removal processes here if needed
    return noise_pattern.sub('', text)

# Convert a text to lower case
def lower_case(text):
    """Return the lower-cased version of ``text`` (via ``text.lower()``).

    Args:
        text: The text to convert.

    Returns:
        The lower-cased text.
    """
    lowered = text.lower()
    return lowered

# Collapse runs of spaces into a single space
def remove_extra_spaces(text):
    """Replace every run of one or more space characters with one space.

    Only the space character is affected; tabs and newlines are left as they
    are, and leading/trailing spaces are reduced to one space, not stripped.

    Args:
        text: The text to normalize.

    Returns:
        The text with consecutive spaces collapsed.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

# Shorten runs of a repeated character
# Any character occurring more than 2 times in a row is cut down to 2 occurrences
def delete_repeated_characters(text):
    """Limit every run of the same character to at most two occurrences.

    Runs of exactly two characters are left unchanged. Newline characters are
    not matched by the pattern, so runs of newlines are not shortened.

    Args:
        text: The text to normalize.

    Returns:
        The text with long character runs shortened to two characters.
    """
    repeated_run = re.compile(r'(.)\1+')  # a character followed by copies of itself
    return repeated_run.sub(r'\1\1', text)

# Function to anonymize PII
def anonymize_pii(text):
    """Replace known personal names in ``text`` with neutral role labels.

    Matching is case-insensitive, so the names are masked whether or not the
    text was lower-cased beforehand. Longer names are replaced first, so a
    full name is handled before any shorter name it contains.

    Args:
        text: The text to anonymize.

    Returns:
        The text with every occurrence of a known name replaced by its label.
    """
    # Known names (keys) and the role labels (values) that replace them
    replacements = {
        "rengga": "assistant",
        "yerry soepriyanto": "lecturer",
        "yerry": "lecturer",
    }

    # Longest name first: "yerry soepriyanto" becomes a single "lecturer"
    # instead of "lecturer soepriyanto", independent of the dictionary order.
    for name in sorted(replacements, key=len, reverse=True):
        # A callable replacement inserts the label literally (no backslash or
        # group-reference processing by re.sub).
        text = re.sub(
            re.escape(name),
            lambda _match, label=replacements[name]: label,
            text,
            flags=re.IGNORECASE,
        )

    # NOTE(review): names are matched as plain substrings, so a name embedded
    # in a longer word is replaced as well — confirm this is intended.
    # Add other PII anonymization processes here if needed
    return text

# Sastrawi helpers for Indonesian text: a stemmer and a stop word remover,
# each created once through its factory
stemmer = StemmerFactory().create_stemmer()
stopword_remover = StopWordRemoverFactory().create_stop_word_remover()

# Clean a single survey message
def preprocess_message(message, stopword_remover, stemmer):
    """Run one message through the active cleaning steps.

    Steps, in order: noise removal, lower-casing, the ``nlppipe`` pipeline,
    collapsing of repeated spaces, and PII anonymization.

    Args:
        message: The message text to clean.
        stopword_remover: Not used by the currently active steps.
        stemmer: Only needed by the stemming step, which is disabled.

    Returns:
        The cleaned message.
    """
    # Disabled steps, intentionally left out of the sequence below:
    #   - delete_repeated_characters(message)
    #   - stem_text(message, stemmer)
    #     REASON: the stemming results obscure the real meaning too much
    active_steps = (
        remove_noise,
        lower_case,
        nlppipe,
        remove_extra_spaces,
        anonymize_pii,
    )
    for step in active_steps:
        message = step(message)
    return message

In [2]:
# Read the raw survey export
file_path = './data/survey_raw.csv'  # Replace with your file path
# Cast the 'message' column to str so every entry can be handled as text
# NOTE(review): depending on the pandas version, missing messages may turn into
# the literal text 'nan' here — confirm that is acceptable.
survey_data = pd.read_csv(file_path).assign(
    message=lambda frame: frame['message'].astype(str)
)
# Preview the first 5 rows of the dataset
survey_data.head()

Unnamed: 0,id,created_at,message,user_id,topic_id
0,d616e143-aa57-404e-ad12-ee788062c8a5,2023-10-24 06:48:04.698104+00,Sedikit cerita mengenai apa yang saya rasakan ...,6d1677af-d455-4691-98c6-6458766c0a9a,4a20c13b-37bf-4f71-80fd-1562ade45678
1,ef462e6b-626f-4871-94cc-807906e32938,2023-10-24 09:14:45.674619+00,"Secara umum, saya sangat senang mempelajari ma...",31ef148a-fcd4-4d3e-8b2f-6d31f6191d8f,4a20c13b-37bf-4f71-80fd-1562adead5c5
2,e703d136-3296-45b7-b12c-cfb93e892a43,2023-10-24 09:16:01.029697+00,"Secara umum, saya sangat senang mempelajari ma...",31ef148a-fcd4-4d3e-8b2f-6d31f6191d8f,4a20c13b-37bf-4f71-80fd-1562ade45678
3,f4044fa4-a245-42cb-95fd-bf38bf042919,2023-10-24 12:19:43.356449+00,"Dalam mempelajari materi ini terlihat seru, me...",b8817bf3-79af-480c-8e28-a60c8f6fd890,4a20c13b-37bf-4f71-80fd-1562ade45678
4,c62c0a22-65fa-4f5e-8dab-b8cf0bd4ac3d,2023-10-24 13:28:20.087846+00,"Pusing belajar blender, Ya Allah ... bismillah...",3b420f30-3a87-4b7c-9ec3-80353693b7ba,4a20c13b-37bf-4f71-80fd-1562adead5c5


In [3]:
# Clean every message and store the result in a new 'processed_message' column
survey_data['processed_message'] = survey_data['message'].map(
    lambda raw_message: preprocess_message(raw_message, stopword_remover, stemmer)
)
# Preview the cleaned text
survey_data[['processed_message']].head()

Unnamed: 0,processed_message
0,cerita rasakan tugas dimensi pesimis tugas san...
1,senang mempelajari materi 3d materi menarik be...
2,senang mempelajari materi 3d materi menarik be...
3,mempelajari materi seru diawal kesusahan basic...
4,pusing belajar blender ya allah bismillah lanc...


In [4]:
# user_id identifies the user who sent the message.
# Each distinct user_id is replaced by a running number, and the
# original -> number mapping is kept in a dictionary.
# Numbers start at 1, follow the order in which users first appear, and are
# zero-padded to at least three digits ('001', '002', ...).
user_id_mapping = {
    original_id: f'{position:03}'
    for position, original_id in enumerate(survey_data['user_id'].unique(), start=1)
}

# Swap each original user_id for its anonymized number
survey_data['user_id'] = survey_data['user_id'].map(user_id_mapping.__getitem__)

# Show the first 5 rows of the processed data
survey_data.head()

Unnamed: 0,id,created_at,message,user_id,topic_id,processed_message
0,d616e143-aa57-404e-ad12-ee788062c8a5,2023-10-24 06:48:04.698104+00,Sedikit cerita mengenai apa yang saya rasakan ...,1,4a20c13b-37bf-4f71-80fd-1562ade45678,cerita rasakan tugas dimensi pesimis tugas san...
1,ef462e6b-626f-4871-94cc-807906e32938,2023-10-24 09:14:45.674619+00,"Secara umum, saya sangat senang mempelajari ma...",2,4a20c13b-37bf-4f71-80fd-1562adead5c5,senang mempelajari materi 3d materi menarik be...
2,e703d136-3296-45b7-b12c-cfb93e892a43,2023-10-24 09:16:01.029697+00,"Secara umum, saya sangat senang mempelajari ma...",2,4a20c13b-37bf-4f71-80fd-1562ade45678,senang mempelajari materi 3d materi menarik be...
3,f4044fa4-a245-42cb-95fd-bf38bf042919,2023-10-24 12:19:43.356449+00,"Dalam mempelajari materi ini terlihat seru, me...",3,4a20c13b-37bf-4f71-80fd-1562ade45678,mempelajari materi seru diawal kesusahan basic...
4,c62c0a22-65fa-4f5e-8dab-b8cf0bd4ac3d,2023-10-24 13:28:20.087846+00,"Pusing belajar blender, Ya Allah ... bismillah...",4,4a20c13b-37bf-4f71-80fd-1562adead5c5,pusing belajar blender ya allah bismillah lanc...


In [5]:
# Replace each topic_id with the matching topic name from topic.csv.
# The id -> name lookup is built directly from the 'id' and 'topic_name'
# columns, so two topics that share a name cannot drop an id from the lookup.
topic_data = (
    pd.read_csv('./data/topic.csv')
    .set_index('id')['topic_name']
    .to_dict()
)
# Indexing the dict raises KeyError for a topic_id that is missing from
# topic.csv, instead of silently writing a missing value.
survey_data['topic_id'] = survey_data['topic_id'].map(lambda topic_id: topic_data[topic_id])

# Reduce created_at to day precision by keeping the text before the first
# space (e.g. '2023-10-24 06:48:04.698104+00' -> '2023-10-24')
survey_data['created_at'] = survey_data['created_at'].str.split(' ').str[0]

In [6]:
# Remove the raw 'message' column and the 'id' column, then keep only the rows
# whose processed_message is at least 70 characters long.
# NOTE(review): the previous comment mentioned a cutoff of 60 while the code
# uses 70 — confirm which threshold is intended.
survey_data = (
    survey_data
    .drop(columns=['message', 'id'])
    .loc[lambda frame: frame['processed_message'].map(len) >= 70]
)

# Write the cleaned data to a new CSV file (without the index column)
survey_data.to_csv('./data/survey_clean.csv', index=False)