In [1]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [2]:
import pandas as pd
import re, ast, string, requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## **Load Data**

In [3]:
df = pd.read_csv('stress_dataset.csv')

In [4]:
df.head()

Unnamed: 0,Text,Label
0,"Barang sudah diterima nih kak, makasih yaa",Positive
1,"Gampang dibawa-bawa, terlalu imut ukurannya",Positive
2,LANGGANAN ??????????,Positive
3,"bagus, pengiriman cepet banget bakal jadi lang...",Positive
4,Kartu bekerja dengan baik begitupun sellernya ...,Positive


In [None]:
df.info()

print("\nMissing values:\n", df.isnull().sum())

print("\nData Duplikat:\n", df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11795 entries, 0 to 11794
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    11795 non-null  object
 1   Label   11795 non-null  object
dtypes: object(2)
memory usage: 184.4+ KB

Missing values:
 Text     0
Label    0
dtype: int64

Data Duplikat:
 0


**Note:** Data menjadi 11795 karena ada duplikat setelah menyimpan data (dari balancing label data)

##**Preprocessing Text**

In [6]:
def load_slangwords_from_file(filepath):
    slangwords = {}
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                slang, formal = parts
                slangwords[slang.lower()] = formal.lower()
    return slangwords

slangwords = load_slangwords_from_file('kbba.txt')

In [7]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

ind_stopwords = set(stopwords.words('indonesian'))
eng_stopwords = set(stopwords.words('english'))

def load_stopwords_from_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return set(response.text.strip().splitlines())
        else:
            print(f"Gagal mengakses URL: {url}")
            return set()
    except Exception as e:
        print(f"Error saat mengakses URL stopword: {e}")
        return set()

def load_emotion_words(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
            emotion_words = ast.literal_eval(content)
            if isinstance(emotion_words, set):
                return emotion_words
            else:
                raise ValueError("Isi file bukan set Python.")
    except Exception as e:
        print(f"Gagal memuat emotion words: {e}")
        return set()

extra_stopwords_url = 'https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/refs/heads/master/combined_stop_words.txt'

extra_stopwords = load_stopwords_from_url(extra_stopwords_url)
emotion_words = load_emotion_words('kbbe.txt')

custom_stopwords = (ind_stopwords | eng_stopwords | extra_stopwords) - emotion_words

In [8]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ''

    # Cleaning
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # mention
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # hashtag
    text = re.sub(r'RT[\s]+', '', text)         # RT
    text = re.sub(r"http\S+", '', text)         # links
    text = re.sub(r'[0-9]+', '', text)          # angka
    text = re.sub(r'[^\w\s]', '', text)         # karakter khusus
    text = text.replace('\n', ' ').strip()

    # Case folding
    text = text.lower()

    # Slangword normalization
    words = text.split()
    fixed_words = [slangwords.get(word, word) for word in words]
    text = ' '.join(fixed_words)

    # Tokenizing
    tokens = word_tokenize(text)

    # Stopword filtering
    filtered_tokens = [word for word in tokens if word not in custom_stopwords and len(word) > 2]

    # Stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Rejoin
    final_text = ' '.join(stemmed_tokens)

    return final_text

In [9]:
df['text_stemmed'] = df['Text'].apply(preprocess_text)

In [10]:
df.head()

Unnamed: 0,Text,Label,text_stemmed
0,"Barang sudah diterima nih kak, makasih yaa",Positive,barang terima kak terima kasih yaa
1,"Gampang dibawa-bawa, terlalu imut ukurannya",Positive,gampang dibawabawa imut ukur
2,LANGGANAN ??????????,Positive,langgan
3,"bagus, pengiriman cepet banget bakal jadi lang...",Positive,bagus kirim cepat banget langgan
4,Kartu bekerja dengan baik begitupun sellernya ...,Positive,kartu sellernya sigap


In [None]:
df.info()

print("\nMissing values:\n", df.isnull().sum())

print("\nData Duplikat:\n", df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11795 entries, 0 to 11794
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          11795 non-null  object
 1   Label         11795 non-null  object
 2   text_stemmed  11795 non-null  object
dtypes: object(3)
memory usage: 276.6+ KB

Missing values:
 Text            0
Label           0
text_stemmed    0
dtype: int64

Data Duplikat:
 0


In [12]:
df.to_csv('final_stress_dataset.csv', index=False)