In [None]:
# Mount Google Drive under /content/drive (google.colab is only available on Colab).
# NOTE(review): no cell visible here reads from the mounted drive — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U gdown



In [None]:
import pandas as pd
import gdown

# Google Drive file id of the dataset and the local filename to save it under.
file_id = '1-C5WENitBqkZyM0OWlS-XyeVhqepbYpt'
output_filename = 'dataset_clean.csv'

try:
    downloaded_path = gdown.download(id=file_id, output=output_filename, quiet=False)
    # NOTE(review): older gdown releases appear to signal failure by returning
    # None instead of raising; check explicitly so success is never reported falsely.
    if downloaded_path is None:
        raise RuntimeError(f"gdown tidak mengembalikan file untuk id {file_id}")
    print(f"File {output_filename} berhasil diunduh!")

    df = pd.read_csv(output_filename)
    print(df.head())

except Exception as e:
    print(f"Gagal mengunduh atau membaca file: {e}")
    # Re-raise after reporting: later cells use `df`, and continuing without it
    # would only surface as an unrelated NameError further down.
    raise


Downloading...
From: https://drive.google.com/uc?id=1-C5WENitBqkZyM0OWlS-XyeVhqepbYpt
To: /content/dataset_clean.csv
100%|██████████| 3.61M/3.61M [00:00<00:00, 293MB/s]

File dataset_clean.csv berhasil diunduh!
                                               Tweet     Label
0  Kecemasan saya memberitahu saya untuk tidak ju...  Negative
1  Khawatir saya menderita kanker ovarium. Semaki...  Negative
2  Untuk Penderita HA yang memiliki kecemasan yan...  Negative
3  Eye floaters karena stres? Halo, Saya baru-bar...  Negative
4  Ada rasa cemas berlebihan saat ini, .Dan, masi...  Negative





In [None]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10406 entries, 0 to 10405
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   10406 non-null  object
 1   Label   10406 non-null  object
dtypes: object(2)
memory usage: 162.7+ KB


In [None]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


## **Preprocessing Text**

In [None]:
import re, ast, string, requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
# Fetch the NLTK data used below: the punkt / punkt_tab tokenizer packages
# and the stopwords corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

#### **Load Slangwords**

In [None]:
def load_slangwords_from_file(filepath):
    """Load a slang-to-formal word mapping from a tab-separated text file.

    A line is used only when it contains exactly two tab-separated fields:
    the slang form followed by its formal replacement. Every other line
    (blank, no tab, more than one tab) is skipped.

    Args:
        filepath: Path to the UTF-8 encoded dictionary file (with or without BOM).

    Returns:
        dict mapping each lower-cased, whitespace-stripped slang word to its
        lower-cased, whitespace-stripped formal form. If a slang word appears
        more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    slangwords = {}
    # 'utf-8-sig' reads plain UTF-8 identically but drops a leading BOM, which
    # would otherwise become part of the first key.
    with open(filepath, encoding='utf-8-sig') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue
            # Strip each field: stray spaces around the tab would produce keys
            # such as "gk " that can never equal a whitespace-split token.
            slang, formal = (part.strip().lower() for part in parts)
            slangwords[slang] = formal
    return slangwords

# Module-level slang dictionary used by preprocess_text.
# NOTE(review): 'kbba.txt' is read from the working directory, and no cell
# visible here creates or downloads it — confirm where it comes from.
slangwords = load_slangwords_from_file('kbba.txt')

#### **Inisialisasi Stemmer & Stopwords**

In [None]:
# Build the Sastrawi stemmer once; preprocess_text reuses it for every token.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# NLTK stopword lists for Indonesian and English, held as sets.
ind_stopwords = set(stopwords.words('indonesian'))
eng_stopwords = set(stopwords.words('english'))

def load_stopwords_from_url(url, timeout=10):
    """Download a newline-separated stopword list and return it as a set.

    Args:
        url: Address of a plain-text file with one stopword per line.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Set of stopwords with surrounding whitespace removed and blank lines
        dropped. An empty set is returned (after printing a message) when the
        response status is not 200 or the request raises.
    """
    try:
        # requests applies no timeout by default, so an unresponsive host
        # would block this cell indefinitely.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            # Strip every entry: trailing spaces or '\r' would keep a stopword
            # from matching a token, and blank lines would add '' to the set.
            stripped = (line.strip() for line in response.text.splitlines())
            return {word for word in stripped if word}
        else:
            print(f"Gagal mengakses URL: {url}")
            return set()
    except Exception as e:
        print(f"Error saat mengakses URL stopword: {e}")
        return set()

def load_emotion_words(filepath):
    """Read a Python set literal of emotion words from a text file.

    The file content is parsed with ast.literal_eval and must evaluate to a
    set. On any failure (file cannot be read, content is not a valid literal,
    or the literal is not a set) a message is printed and an empty set is
    returned.

    Args:
        filepath: Path to the UTF-8 encoded file holding the set literal.

    Returns:
        The parsed set, or an empty set on failure.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            parsed = ast.literal_eval(handle.read())
        # Anything other than a set is reported through the same failure path.
        if not isinstance(parsed, set):
            raise ValueError("Isi file bukan set Python.")
    except Exception as e:
        print(f"Gagal memuat emotion words: {e}")
        return set()
    return parsed

# Additional stopword list fetched over HTTP at run time.
extra_stopwords_url = 'https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/refs/heads/master/combined_stop_words.txt'

# Both loaders return an empty set on failure, so a failed load only shows up
# as a printed message.
extra_stopwords = load_stopwords_from_url(extra_stopwords_url)
emotion_words = load_emotion_words('kbbe.txt')

# Union of all stopword sources, minus the emotion words, so that emotion
# words are not removed by the stopword filter in preprocess_text.
custom_stopwords = (ind_stopwords | eng_stopwords | extra_stopwords) - emotion_words

#### **Fungsi Preprocessing**

In [None]:
def preprocess_text(text):
    """Clean, normalize, tokenize, filter and stem a single piece of text.

    Pipeline: strip mentions, hashtags, retweet markers, links and digits;
    replace punctuation with spaces; lower-case; map slang words through the
    module-level `slangwords` dict; tokenize with NLTK; drop tokens found in
    the module-level `custom_stopwords`; stem the rest with the module-level
    `stemmer`.

    Args:
        text: Raw text. Non-string values (e.g. NaN from an empty CSV cell)
            are treated as empty text.

    Returns:
        The stemmed tokens joined by single spaces ('' when nothing remains).
    """
    if not isinstance(text, str):
        return ''

    # Cleaning
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # hashtags
    # \b restricts the match to a standalone "RT" marker; without it the
    # pattern also eats the tail of any word ending in "RT".
    text = re.sub(r'\bRT\s+', '', text)         # retweet marker
    text = re.sub(r"http\S+", '', text)         # links
    text = re.sub(r'[0-9]+', '', text)          # digits
    # Replace punctuation with a space instead of deleting it; deleting glues
    # neighbouring words together (e.g. "baru-baru" became "barubaru").
    text = re.sub(r'[^\w\s]', ' ', text)        # special characters
    text = text.replace('\n', ' ').strip()

    # Case folding
    text = text.lower()

    # Slangword normalization (split() also collapses repeated whitespace)
    words = text.split()
    fixed_words = [slangwords.get(word, word) for word in words]
    text = ' '.join(fixed_words)

    # Tokenizing
    tokens = word_tokenize(text)

    # Stopword filtering
    filtered_tokens = [word for word in tokens if word not in custom_stopwords]

    # Stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Rejoin
    final_text = ' '.join(stemmed_tokens)

    return final_text

In [None]:
# Run the preprocessing pipeline on every tweet and store the result in a new column.
df['text_akhir'] = df['Tweet'].apply(preprocess_text)

In [None]:
# Write the frame, including the new text_akhir column, to CSV without the index.
df.to_csv('hasil_preprocessing.csv', index=False)

In [None]:
# Hand the generated CSV to the google.colab download helper (Colab-only).
from google.colab import files
files.download('hasil_preprocessing.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the first rows to compare the raw Tweet text with text_akhir.
df.head()

Unnamed: 0,Tweet,Label,text_akhir
0,Kecemasan saya memberitahu saya untuk tidak ju...,Negative,cemas memberitahu jujurrentan kencan cemas mem...
1,Khawatir saya menderita kanker ovarium. Semaki...,Negative,khawatir derita kanker ovarium media sosial de...
2,Untuk Penderita HA yang memiliki kecemasan yan...,Negative,derita tertawa milik cemas beda takut beda ter...
3,"Eye floaters karena stres? Halo, Saya baru-bar...",Negative,eye floaters stres halo barubaru periksa mata ...
4,"Ada rasa cemas berlebihan saat ini, .Dan, masi...",Negative,rasa cemas
