# SAMPLE (Scraping Data)

In [None]:
# Do not hard-code the Twitter auth token in the notebook (it would be saved
# and shared together with the file). Read it from the TWITTER_AUTH_TOKEN
# environment variable, or ask for it interactively without echoing it.
import os
from getpass import getpass

twitter_auth_token = os.environ.get('TWITTER_AUTH_TOKEN') or getpass('Twitter auth token: ')

In [None]:
# pandas is used further down to load the crawled CSV
!pip install pandas

# Install Node.js from the NodeSource apt repository (tweet-harvest is built with Node.js and is run through `npx` below)
!sudo apt-get update
!sudo apt-get install -y ca-certificates curl gnupg
# Store the NodeSource signing key in a keyring file; the repository entry below refers to it via `signed-by=`
!sudo mkdir -p /etc/apt/keyrings
!curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg

# Register the NodeSource repository for the major version selected by NODE_MAJOR
!NODE_MAJOR=20 && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" |sudo tee /etc/apt/sources.list.d/nodesource.list

# Refresh the package index so the newly added repository is visible, then install Node.js
!sudo apt-get update
!sudo apt-get install nodejs -y

# Print the installed Node.js version
!node -v

In [None]:
# Crawl Data
# Output file name handed to tweet-harvest via -o; a later cell reads it back from the tweets-data/ folder
filename = 'ppn.csv'
# Search query: the keyword followed by the since:/until:/lang: search operators
search_keyword = 'PPN 12% since:2024-11-1 until:2024-12-31 lang:id'
# Value handed to the -l option
limit = 5000

# IPython substitutes each {name} placeholder with the Python variable of that name before running the command
!npx -y tweet-harvest@2.6.1 -o "{filename}" -s "{search_keyword}" --tab "LATEST" -l {limit} --token {twitter_auth_token}

In [None]:
import pandas as pd

# Location of the crawl output inside the tweets-data/ folder
file_path = f"tweets-data/{filename}"

# Load the comma-separated crawl result into a DataFrame
df = pd.read_csv(file_path, sep=",")

# Render the loaded table
display(df)

In [None]:
# Report how many rows (tweets) were loaded
num_tweets = df.shape[0]
print(f"Jumlah tweet dalam dataframe adalah {num_tweets}.")

In [None]:
# @title Mulai gabung data
# Install the libraries needed by the following cells
# NOTE(review): `sastrawi` and `PySastrawi` both look like distributions of the `Sastrawi`
# module imported in the stemming cell — confirm whether both installs are needed.
!pip install pandas
!pip install sastrawi
!pip install PySastrawi

import numpy as np
import pandas as pd
import datetime

In [None]:
import pandas as pd

# Load data.csv into `ppn` and print its first rows.
# NOTE(review): presumably this is the manually labelled dataset (a 'labeling' column is selected from it later) — confirm its origin.
ppn = pd.read_csv('data.csv')
print(ppn.head())

In [None]:
# Display the loaded DataFrame
ppn

# Explore (Mendefinisikan data)

In [None]:
# Print the DataFrame summary (columns, dtypes and non-null counts)
ppn.info()

# Modify

In [None]:
# @title Eliminasi Atribut
# Build a new DataFrame that keeps only the 'full_text' and 'labeling' columns
kept_columns = ['full_text', 'labeling']
data = ppn[kept_columns]
data.head()

In [None]:
# @title Eliminasi Data Label Netral
# Keep every row whose label is not 'netral', then renumber the index from zero
is_not_neutral = data['labeling'] != 'netral'
data = data[is_not_neutral].reset_index(drop=True)

# Show the filtered data
data

In [None]:
# @title Remove Duplicates
# Drop rows whose 'full_text' repeats an earlier row, keeping the first occurrence; `data` is modified in place
data.drop_duplicates(subset="full_text", keep = 'first', inplace = True)
data

In [None]:
import matplotlib.pyplot as plt

# Count the rows of every sentiment label
label_counts = data['labeling'].value_counts()
labels = label_counts.index
counts = label_counts.values
total_data = counts.sum()
percentages = (counts / counts.sum()) * 100

# Draw the bar chart through the object-oriented Matplotlib interface
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(labels, counts, color=['skyblue', 'salmon'])

# Write the count and percentage just above each bar
for bar, count, percentage in zip(bars, counts, percentages):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 1,
        f'{count} ({percentage:.1f}%)',
        ha='center',
        va='bottom',
        fontsize=12,
    )

# Title and axis labels
ax.set_title('Jumlah Data per Label Sentimen', fontsize=14)
ax.set_xlabel('Label Sentimen', fontsize=12)
ax.set_ylabel('Jumlah Data', fontsize=12)
ax.set_ylim(0, max(counts) * 1.3)  # leave headroom above the bars for the annotations

# Overall number of rows, placed in figure coordinates
fig.text(0.5, 0.8, f'Total Data: {total_data}', ha='center', fontsize=12)

ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()



In [None]:
# @title Case Folding
# Lower-case every tweet text and preview the original next to the result
data['case_folding'] = data['full_text'].str.lower()
case_folding_preview = data[['full_text', 'case_folding']]
display(case_folding_preview.style.set_sticky())

In [None]:
# @title Cleaning
import string
import re

def cleaning(tweet):
    """Strip noise from a single tweet and return the cleaned text.

    Steps, in order: non-ASCII characters are replaced by '?' (removed later
    together with other punctuation), digits are removed, a leading retweet
    marker is removed, mentions / hashtags / underscore-prefixed fragments and
    URLs are removed, all remaining punctuation (including '%') is removed,
    and whitespace is collapsed and trimmed.

    Args:
        tweet: The tweet text (str). The notebook passes lower-cased text.

    Returns:
        The cleaned text as a str, without leading or trailing whitespace.
    """
    # Replace every non-ASCII character (emoji, accents, ...) with '?'
    tweet = tweet.encode('ascii', 'replace').decode('ascii')
    # Remove digits
    tweet = re.sub(r'[0-9]+', '', tweet)
    # Remove a leading retweet marker; case-insensitive because the input is
    # already lower-cased, so a literal 'RT' would never match
    tweet = re.sub(r'^rt\s+', '', tweet, flags=re.IGNORECASE)
    # Remove mentions, hashtags, underscore-prefixed fragments and links
    tweet = ' '.join(re.sub(r"([@#_][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", tweet).split())
    tweet = re.sub(r'@\S+', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    # Remove any URL that is still left (scheme://host/path)
    tweet = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet)
    # Remove punctuation; note that '%' is removed as well
    tweet = re.sub(r'[^\w\s]+', '', tweet)
    # Collapse whitespace runs and trim both ends
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
# Clean the case-folded text and preview the input next to the result
data['cleaning'] = data['case_folding'].apply(cleaning)
cleaning_preview = data[['case_folding', 'cleaning']]
display(cleaning_preview.style.set_sticky())

In [None]:
# @title Tokenize
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

def word_tokenize_wrapper(text):
    """Return the result of NLTK's `word_tokenize` for `text`."""
    return word_tokenize(text)

# Tokenize the cleaned text and preview the input next to the result
data['tokenize'] = data['cleaning'].apply(word_tokenize_wrapper)
display(data[['cleaning','tokenize']].style.set_sticky())

In [None]:
# @title Normalize
# Load the normalization table from the Excel workbook
normalized_word = pd.read_excel("normalization.xlsx", engine='openpyxl')

# Map each value of the first column to the value of the second column;
# setdefault keeps the first mapping when a key appears in more than one row
normalized_word_dict = {}
for _, entry in normalized_word.iterrows():
    normalized_word_dict.setdefault(entry.iloc[0], entry.iloc[1])

def normalized_term(document):
    """Return the tokens of `document`, replacing each token found in `normalized_word_dict` by its mapped value."""
    return [normalized_word_dict.get(term, term) for term in document]

# Normalize the token lists and preview the input next to the result
data['normalize'] = data['tokenize'].apply(normalized_term)
normalize_preview = data[['tokenize', 'normalize']]
display(normalize_preview.style.set_sticky())

In [None]:
#@title Stopword Removal
import chardet
from nltk.corpus import stopwords
import pandas as pd
import nltk
nltk.download('stopwords')

# Combine NLTK's Indonesian stop-word list with the project-specific additions
custom_stopwords = {'wkwk', 'jadi', 'menjadi', 'dapat', 'mendapat', 'iya'}
stop_words = set(stopwords.words('indonesian')) | custom_stopwords

# Helper that filters stop words out of a token list
def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in `stop_words`, in their original order."""
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept

# Filter the normalized token lists
data['stopword_removal'] = data['normalize'].apply(remove_stopwords)
data


In [None]:
#@title Stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer once and reuse it for every token
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem each token of a list
def stem_words(words):
  """Return the stem of every token in `words`; a token that is not a str becomes ''."""
  return [stemmer.stem(word) if isinstance(word, str) else '' for word in words]

# Stem the stop-word-filtered token lists
data['stemming'] = data['stopword_removal'].apply(stem_words)
data

In [None]:
# Drop the rows whose 'stemming' value is empty.
# `.notna()` alone removes nothing here: the column holds Python lists, and an
# empty list (or a list of empty strings) is not a missing value. Keep a row
# only when its value is a list containing at least one non-empty token.
has_tokens = data['stemming'].apply(lambda tokens: isinstance(tokens, list) and any(tokens))
# `.copy()` detaches the result from the filtered frame so that later column
# assignments on `data` do not raise SettingWithCopyWarning
data = data[has_tokens].copy()

# Show the data after the empty rows were removed
print(data)

In [None]:
# Save the preprocessed data and report the name of the file that was actually
# written (the message used to mention a different file, 'hasil_sentimen.csv')
output_path = 'data_stemming.csv'
data.to_csv(output_path, index=False)

print(f"File berhasil disimpan sebagai '{output_path}'")

File berhasil disimpan sebagai 'hasil_sentimen.csv'


# Model

In [None]:
# @title Visualisasi Frekuensi Data (Wordcloud)
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import re

# Rows labelled 'positif'
positive_data = data[data['labeling'] == 'positif']

# Join the string form of every 'stemming' value of those rows into one text
all_positive_words = ' '.join(positive_data['stemming'].astype(str).tolist())

# Extra clean-up applied to the joined text
def clean_text(text):
    """Drop every character that is not an ASCII letter, digit or space, then collapse runs of spaces."""
    alnum_only = re.sub(r"[^a-zA-Z0-9 ]", '', text)
    return re.sub(' +', ' ', alnum_only)

all_positive_words = clean_text(all_positive_words)

# Word frequencies and the ten most frequent words
word_counts = Counter(all_positive_words.split())
top_10_words = word_counts.most_common(10)

# Table backing the bar chart
df_top10 = pd.DataFrame(top_10_words, columns=['Kata', 'Frekuensi'])

# Bar chart of the ten most frequent words
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df_top10['Kata'], df_top10['Frekuensi'])
ax.set_title('Top 10 Kata dengan Frekuensi Tertinggi (Label Positif)')
ax.set_xlabel('Kata')
ax.set_ylabel('Frekuensi')
plt.xticks(rotation=45, ha='right')  # rotate the x tick labels so they stay readable
fig.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re

# Rows labelled 'positif'
positive_data = data[data['labeling'] == 'positif']

# Join the string form of every 'stemming' value of those rows into one text
all_positive_words = ' '.join(positive_data['stemming'].astype(str).tolist())

# Extra clean-up applied to the joined text
def clean_text(text):
    """Drop every character that is not an ASCII letter, digit, space or hyphen, then collapse runs of spaces."""
    kept_chars = re.sub(r"[^a-zA-Z0-9 \-]", '', text)  # hyphens are preserved
    return re.sub(' +', ' ', kept_chars)

all_positive_words = clean_text(all_positive_words)


# Configure the word cloud; `regexp` is the pattern used to pick out the words
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=STOPWORDS,
    regexp=r"\w[\w'-]+",
)
wordcloud = wordcloud.generate(all_positive_words)

# Show the word cloud without axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Wordcloud untuk Label Positif')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import re

# Rows labelled 'negatif'
negative_data = data[data['labeling'] == 'negatif']

# Join the string form of every 'stemming' value of those rows into one text
all_negative_words = ' '.join(negative_data['stemming'].astype(str).tolist())

# Extra clean-up applied to the joined text
def clean_text(text):
    """Drop every character that is not an ASCII letter, digit or space, then collapse runs of spaces."""
    alnum_only = re.sub(r"[^a-zA-Z0-9 ]", '', text)
    return re.sub(' +', ' ', alnum_only)

all_negative_words = clean_text(all_negative_words)

# Word frequencies and the ten most frequent words
word_counts = Counter(all_negative_words.split())
top_10_words = word_counts.most_common(10)

# Table backing the bar chart
df_top10 = pd.DataFrame(top_10_words, columns=['Kata', 'Frekuensi'])

# Bar chart of the ten most frequent words
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df_top10['Kata'], df_top10['Frekuensi'])
ax.set_title('Top 10 Kata dengan Frekuensi Tertinggi (Label Negatif)')
ax.set_xlabel('Kata')
ax.set_ylabel('Frekuensi')
plt.xticks(rotation=45, ha='right')  # rotate the x tick labels so they stay readable
fig.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re

# Rows labelled 'negatif'
negative_data = data[data['labeling'] == 'negatif']

# Join the string form of every 'stemming' value of those rows into one text
all_negative_words = ' '.join(negative_data['stemming'].astype(str).tolist())

# Extra clean-up applied to the joined text
def clean_text(text):
    """Drop every character that is not an ASCII letter, digit, space or hyphen, then collapse runs of spaces."""
    # Hyphens are preserved
    text = re.sub(r"[^a-zA-Z0-9 \-]", '', text)
    # Collapse repeated spaces
    text = re.sub(' +', ' ', text)
    return text

all_negative_words = clean_text(all_negative_words)


# Build the word cloud from the NEGATIVE text. This cell previously passed
# `all_positive_words`, so the figure titled "Label Negatif" showed the
# positive words instead.
wordcloud = WordCloud(width=800, height=400,
                      background_color='white',
                      stopwords=STOPWORDS,
                      regexp=r"\w[\w'-]+").generate(all_negative_words)

# Show the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes
plt.title('Wordcloud untuk Label Negatif')
plt.show()

In [None]:
# @title Pembobotan Kata (TF-IDF)

# Reload the saved preprocessing result, keeping only rows where both columns are present
df = pd.read_csv('data_stemming.csv', usecols=['stemming', 'labeling']).dropna()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Fit TF-IDF on the 'stemming' column
tf = TfidfVectorizer()
text_tf = tf.fit_transform(df['stemming'])

# Dense table of the TF-IDF matrix with one column per feature name
temporary_df = pd.DataFrame(text_tf.todense(), columns=tf.get_feature_names_out())

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_tf, df['labeling'], test_size=0.1, random_state=42)

print("\nHasil proses TF-IDF (DataFrame):")
print(temporary_df)

# Mean TF-IDF score of every word over all documents, highest first
average_tfidf = temporary_df.mean().sort_values(ascending=False)

# Two-column table: the word and its mean score
average_tfidf_df = average_tfidf.rename_axis('Kata').reset_index(name='Rata-rata TF-IDF')

# Show the 20 words with the highest mean score
print("\nKata-kata dengan rata-rata TF-IDF tertinggi:")
print(average_tfidf_df.head(20))

In [None]:
# @title Algoritma Naive Bayes (Split Data)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Join the token list of every row in 'stemming' into one space-separated string
data['text_stemming'] = data['stemming'].apply(lambda x: ' '.join(x))

# TF-IDF vectorizer for the joined text
vectorizer = TfidfVectorizer()

# Feature matrix and labels used by this and the following cells
X = vectorizer.fit_transform(data['text_stemming'])
y = data['labeling']

# Summary of the data that X and y were just built from. The counts used to be
# taken from `df` (the CSV reloaded in the TF-IDF cell), i.e. from a different
# object than the one vectorized here.
print("Jumlah total data:", len(data))
print("Jumlah data Positif:", sum(y == 'positif'))
print("Jumlah data Negatif:", sum(y == 'negatif'))
print("-" * 50)

# Split the data for one scenario, report the class counts and train Naive Bayes
def split_and_train(test_ratio, scenario_name):
    """Split `X`/`y`, print the size and class counts of both parts, and fit a MultinomialNB.

    The split uses the module-level `X` and `y` built in this cell — the same
    objects the later SMOTE and evaluation cells split — instead of `text_tf`
    and `df` left over from the TF-IDF cell, so the reported counts describe
    the data that is evaluated afterwards.

    Args:
        test_ratio: Fraction of the data reserved for testing (passed as `test_size`).
        scenario_name: Label printed in the scenario heading, e.g. "90:10".

    Returns:
        None. The fitted model is not returned and no evaluation is performed here.
    """
    print(f"\n=== Skenario {scenario_name} (test_size={test_ratio}) ===")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=42
    )

    print("Jumlah Data Latih :", X_train.shape[0])
    print("Jumlah Data Uji   :", X_test.shape[0])

    # Class counts in the training part
    print("\nData Latih:")
    print(" - Positif:", sum(y_train == 'positif'))
    print(" - Negatif:", sum(y_train == 'negatif'))

    # Class counts in the test part
    print("\nData Uji:")
    print(" - Positif:", sum(y_test == 'positif'))
    print(" - Negatif:", sum(y_test == 'negatif'))

    # Train the Naive Bayes model on the training part
    model = MultinomialNB()
    model.fit(X_train, y_train)

# Run the three train/test split scenarios in order
for ratio, label in [(0.1, "90:10"), (0.2, "80:20"), (0.3, "70:30")]:
    split_and_train(ratio, label)

In [None]:
# @title Split Data Menggunakan SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

# Dataset summary before splitting
print("Jumlah total data:", data.shape[0])
print("Jumlah data Positif:", (y == 'positif').sum())
print("Jumlah data Negatif:", (y == 'negatif').sum())
print("-" * 60)

# Split the data, then oversample the training part with SMOTE
def split_and_report_with_smote(test_ratio, scenario_name):
    """Split `X`/`y`, apply SMOTE to the training part only and print the class counts.

    Args:
        test_ratio: Fraction of the data reserved for testing (passed as `test_size`).
        scenario_name: Label printed in the scenario heading, e.g. "90:10".

    Returns:
        Tuple `(X_resampled, y_resampled, X_test, y_test)`: the resampled
        training features and labels, and the test features and labels.
    """
    print(f"\n=== Skenario {scenario_name} (test_size={test_ratio}) ===")

    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, y, test_size=test_ratio, random_state=42
    )

    print("Sebelum SMOTE (Data Latih):", Counter(labels_fit))

    # Oversample the training part only; the test part is returned as split
    oversampler = SMOTE(random_state=42)
    features_balanced, labels_balanced = oversampler.fit_resample(features_fit, labels_fit)

    print("Setelah SMOTE (Data Latih):", Counter(labels_balanced))
    print("Jumlah Data Latih (Setelah SMOTE):", features_balanced.shape[0])
    print("Jumlah Data Uji:", features_eval.shape[0])
    print(" - Positif:", (labels_eval == 'positif').sum())
    print(" - Negatif:", (labels_eval == 'negatif').sum())

    return features_balanced, labels_balanced, features_eval, labels_eval

# Run the three scenarios; each call returns the SMOTE-resampled training features/labels and the test features/labels
X90, y90, X90_test, y90_test = split_and_report_with_smote(0.1, "90:10")
X80, y80, X80_test, y80_test = split_and_report_with_smote(0.2, "80:20")
X70, y70, X70_test, y70_test = split_and_report_with_smote(0.3, "70:30")

# Assess

In [None]:
# @title Confusion Matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report)
# SMOTE is used below; import it here so the cell does not rely on an earlier cell
from imblearn.over_sampling import SMOTE

# Join the token list of every row in 'stemming' into one space-separated string
data['text_stemming'] = data['stemming'].apply(lambda x: ' '.join(x))

# Build the features and labels in this cell (the vectorizer used to be
# created but never used, with X and y silently taken from an earlier cell)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text_stemming'])
y = data['labeling']

# Train/test split scenarios: (test_size, label)
scenarios = [
    (0.1, "90:10"),
    (0.2, "80:20"),
    (0.3, "70:30")
]


def evaluate_naive_bayes(heading, X_fit, y_fit, X_eval, y_eval):
    """Fit a MultinomialNB, then print its confusion matrix and classification report.

    Args:
        heading: Line printed before the results.
        X_fit, y_fit: Training features and labels.
        X_eval, y_eval: Test features and labels.

    Returns:
        Tuple `(model, predictions)`: the fitted model and its predictions for `X_eval`.
    """
    model = MultinomialNB()
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    print(heading)
    # Rows are the actual labels, columns the predicted labels, both in the order of model.classes_
    matrix = confusion_matrix(y_eval, predictions, labels=model.classes_)
    print("Confusion Matrix (baris = aktual, kolom = prediksi):")
    print(pd.DataFrame(matrix, index=model.classes_, columns=model.classes_))
    print("Classification Report:")
    print(classification_report(y_eval, predictions, zero_division=0))
    return model, predictions


# Evaluate every scenario without and with SMOTE
for test_size, scenario_name in scenarios:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # ========== WITHOUT SMOTE ==========
    model, y_pred = evaluate_naive_bayes(
        f"## [TANPA SMOTE] Skenario: {scenario_name}", X_train, y_train, X_test, y_test
    )
    print("-" * 50)

    # ========== WITH SMOTE ==========
    # SMOTE is applied to the training part only; the test part stays untouched
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    model_smote, y_pred_smote = evaluate_naive_bayes(
        f"## [DENGAN SMOTE] Skenario: {scenario_name}", X_train_smote, y_train_smote, X_test, y_test
    )
    print("=" * 50)