In [1]:
!pip install Sastrawi



In [2]:
import pandas as pd 
import re  
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
# Imbalance data
from imblearn.over_sampling import SMOTE
from collections import Counter
# StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Split data
from sklearn.model_selection import train_test_split
# Model Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# WordCloud
from wordcloud import WordCloud

In [10]:
df = pd.read_csv("Hasil_Preprocessing_Pelabelan_MarApr-fix.csv", usecols=["Data_Preprocessing", "Polarity"])
df.columns = ["Tweet", "Kelas"]
pd.set_option('max_colwidth',1)

df.head(10)

Unnamed: 0,Tweet,Kelas
0,"['kira', 'total', 'gantung', 'iya', 'nder', 'tinggal', 'kota', 'jabodetabek', 'daerang', 'jakarta', 'tangerang', 'iya', 'ngumpulin', 'uang', 'tiket', 'nya', 'transportasi', 'venue', 'gampang', 'mhal', 'nya', 'jangkau', 'krl', 'transjakarta']",Negatif
1,"['pagi', 'kak', 'terima', 'kasih', 'informasi', 'lapor', 'laku', 'evaluasi', 'terima', 'kasih', 'rsn']",Positif
2,"['hai', 'kak', 'beli', 'kartu', 'jak', 'lingko', 'kartu', 'bni', 'mandiri', 'bri', 'beli', 'vending', 'machine', 'sedia', 'haltehalte', 'transjakarta', 'harga', 'rp', 'saldo', 'rp', 'terima', 'kasih', 'ra']",Positif
3,"['transjakarta', 'tolong', 'admin', 'bus', 'tipe', 'ppd', 'tugas', 'dalam', 'pagi', 'koridor', 'tipe', 'ppd', 'kondisi', 'penuh', 'hamil', 'duduk', 'koridor', 'tipe', 'ppd', 'tugas', 'dalam', 'bus']",Negatif
4,"['transjakarta', 'selamat', 'pagi', 'beli', 'kartu', 'jaklingko', 'mana', 'harga', 'iya', 'terima', 'kasih']",Positif
5,"['transjakarta', 'ok', 'admin', 'pantau', 'iya', 'terima', 'kasih']",Positif
6,"['transjakarta', 'pgc', 'rute', 'gimna', 'admin']",Negatif
7,"['pemprov', 'dki', 'jakarta', 'rencana', 'lelang', 'aset', 'bus', 'transjakarta', 'bengkalai', 'dishub', 'proses', 'hapus', 'aset']",Negatif
8,"['transjakarta', 'porsi', 'beda', 'dari', 'sabtu', 'minggu', 'tanggal', 'merah', 'cmn', 'armada']",Negatif
9,"['pagi', 'kak', 'sedia', 'rute', 'summarecon', 'bekas', 'pancoran', 'tugu', 'rute', 'bekas', 'timur', 'cawang', 'uki', 'informasi', 'rute', 'sila', 'lihat', 'website', 'resmi', 'transjakarta', 'halaman', 'peta', 'rute', 'aplikasi', 'tije', 'terima', 'kasih', 'ra']",Negatif


In [11]:
df.shape

(5049, 2)

In [12]:
df.isnull().sum()

Tweet    0
Kelas    0
dtype: int64

In [14]:
X = df['Tweet']
y = df['Kelas']

In [21]:
f=open('Dictionary/StopWord/id.stopwords.02.01.2016.txt')
ina_stopword = f.read()
ina_stopword = ina_stopword.split('\n')
ina_stopword[:5]

['ada', 'adalah', 'adanya', 'adapun', 'agak']

In [22]:
# Inisialisasi objek stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Pra-pemrosesan teks
komentar_preprocessed = []

for i in range(len(X)):
    try:
        # Menghapus karakter non-alphabet
        review = re.sub('[^a-zA-Z]', ' ', str(X[i]))
        review = review.lower()

        # Memisahkan kata-kata menjadi token
        review = review.split()

        # Menghapus stopwords
        review = [word for word in review if not word in set(ina_stopword)]

        # Melakukan stemming pada kata-kata
        review_stemmed = [stemmer.stem(word) for word in review]

        # Menggabungkan kata-kata yang telah diproses kembali menjadi kalimat
        review_preprocessed = ' '.join(review_stemmed)

        # Menambahkan kalimat yang telah diproses ke dalam list komentar_preprocessed
        komentar_preprocessed.append(review_preprocessed)
    except TypeError:
        print("Data pada X[", i, "] bukan string atau bytes-like object.")

In [23]:
tfidf = TfidfVectorizer(max_features=10000, stop_words=ina_stopword, ngram_range=(1,3), min_df=2, max_df=0.5)
X_tf_idf = tfidf.fit_transform(komentar_preprocessed).toarray()

# Cek jumlah kata
tfidf.get_feature_names_out()[:5]

array(['ab', 'abad', 'abai', 'abang', 'abang admin'], dtype=object)

In [24]:
feature = tfidf.get_feature_names_out()

In [26]:
# ubah tipedata feature menjadi dataframe
Tweet = pd.DataFrame(feature, columns=['Tweet'])

In [27]:
df_0 = pd.concat([Tweet, y], axis=1)

In [28]:
df_0

Unnamed: 0,Tweet,Kelas
0,ab,Negatif
1,abad,Positif
2,abai,Positif
3,abang,Negatif
4,abang admin,Positif
...,...,...
9995,yuk,
9996,zaman,
9997,zhx,
9998,zhx transjakarta,


In [29]:
df_0.isnull().sum()

Tweet    0   
Kelas    4951
dtype: int64

In [30]:
df_0.dropna(inplace=True)

In [31]:
df_0.isnull().sum()

Tweet    0
Kelas    0
dtype: int64

In [32]:
df_0.to_csv('Untuk_Manual.csv', encoding='utf-8')

In [33]:
data = pd.read_csv('Untuk_Manual.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Tweet,Kelas
0,0,ab,Negatif
1,1,abad,Positif
2,2,abai,Positif
3,3,abang,Negatif
4,4,abang admin,Positif


In [34]:
pd.set_option('display.max_colwidth', 3000)
positive_tweets = df[df['Kelas'] == 'Positif']
positive_tweets.shape

(2228, 2)

In [36]:
pd.set_option('display.max_colwidth', 3000)
negative_tweets = df[df['Kelas'] == 'Negatif']
negative_tweets.shape

(2821, 2)