<a href="https://colab.research.google.com/github/ulumbagas/Sentimen-Analysis/blob/main/Alun-alun%20Jombang/sentimen_Alun_alun_Jombang.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analisis sentimen Alun-alun Jombang

## Import library

In [None]:
#jangan pakai ulasan_clean_stopword coba pakai ulasan_clean
!pip install indoNLP
!pip install nlp-id

In [None]:
import re
import random
import pandas as pd
from tqdm import tqdm
from nlp_id.lemmatizer import Lemmatizer
from nlp_id.stopword import StopWord
from indoNLP.preprocessing import pipeline, replace_word_elongation, replace_slang
from transformers import pipeline as hf_pipeline

pd.set_option('display.max_colwidth', None)

## Load Dataset

In [None]:
# Location of the scraped Google Maps reviews on the mounted Google Drive.
path_data = '/content/drive/MyDrive/Hugging Face/alun-alun jombang/alun_alun_jombang_reviews_googlemaps.csv'

# Read the CSV, then discard fully duplicated rows.
df = pd.read_csv(path_data)
df = df.drop_duplicates()
print("Shape dataset:", df.shape)

## Data Cleaning & Preprocessing

## Normalisasi

In [None]:
def clean_review(text: str) -> str:
    """Normalise a raw review into lowercase ASCII alphanumeric words.

    The text is lowercased, every non-ASCII character (emoji, accented
    letters, symbols) is dropped, every remaining character that is not a
    letter, digit or whitespace becomes a space, and runs of whitespace are
    collapsed into a single space.

    Args:
        text: The raw review. Anything that is not a ``str`` (e.g. NaN)
            is treated as an empty review.

    Returns:
        The cleaned review without leading/trailing spaces, or ``""`` for
        non-string input.
    """
    if isinstance(text, str):
        ascii_only = text.lower().encode("ascii", "ignore").decode("ascii")
        # Punctuation is turned into a space (not removed) so that the words
        # on either side of it stay separated.
        spaced = re.sub(r"[^a-zA-Z0-9\s]", " ", ascii_only)
        return re.sub(r"\s+", " ", spaced).strip()
    return ""

In [None]:
# Hand-made normalisation table applied with plain substring replacement,
# in insertion order.
#
# Conventions that matter when editing this table:
#   * Order is significant: a longer phrase must come BEFORE any shorter key
#     it contains (e.g. 'car free day' before 'free '), otherwise the shorter
#     key rewrites part of the phrase first and the longer key never matches.
#   * A key that ends with a space must map to a value that ends with a
#     space, otherwise the following word is glued to the replacement.
#   * Short keys that also occur inside ordinary words are wrapped in spaces
#     (' ngga ' instead of 'ngga ', which would corrupt 'bangga', 'hingga',
#     'sehingga'; ' moga ' instead of 'moga ', which would corrupt 'semoga';
#     ' indak ' instead of 'indak ', which would corrupt 'tindak').
#   * All values are lowercase because the input text is lowercased first.
norm = {
    " gk ": " tidak ",
    "aloon aloon": "alunalun",
    'aloon ':'alunalun ',
    "jd ":"jadi ",
    " ga ": " tidak ",
    " gak ": " tidak ",
    " g ":" tidak ",
    " nggak ": " tidak ",
    " jg ": " juga ",
    " tp ": " tapi ",
    " krn ": " karena ",
    " sm ": " sama ",
    " dg ": " dengan ",
    " dgn ": " dengan ",
    " aja ": " saja ",
    " udh ": " sudah ",
    " blm ": " belum ",
    " skrg ": " sekarang ",
    ' umntuk':' untuk',
    ' krg':' kurang',
    " trs ": " terus ",
    " bgt ": " banget ",
    " bnyk ": " banyak ",
    " tmpt ": " tempat ",
    " kl ": " kalau ",
    " klo ": " kalau ",
    " sy ": " saya ",
    " aq ": " saya ",
    " gue ": " saya ",
    " gua ": " saya ",
    " km ": " kamu ",
    " lu ": " kamu ",
    " ok ": " baik ",
    " oke ": " baik ",
    " sip ": " baik ",
    " d " : " di ",
    "krn ":"karena ",
    " rekomen ": " rekomendasi ",
    " recommended ": " rekomendasi ",
    'many pigeons flying':'banyak merpati terbang',
    # Suffixed spellings must be handled before the bare "alun alun" key.
    'alun alunnya':'alunalun',
    'alun alunya':'alunalun',
    "alun alun":"alunalun",
    "yg ": "yang ",
    "skrg ": "sekarang ",
    "alun2 ":"alunalun ",
    "alon2": "alunalun",
    "jalan2 ": "jalan jalan ",
    "spot ":"tempat ",
    "utk ":"untuk ",
    "deket ":"dekat ",
    "enk": "enak ",
    "public place": "tempat umum ",
    "dsb ": "dan sebagainya ",
    'rame ': 'ramai ',
    'krg ': 'kurang ',
    ' unt ': ' untuk ',
    ' tdk ': ' tidak ',
    'anak2 ':'anakanak ',
    'anak-anak':'anakanak',
    'anak anak': 'anakanak',
    ' n ': ' dan ',
    ' sampah2 ': ' sampah ',
    ' dr ': ' dari ',
    ' ayh ': ' ayah ',
    ' dprsiapkan ': ' dipersiapkan ',
    ' orang2 ':' orang orang ',
    'sak jane ': ' sebenarnya ',
    'percis ': 'persis ',
    'sdh ': 'sudah ',
    'org ': 'orang ',
    'wkwk':'',
    'wort it ':'cukup baik ',
    'worth it ':'cukup baik ',
    'poll':'',
    'jbg':'jombang',
    'cepet ':'cepat ',
    'tmpat':'tempat',
    'emg ' : 'memang ',
    'bgt ' : 'sekali ',
    'temen ': 'teman ',
    'banget ': 'sekali ',
    'bnngeeett': 'sekali',
    'seruuuuu': 'seru',
    'bbrp':'beberapa',
    'icon': 'ikon',
    'happy': 'bahagia',
    'overall': 'secara umum',
    'love u': 'aku suka',
    'weekend': 'akhir minggu',
    'alon alon': 'alunalun',
    'remang2':'remang remang',
    'temen2':'teman teman',
    ' ngga ': ' tidak ',
    'enak2': 'enak',
    'kota2': 'kota',
    'play ground':"tempat bermain",
    'playgrond': "tempat bermain",
    'playground':'tempat bermain',
    'bareng': 'bersama',
    'puaaanas': 'panas',
    'nyangkruk': 'berkumpul',
    'jombanng':'jombang',
    'sore2':'sore',
    ' jl ': ' jalan ',
    'hangout': 'jalan-jalan bersama',
    'laper ': 'lapar ',
    'enggak ': 'tidak ',
    ' city ':' kota ',
    'ruame ':'ramai ',
    'rame':'ramai',
    'makananx ': 'makanannya ',
    # Must precede 'free ', which would otherwise rewrite the phrase first.
    'car free day':'hari bebas kendaraan bermotor',
    'free ':'gratis ',
    'entrance ': 'masuk ',
    'mkanan ': 'makanan ',
    'bagus2':'bagus',
    'alun2nya':'alunalun',
    'first waktu':'pertama kali',
    'tpat ':'tempat ',
    'cangtip':'cantik',
    ' indak ':' tidak ',
    'kumpul2':'berkumpul',
    'tyap ':'setiap ',
    'taman2 ':'taman ',
    'alunalunya ':'alunalun ',
    'buangeeetttt':'sekali',
    'panaaaaaaassss':'panas',
    'santai2 ':'santai ',
    'pagi2 ':'pagi ',
    'nice place':'tempat bagus',
    'jooos men':'bagus sekali',
    'sebrang ':'seberang ',
    'nganter ':'mengantar ',
    'apik ':'bagus ',
    'alun2':'alunalun',
    'momong':'mengasuh',
    'asyk ':'asyik ',
    'pusat2':'pusat',
    'panas2 ':'panas ',
    'jauh2 ':'jauh ',
    'teleknya ':'kotorannya ',
    'hbos ': 'habis ',
    'hlan halan ':'jalan jalan ',
    'keceh ':'keren ',
    'jomabng ':'jombang ',
    'pisan ':'sekali ',
    'unuk ':'untuk ',
    ' mbuat ':' membuat ',
    " mjadi ":" menjadi ",
    'baguus': 'bagus',
    'kids friendly':'ramah anak',
    'child friendly':'ramah anak',
    # The word is dropped, but one space is kept so its neighbours stay apart.
    " layan ":' ',
    'bocil':'anak',
    'jualana':'jualan',
    'banwa':'membawa',
    'smpah ':'sampah ',
    'smbarangan ':'sembarangan ',
    'ngemong ':'mengasuh ',
    'playdtound':'taman bermain',
    'playgorund ':'taman bermain ',
    'malming ':'malam minggu ',
    ' moga ':' semoga ',
    'nyantai ':'bersantai ',
    'xlo ':'kalau ',
    'quality time':'waktu yang berkualitas',
    'berhati2': 'hati-hati',
    'direnov ':'direnovasi ',
    'family':'keluarga',
    'wig end':'akhir minggu',
    'week end':'akhir minggu',
    'nice ':'bagus ',
    'rekomended ':'direkomendasikan ',
    'brtugas ':'bertugas ',
    'kalo ':'kalau ',
    'pulkam ':'pulang kampung '

}

def normalisai(text: str, norm_dict: dict) -> str:
    """Apply every ``key -> value`` substring replacement of ``norm_dict``.

    Replacements run in the dictionary's insertion order, so earlier entries
    can feed later ones.

    Many keys are delimited by spaces (``" gk "``) to avoid matching inside
    other words. The text is therefore padded with one space on each side
    before replacing, so that the first and the last word of the review can
    match such keys too. Afterwards whitespace is collapsed and the padding
    removed.

    Args:
        text: The review text to normalise.
        norm_dict: Mapping of informal spelling to replacement text.

    Returns:
        The normalised text with single spaces between words and no
        leading/trailing whitespace; ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    padded = f" {text} "
    for informal, replacement in norm_dict.items():
        padded = padded.replace(informal, replacement)
    # split/join both strips the padding and removes the double spaces that
    # some replacements leave behind.
    return " ".join(padded.split())

In [None]:
# Slang normalisation table loaded from a CSV hosted on GitHub
slang_url='https://raw.githubusercontent.com/adeariniputri/text-preprocesing/master/slang.csv'

slang = pd.read_csv(slang_url)
# Keys and values are wrapped in spaces so only whole words are replaced.
# Rows with a missing cell are skipped: a NaN key or value would make
# `str.replace` raise TypeError later on.
slang_dict = {
    f" {informal} ": f" {formal} "
    for informal, formal in slang[["slang", "formal"]].dropna().astype(str).itertuples(index=False)
}


In [None]:
# Word elongation & slang indoNLP
# The indoNLP pipeline is built once, when this cell runs, instead of on
# every call. Besides avoiding repeated construction, this binds the indoNLP
# `pipeline` while the name still refers to it; later cells in this notebook
# import a different `pipeline` from transformers under the same name.
_slang_elong_pipe = pipeline([replace_word_elongation, replace_slang])


def normalize_slang_elong(text: str) -> str:
    """Run ``text`` through indoNLP's word-elongation and slang replacers.

    Args:
        text: The review text to normalise.

    Returns:
        Whatever the indoNLP pipeline returns for ``text``.
    """
    return _slang_elong_pipe(text)

In [None]:
# =======================
# Preprocessing Pipeline
# =======================
# Each review goes through, in this order: basic cleaning, the hand-made
# `norm` table, the GitHub slang table, then indoNLP elongation/slang fixes.
df["ulasan_clean"] = df["ulasan"].apply(
    lambda raw: normalize_slang_elong(
        normalisai(normalisai(clean_review(raw), norm), slang_dict)
    )
)

In [None]:
# Random spot check: show up to 50 consecutive reviews, raw next to cleaned.
# The start position is drawn from 0 .. len(df) - 50 so the first row can be
# selected and the window never begins past the end of the frame.
random_number = random.randint(0, max(len(df) - 50, 0))
df[['ulasan','ulasan_clean']].iloc[random_number:(random_number+50)]

## Stemming / lemmatizer

In [None]:

# One nlp_id lemmatizer instance, shared by every call to `stemming`.
lemmatizer = Lemmatizer()


def stemming(text: str) -> str:
    """Return ``text`` after passing it through the nlp_id lemmatizer."""
    lemmatized = lemmatizer.lemmatize(text)
    return lemmatized

## Stop word

In [None]:
stopword = StopWord()
# Words that nlp_id treats as stop words but that must be kept in the reviews.
hapus_stopword =['satu']
# Build a filtered copy instead of calling `.remove`: it does not raise when
# a word is missing and does not mutate the object returned by the library.
stopwords_list = [kata for kata in stopword.get_stopword() if kata not in hapus_stopword]
# Set used for the per-token membership test.
_stopword_lookup = frozenset(stopwords_list)


def remove_stopwords(text: str) -> str:
    """Drop every whitespace-separated token of ``text`` that is a stop word.

    Args:
        text: The review text; tokens are compared exactly as written.

    Returns:
        The remaining tokens joined by single spaces.
    """
    return ' '.join(word for word in text.split() if word not in _stopword_lookup)

In [None]:
# Remove stop words first, then lemmatize the stop-word-free text.
df["ulasan_clean_stopword"] = df["ulasan_clean"].map(remove_stopwords)
df["ulasan_stemming"] = df["ulasan_clean_stopword"].map(stemming)


## Sentimen

## w11wo model

In [None]:
pretrained_name = "w11wo/indonesian-roberta-base-sentiment-classifier"

# `hf_pipeline` is the transformers pipeline factory imported at the top of
# the notebook. Importing it again as `pipeline` would overwrite the indoNLP
# `pipeline` that `normalize_slang_elong` relies on.
sentimen_w11wo = hf_pipeline(
    "sentiment-analysis",
    model=pretrained_name,
    tokenizer=pretrained_name
)

In [None]:
def w11wo_sentimen(text: str) -> str:
    """Classify the sentiment of one review with the w11wo RoBERTa model.

    Args:
        text: The review text.

    Returns:
        The label predicted by ``sentimen_w11wo``, or ``"unknown"`` when
        ``text`` is not a string or contains only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    # Let the tokenizer truncate by tokens. Cutting the string at 512
    # characters discards text without being tied to the token limit.
    prediction = sentimen_w11wo(text, truncation=True, max_length=512)
    return prediction[0]["label"]


In [None]:
# Classify every lemmatized review, then count the reviews per label.
df['w11wo_model_sentimen'] = df['ulasan_stemming'].map(w11wo_sentimen)
df['w11wo_model_sentimen'].value_counts()

In [None]:
# Drop every row that has a missing value in ANY column.
# NOTE(review): this also discards reviews whose only gap is in an unrelated
# column - confirm that is intended.
df = df.dropna(axis=0, how='any')

In [None]:
# Destination of the cleaned, sentiment-labelled reviews on Google Drive.
filepath = '/content/drive/MyDrive/Hugging Face/alun-alun jombang/review_clean.csv'

# Write without the pandas index column.
df.to_csv(path_or_buf=filepath, index=False)

In [None]:
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from collections import Counter
import nltk
from nltk import ngrams
import numpy as np

In [None]:
# Work on an independent copy: assigning columns on a bare column selection
# of `df` triggers pandas' SettingWithCopyWarning.
df_sentiment = df[['ulasan_clean','ulasan_stemming','w11wo_model_sentimen']].copy()

# Restore the hyphens of the reduplicated words that were merged during
# normalisation. 'anakanaknya' needs no separate rule: it contains 'anakanak'
# and therefore becomes 'anak-anaknya' through the first replacement.
df_sentiment['ulasan_stemming'] = (
    df_sentiment['ulasan_stemming']
    .str.replace('anakanak', 'anak-anak', regex=False)
    .str.replace('alunalun', 'alun-alun', regex=False)
)
df_sentiment.head()

In [None]:
def get_reviews_by_sentiment(df,table:str,table_clean:str ,sentimen: str):
    """Select the ``table_clean`` values of the rows with a given sentiment.

    Args:
        df: Frame holding both columns.
        table: Name of the column that stores the sentiment label.
        table_clean: Name of the column whose values are returned.
        sentimen: The label to keep.

    Returns:
        A Series with the ``table_clean`` values of the rows where
        ``df[table] == sentimen``, keeping their original index.
    """
    matches = df[table] == sentimen
    return df.loc[matches, table_clean]

In [None]:
# Group the lemmatized reviews by the label the w11wo model assigned.
sentimen_labels = df['w11wo_model_sentimen'].unique()
reviews_by_sentiment = {
    sentimen: get_reviews_by_sentiment(df_sentiment,'w11wo_model_sentimen','ulasan_stemming', sentimen)
    for sentimen in sentimen_labels
}

# A label that never occurs in the data has no dictionary entry. Fall back
# to an empty Series instead of raising KeyError.
_no_reviews = df_sentiment['ulasan_stemming'].iloc[0:0]

positif_review = reviews_by_sentiment.get('positive', _no_reviews)
negatif_review = reviews_by_sentiment.get('negative', _no_reviews)
neutral_review = reviews_by_sentiment.get('neutral', _no_reviews)

In [None]:
# Concatenate all reviews of each sentiment into one space-separated string.
positive_text = " ".join(map(str, positif_review))
negatif_text = " ".join(map(str, negatif_review))
neutral_text = " ".join(map(str, neutral_review))

In [None]:
def generate_wordcloud(data, title):
    """Draw a word cloud of ``data`` with ``title`` above it.

    Args:
        data: One string holding all the text to visualise.
        title: Title of the figure.

    Returns:
        None. The figure is shown with matplotlib. When there is nothing to
        draw, a short message is printed and no figure is created.
    """
    if not isinstance(data, str) or not data.strip():
        print(f"Word cloud '{title}' skipped: no text to plot.")
        return

    try:
        cloud = WordCloud(width=1000,
                          height=400,
                          max_words=500,
                          colormap='viridis',
                          background_color='white',
                          collocations=False
                          ).generate_from_text(data)
    except ValueError as err:
        # WordCloud raises ValueError when no drawable word is left.
        print(f"Word cloud '{title}' skipped: {err}")
        return

    plt.figure(figsize=(10,8))
    plt.imshow(cloud)
    plt.axis('off')
    plt.title(title, fontsize=13)
    plt.show()

def word_freq(data, title, top_n=5):
    """Plot a horizontal bar chart of the most frequent words in ``data``.

    Args:
        data: One string; words are separated by whitespace.
        title: Text appended to the chart title.
        top_n: Number of words to show (default 5, the value that used to be
            hard-coded).

    Returns:
        None. The chart is shown with matplotlib. When ``data`` contains no
        word, a short message is printed and no chart is created.
    """
    tokens = data.split()
    if not tokens:
        print(f"Word frequency '{title}' skipped: no text to count.")
        return

    # most_common returns the pairs already sorted by descending count;
    # ties keep the order of first appearance.
    top_words = pd.DataFrame(Counter(tokens).most_common(top_n), columns=["kata", "jumlah"])

    plt.figure(figsize=(10,6))
    plt.barh(top_words["kata"], top_words["jumlah"], color="skyblue")
    plt.gca().invert_yaxis()  # largest count at the top
    plt.title(f"{top_n} Kata Paling Sering Muncul dalam Review {title}")
    plt.xlabel("Jumlah Kemunculan")
    plt.ylabel("Kata")
    plt.show()


In [None]:
# Word by word, the negative and positive sentiments look alike, so 3-word
# sequences (trigrams) are examined further down (suggested by ChatGPT).

generate_wordcloud(data=positive_text, title='Wordcloud Positif')

print()

word_freq(data=positive_text, title='Positif')

In [None]:
# Same two views for the negative reviews.
generate_wordcloud(data=negatif_text, title='Wordcloud negatif')
print()
word_freq(data=negatif_text, title='negatif')

In [None]:
# Neutral reviews: word cloud only.
generate_wordcloud(data=neutral_text, title='Wordcloud neutral')

In [None]:
def n_gram(text_data, n):
    """Return the nltk n-grams of the whitespace-separated words of ``text_data``.

    Args:
        text_data: One string holding the text to analyse.
        n: Number of consecutive words per gram.

    Returns:
        The object returned by ``nltk.ngrams`` for the word list.
    """
    tokens = text_data.split()
    return ngrams(tokens, n)

In [None]:
# Module-level result buffers, refilled by every call to `frequency`.
sentences = []      # the distinct grams
df_frequency = []   # the count of each gram, in the same order as `sentences`


def frequency(grammed):
    """Count the grams in ``grammed`` and store the result in the two buffers.

    Args:
        grammed: The grams to count, i.e. the return value of ``n_gram``.

    Returns:
        None. ``sentences`` and ``df_frequency`` are emptied and then filled
        with the distinct grams and their counts.
    """
    del sentences[:]
    del df_frequency[:]
    counts = nltk.FreqDist(grammed)
    sentences.extend(counts.keys())
    df_frequency.extend(counts.values())

In [None]:
def create_plot(num, text_data):
    """Plot and display the 20 most frequent ``num``-grams of ``text_data``.

    Args:
        num: Number of words per gram. Any positive size is supported.
        text_data: The combined text to analyse.

    Returns:
        None. A horizontal bar chart is shown and the underlying frame
        (words, frequency, ratio of all gram occurrences) is displayed.
        When the text yields no gram, a message is printed instead.
    """
    # Fills the module-level `sentences` / `df_frequency` buffers.
    frequency(n_gram(text_data, num))

    if not sentences:
        print(f"No {num}-grams found; nothing to plot.")
        return

    # Column names for sizes up to 3 are kept as they were; larger sizes get
    # generic names so that any gram size can be plotted.
    ordinal_names = ['first', 'second', 'third']
    if num <= len(ordinal_names):
        word_columns = ordinal_names[:num]
    else:
        word_columns = [f'word_{position}' for position in range(1, num + 1)]

    gram_frame = pd.DataFrame(sentences, columns=word_columns)
    gram_frame['frequencies'] = df_frequency

    total = sum(df_frequency)

    # .copy() so that adding the ratio column does not write into a slice.
    gram_frame = (
        gram_frame
        .sort_values("frequencies", axis=0, ascending=False, na_position='last')
        .head(20)
        .copy()
    )
    gram_frame["ratio"] = gram_frame['frequencies'].div(total)

    plt.rcdefaults()
    fig, ax = plt.subplots()

    grams = [" ".join(words) for words in gram_frame[word_columns].itertuples(index=False)]

    # Create plot
    y_pos = np.arange(len(grams))
    performance = gram_frame["frequencies"]

    ax.barh(y_pos, performance)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(grams)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Frequency')
    ax.set_title(f'{num}-grams')

    plt.show()
    display(gram_frame)

In [None]:
# Trigrams of the positive reviews.
create_plot(num=3, text_data=positive_text)

In [None]:
# Trigrams of the negative reviews.
create_plot(num=3, text_data=negatif_text)

In [None]:
# Instalasi BERTopic
!pip install bertopic

# Instalasi sentence-transformers (jika belum ada)
!pip install sentence-transformers

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
topic_model = BERTopic(embedding_model=embedding_model,language='indonesian')
# Pass a plain list of strings: the frame's index is no longer contiguous
# after rows were dropped, and a list avoids any index-based lookup on it.
topics, probs = topic_model.fit_transform(df_sentiment['ulasan_stemming'].astype(str).tolist())

In [None]:
# Summary table of the fitted topics; preview the first five rows.
topic_info = topic_model.get_topic_info()
topic_info.head(5)

In [None]:
# Representation of the first eight rows of the topic table.
topic_info['Representation'].iloc[:8]

In [None]:
# Aspect keywords searched for in the lemmatized reviews.
# The town square is spelled 'alun-alun' here because the text that is
# searched has already had 'alunalun' rewritten to 'alun-alun'; the merged
# spelling could never match.
aspects=['area','main','jombang','alun-alun','fasilitas','parkir','keluarga','sampah','bersih','tempat','suasana','lokasi','kota']

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")

# `hf_pipeline` is the transformers pipeline factory imported at the top of
# the notebook. Importing it again as `pipeline` would overwrite the indoNLP
# `pipeline` used by the preprocessing cells.
pipe = hf_pipeline("text-classification", model=model,tokenizer=tokenizer)

In [None]:
def extract_aspects(text, aspect_list=None):
    """Return the aspect keywords that occur in ``text``.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word.

    Args:
        text: The review text.
        aspect_list: Keywords to look for. Defaults to the module-level
            ``aspects`` list.

    Returns:
        The matching keywords in the order of the keyword list; an empty
        list when nothing matches or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []
    candidates = aspects if aspect_list is None else aspect_list
    lowered = text.lower()  # lowercase once instead of once per keyword
    return [aspect for aspect in candidates if aspect in lowered]


In [None]:
def get_aspect_sentiment(text):
    """Classify the sentiment of every aspect keyword found in ``text``.

    Args:
        text: The review text.

    Returns:
        A list with one ``{"aspect": ..., "sentiment": ...}`` dict per
        keyword returned by ``extract_aspects``; the sentiment is the label
        of the first result of ``pipe`` for the (text, aspect) pair. The
        list is empty when no keyword is found.
    """
    return [
        {
            "aspect": aspect_name,
            "sentiment": pipe(text, text_pair=aspect_name)[0]["label"]
        }
        for aspect_name in extract_aspects(text)
    ]


In [None]:
# NOTE(review): tqdm is already imported in the import cell at the top.
from tqdm import tqdm
# Enables the `progress_apply` method used on the next line.
tqdm.pandas()
# One list of {"aspect", "sentiment"} dicts per review, with a progress bar.
df_sentiment["aspect_sentiment"] = df_sentiment["ulasan_stemming"].progress_apply(get_aspect_sentiment)


In [None]:
# Preview the first rows, now including the `aspect_sentiment` column.
df_sentiment.head()

In [None]:
# Expand the analysis result into separate rows: one row per
# (review, aspect) pair. Reviews without any aspect produce no row.
rows = [
    {
        "ulasan_clean": row["ulasan_clean"],
        "ulasan_stemming": row["ulasan_stemming"],
        "w11wo_model_sentimen": row["w11wo_model_sentimen"],
        "aspect": asp["aspect"],
        "sentiment": asp["sentiment"]
    }
    for _, row in df_sentiment.iterrows()
    for asp in row["aspect_sentiment"]
]

df_result = pd.DataFrame(rows)



In [None]:
# Restore the hyphens of the merged reduplicated words for display.
df_result['ulasan_clean'] = (
    df_result['ulasan_clean']
    .str.replace('anakanak', 'anak-anak')
    .str.replace('alunalun', 'alun-alun')
)

In [None]:
# Manual inspection of 20 rows by position (1145 up to, not including, 1165).
# NOTE(review): the positions are hard-coded; the slice is empty when
# df_result has fewer than 1146 rows.
df_result.iloc[1145:1165]

In [None]:
# Number of rows per (aspect, sentiment) pair.
insight = (
    df_result
    .groupby(["aspect", "sentiment"])
    .size()
    .reset_index(name="count")
)
# Shown per aspect, most frequent sentiment first.
insight.sort_values(by=["aspect", "count"], ascending=[True, False])
