# PR DAY 24

- Buatlah sebuah model analisis sentiment untuk sebuah film yang baru dirilis.
- Tugas dikumpulkan dalam bentuk PDF, yang di dalamnya meliputi:
  Step by step
  Jawaban atas pertanyaan:
1. Latar belakang pemilihan algoritma yang digunakan
2. Hasil evaluasi
3. Langkah-langkah untuk meningkatkan akurasi model
4. Link repository github (code, dataset, dan model)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# Load the labelled tweet reviews from the Excel sheet stored on the mounted Google Drive.
data = pd.read_excel(r'/content/drive/MyDrive/Machine learning AI batch 2/DAY24/assignment/review_172days.xlsx')
# Display the first rows as a quick check of the loaded columns.
data.head()

Unnamed: 0,Text_Tweet,Sentiment
0,172 days bener bener ya definisi abis di naiki...,Netral
1,Udah siap dibikin nangis nonton cerita Amer &a...,Netral
2,172 DAYS udah tayang di Netflix! Mengangkat ki...,Netral
3,AAAAAA SIAPA BUAT CERITA 172 DAYS NI https://t...,Positif
4,emosi nya I tgk 172 days ni,Netral


## Text Processing

### Cleaning Text and Lower Case

In [5]:
# The patterns and the translation table are built once, not on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_MENTION_PATTERN = re.compile(r'@[\w]*')
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F700-\U0001F77F'  # alchemical symbols
    '\U0001F780-\U0001F7FF'  # Geometric Shapes Extended
    '\U0001F800-\U0001F8FF'  # Supplemental Arrows-C
    '\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
    '\U0001FA00-\U0001FA6F'  # Chess Symbols
    '\U0001FA70-\U0001FAFF'  # Symbols and Pictographs Extended-A
    '\U00002702-\U000027B0'  # Dingbats
    # NOTE(review): this last range spans U+24C2..U+1F251, so it also strips
    # many non-emoji code points in between (e.g. CJK ideographs).
    '\U000024C2-\U0001F251'
    ']+',
    flags=re.UNICODE
)
# Raw string so the backslash is a literal character; the previous non-raw
# literal relied on the invalid escape sequence "\,".
_PUNCTUATIONS = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
_PUNCTUATION_TABLE = str.maketrans(_PUNCTUATIONS, ' ' * len(_PUNCTUATIONS))


def cleaning_text(text):
    """Normalize a raw tweet into lowercase, whitespace-separated words.

    Steps, in order: drop URLs, drop the ``#`` sign (the hashtag word is
    kept), replace ``@mention`` handles with a space, drop characters matched
    by the emoji pattern, replace punctuation with spaces, collapse runs of
    whitespace and lowercase the result.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    text = _URL_PATTERN.sub('', text)

    # only the hash sign is removed, the tagged word stays
    text = text.replace('#', '')

    text = _MENTION_PATTERN.sub(' ', text)
    text = _EMOJI_PATTERN.sub('', text)

    # one pass over the text instead of one str.replace per punctuation hit
    text = text.translate(_PUNCTUATION_TABLE)

    # split()/join collapses any run of whitespace into a single space
    return ' '.join(text.split()).lower()

### Remove Stopword

In [6]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

# CONSTRUCT STOPWORDS
# Remote stopword lists (plain text, one word per line) plus NLTK's Indonesian list.
rama_stopword = "https://raw.githubusercontent.com/ramaprakoso/analisis-sentimen/master/kamus/stopword.txt"
yutomo_stopword = "https://raw.githubusercontent.com/yasirutomo/python-sentianalysis-id/master/data/feature_list/stopwordsID.txt"
fpmipa_stopword = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/stopwords-list/fpmipa-stopwords.txt"
sastrawi_stopword = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/stopwords-list/sastrawi-stopwords.txt"
aliakbar_stopword = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/stopwords-list/aliakbars-bilp.txt"
pebahasa_stopword = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/stopwords-list/pebbie-pebahasa.txt"
elang_stopword = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/stopwords-id.txt"
nltk_stopword = stopwords.words('indonesian')

# create path url for each stopword
path_stopwords = [rama_stopword, yutomo_stopword, fpmipa_stopword, sastrawi_stopword,
                  aliakbar_stopword, pebahasa_stopword, elang_stopword]

# combine stopwords
# Copy the NLTK list so extending the combined list does not also grow nltk_stopword.
stopwords_l = list(nltk_stopword)
for path in path_stopwords:
    try:
        # A timeout keeps the cell from hanging on an unreachable host, and
        # raise_for_status() stops an HTTP error page being read as stopwords.
        response = requests.get(path, timeout=10)
        response.raise_for_status()
    except requests.RequestException as err:
        # Best effort: one missing source should not abort the whole notebook.
        print(f'Skipping stopword source {path}: {err}')
        continue
    # strip() drops any stray whitespace around a word; blank lines are skipped
    stopwords_l += [word.strip() for word in response.text.splitlines() if word.strip()]

# Hand-picked informal/slang words, whitespace separated.
custom_st = '''
yg yang dgn ane smpai bgt gua gwa si tu ama utk udh btw
ntar lol ttg emg aj aja tll sy sih kalo nya trsa mnrt nih
ma dr ajaa tp akan bs bikin kta pas pdahl bnyak guys abis tnx
bang banget nang mas amat bangettt tjoy hemm haha sllu hrs lanjut
bgtu sbnrnya trjadi bgtu pdhl sm plg skrg
'''

# create dictionary with unique stopword
st_words = set(stopwords_l)
custom_stopword = set(custom_st.split())

# result stopwords
stop_words = st_words | custom_stopword
print(f'Stopwords: {list(stop_words)[:5]}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Stopwords: ['seingat', 'mana', 'jelaskan', 'tersebut', 'sebenarnya']


In [7]:
# remove stopwords
from nltk import word_tokenize, sent_tokenize

def remove_stopword(text, stop_words=stop_words):
    """Return *text* with every token found in *stop_words* dropped.

    The text is tokenized with NLTK's ``word_tokenize``; the surviving tokens
    are joined back together with single spaces. The default for
    *stop_words* is the module-level collection as it exists when this
    function is defined.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

### Stemming / Lemmatization

In [9]:
!pip install sastrawi -q

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.7/209.7 kB 4.4 MB/s eta 0:00:00

In [10]:
# stemming and lemmatization
import functools

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


@functools.lru_cache(maxsize=1)
def _get_stemmer():
    """Build the Sastrawi stemmer once and hand back the same instance afterwards."""
    return StemmerFactory().create_stemmer()


def stemming_and_lemmatization(text):
    """Return *text* with its words reduced to their stems by Sastrawi.

    The stemmer is created on first use and then reused, instead of building
    a new factory and stemmer for every call (this function is meant to be
    applied to each tweet).

    Args:
        text: The string to stem.

    Returns:
        Whatever ``stemmer.stem(text)`` returns for the input.
    """
    return _get_stemmer().stem(text)

###  Tokenization

In [12]:
# tokenization
def tokenize(text):
    """Split *text* into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

In [13]:
# example: push one sample tweet through every preprocessing step and print each stage
# The sample carries real emoji (U+1F64C U+1F3FC) so the emoji-removal step is exercised;
# the previous literal held mis-encoded characters in their place.
text = 'Agak Laen ini emang agak lain, ya.. Bisa-bisanya jadi film Indonesia kedua terlaris 🙌🏼 #respect https://x.com/alnurulg/status/1761301921846140991?s=20'
print(f'Original text: \n{text}\n')

# cleaning text and lowercase
text = cleaning_text(text)
print(f'Cleaned text: \n{text}\n')

# remove stopwords
text = remove_stopword(text)
print(f'Removed stopword: \n{text}\n')

# stemming and lemmatization
text = stemming_and_lemmatization(text)
print(f'Stemmed and lemmatized: \n{text}\n')

# tokenization (from here on `text` holds the token list, not a string)
text = tokenize(text)
print(f'Tokenized: \n{text}')

Original text: 
Agak Laen ini emang agak lain, ya.. Bisa-bisanya jadi film Indonesia kedua terlaris 🙌🏼 #respect https://x.com/alnurulg/status/1761301921846140991?s=20

Cleaned text: 
agak laen ini emang agak lain ya bisa bisanya jadi film indonesia kedua terlaris respect

Removed stopword: 
laen bisanya film indonesia terlaris respect

Stemmed and lemmatized: 
laen bisa film indonesia laris respect

Tokenized: 
['laen', 'bisa', 'film', 'indonesia', 'laris', 'respect']


In [16]:
# pipeline preprocess
def preprocess(text):
    """Run the preprocessing pipeline on one raw tweet.

    The text is cleaned and lowercased with ``cleaning_text`` and then has
    its stopwords removed with ``remove_stopword``. Stemming/lemmatization
    and tokenization are not applied in this pipeline.
    """
    cleaned = cleaning_text(text)
    return remove_stopword(cleaned)

In [18]:
data

Unnamed: 0,full_text,Unnamed: 1
0,172 days bener bener ya definisi abis di naiki...,Netral
1,Udah siap dibikin nangis nonton cerita Amer &a...,Netral
2,172 DAYS udah tayang di Netflix! Mengangkat ki...,Netral
3,AAAAAA SIAPA BUAT CERITA 172 DAYS NI https://t...,Positif
4,emosi nya I tgk 172 days ni,Netral
...,...,...
135,@whosjaygf AWCH salting deh hari ini aku senen...,Netral
136,@sweetchcco ternyata 172 days baguss,Positif
137,kan baru sadar aku rumah di santri pilihan bun...,Netral
138,@sendalkukus gan coba nonton 172 days,Netral


In [23]:
# Keep the raw frame untouched: work on a copy and add the preprocessed tweet
# text as a new 'Text Tweet' column next to the original 'Text_Tweet'.
preprocessed_data = data.copy()
preprocessed_data.loc[:, 'Text Tweet'] = data['Text_Tweet'].map(preprocess)

In [24]:
preprocessed_data.tail()

Unnamed: 0,Text_Tweet,Sentiment,Text Tweet
135,@whosjaygf AWCH salting deh hari ini aku senen...,Netral,awch salting seneng nonton film smkeluarga kar...
136,@sweetchcco ternyata 172 days baguss,Positif,172 days baguss
137,kan baru sadar aku rumah di santri pilihan bun...,Netral,sadar rumah santri pilihan bunda ck 172 days
138,@sendalkukus gan coba nonton 172 days,Netral,gan coba nonton 172 days
139,@WatchmenID Bukan... 172 days (later) ya Min?,Netral,172 days later min


In [25]:
preprocessed_data['Text Tweet'][0]

'172 days bener bener definisi naikin tingginya dijatuhin sejatuh jatuhnya'

In [29]:
# Keep only the preprocessed text and its sentiment label for modelling.
df = preprocessed_data.loc[:, ['Text Tweet', 'Sentiment']]

In [36]:
df.shape

(140, 2)

## LSTM
Model sentiment analysis ini menggunakan LSTM. LSTM mampu menangani sequence yang panjang dan kompleks, serta memiliki kemampuan untuk mengingat informasi jangka panjang.

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# 'Text Tweet' holds the preprocessed text and 'Sentiment' the class label
texts = df['Text Tweet'].tolist()
labels = df['Sentiment'].tolist()

# Tokenize the text data: map each word to an integer index
max_words = 10000  # Adjust based on your dataset size
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to make them of equal length.
# Stored under its own name: assigning the padded array to `data` used to
# overwrite the source DataFrame, which broke re-running the earlier cells.
max_sequence_length = 100  # Adjust based on your dataset and sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert labels to one-hot encoding; the column order defines the class index.
# An explicit float dtype keeps the targets independent of pandas' default dummy dtype.
labels = pd.get_dummies(labels, dtype='float32')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Build the LSTM model: embedding -> single LSTM layer -> softmax over the label columns
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=len(labels.columns), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the held-out test split is also passed as validation_data, so the
# per-epoch validation metrics and the final evaluation are computed on the same samples.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Model ini memiliki akurasi 0.5, sehingga bisa dikatakan model masih kurang bagus dalam melakukan prediksi. Hal ini bisa jadi disebabkan oleh data yang digunakan masih terlalu sedikit; dalam processing ini data yang digunakan adalah 140 data. Keterbatasan data dapat menyebabkan model tidak dapat memprediksi dengan presisi.

In [31]:
import pickle

# Persist the trained network to disk.
model.save('lstm_sentiment_model.h5')

# Store the fitted tokenizer next to it so inference can rebuild the same
# word-to-index mapping later.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer))

  saving_api.save_model(


In [37]:
import pickle
from keras.models import load_model

# Restore the trained network from disk
loaded_model = load_model('lstm_sentiment_model.h5')

# Restore the tokenizer that was pickled alongside the model
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

# Sentence to classify, run through the same preprocessing as the training text
test_sentence = 'nangis banget nontonnya'
new_sentence = preprocess(test_sentence)

# Turn the sentence into a padded index sequence of the training length
new_sequence = loaded_tokenizer.texts_to_sequences([new_sentence])
new_data = pad_sequences(new_sequence, maxlen=max_sequence_length)

# Class scores for the single input row
predictions = loaded_model.predict(new_data)

# The highest-scoring output index is looked up in the one-hot label columns
predicted_sentiment_label = labels.columns[predictions.argmax(axis=1)[0]]
print(
    f"preprocessing: {new_sentence}",
    f"new_sequence: {new_sequence}",
    f"new_data: {new_data}",
    f"Predicted Sentiment: {predicted_sentiment_label}",
    sep="\n",
)

preprocessing: nangis nontonnya
new_sequence: [[5]]
new_data: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5]]
Predicted Sentiment: Positif


In [38]:
# Second sample sentence, preprocessed the same way as the training text
test_sentence = 'cringe banget filmnya'
new_sentence = preprocess(test_sentence)

# Encode with the loaded tokenizer and pad to the training sequence length
new_sequence = loaded_tokenizer.texts_to_sequences([new_sentence])
new_data = pad_sequences(new_sequence, maxlen=max_sequence_length)

# Score the single padded row with the loaded model
predictions = loaded_model.predict(new_data)

# Pick the label column that matches the highest score of the first (only) row
predicted_sentiment_label = labels.columns[predictions[0].argmax()]
print("\n".join([
    f"preprocessing: {new_sentence}",
    f"new_sequence: {new_sequence}",
    f"new_data: {new_data}",
    f"Predicted Sentiment: {predicted_sentiment_label}",
]))

preprocessing: cringe filmnya
new_sequence: [[17, 352]]
new_data: [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0  17 352]]
Predicted Sentiment: Negatif


Dilihat dari tes-tes di atas, model masih mampu memprediksi dengan benar.

## Link github

Find code and data on github

https://github.com/wellyokt/ML2_DAY24.git