In [25]:
# 1. Import Library
import pandas as pd
import numpy as np
import re
import string
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import csv

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# 2. Load Dataset
df = pd.read_csv('IMDB_Dataset.csv', on_bad_lines='skip', quoting=csv.QUOTE_NONNUMERIC)
df.dropna(inplace=True)

In [27]:
# 3. Preprocessing Function
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def clean_text(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\s+', ' ', text).strip()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['clean_review'] = df['review'].apply(clean_text)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# 4. Tokenization and Padding
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_review'])
sequences = tokenizer.texts_to_sequences(df['clean_review'])
padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X = padded
y = df['label'].values

# 5. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Build LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# 7. Train Model
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# 8. Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = (model.predict(X_test) > 0.5).astype("int32")

print(f"✅ Test Accuracy: {accuracy:.4f}")
print("\n✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n✅ Classification Report:\n", classification_report(y_test, y_pred))

# 9. Save Model and Tokenizer
model.save('sentiment_lstm_model.h5')

import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)



Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 700ms/step - accuracy: 0.5029 - loss: 0.6942 - val_accuracy: 0.5096 - val_loss: 0.6924
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 660ms/step - accuracy: 0.5108 - loss: 0.6932 - val_accuracy: 0.5156 - val_loss: 0.6878
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 687ms/step - accuracy: 0.5446 - loss: 0.6681 - val_accuracy: 0.5291 - val_loss: 0.6806
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 692ms/step - accuracy: 0.5611 - loss: 0.6450 - val_accuracy: 0.5425 - val_loss: 0.6933
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 685ms/step - accuracy: 0.5573 - loss: 0.6261 - val_accuracy: 0.5376 - val_loss: 0.7125
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 697ms/step - accuracy: 0.6478 - loss: 0.5652 - val_accuracy: 0.8378 - val_loss: 0.3924
Epoc



✅ Test Accuracy: 0.8676

✅ Confusion Matrix:
 [[4090  871]
 [ 453 4586]]

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.82      0.86      4961
           1       0.84      0.91      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [71]:
model.save("sentiment_lstm_model.keras")


In [68]:
import tensorflow as tf

model = tf.keras.models.load_model("sentiment_lstm_model.h5")
model.export("sentiment_lstm_model")  # ← ini yang benar sekarang




Saved artifact at 'sentiment_lstm_model'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 200), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  133640587454480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133640477982800: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133640477984720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133640477985296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133640477984912: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133640477986064: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133640477985872: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133640477987600: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [69]:
import shutil

shutil.make_archive("sentiment_lstm_model", 'zip', "sentiment_lstm_model")


'/content/sentiment_lstm_model.zip'

In [70]:
from google.colab import files

files.download("sentiment_lstm_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [54]:
%%writefile app.py
import streamlit as st
import tensorflow as tf
import pickle
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tensorflow.keras.preprocessing.sequence import pad_sequences

model = tf.keras.models.load_model('sentiment_lstm_model.h5')
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

max_len = 200

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def clean_text(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\s+', ' ', text).strip()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

st.set_page_config(page_title="Sentiment Analyzer", layout="centered")
st.title("🧠 Sentiment Analysis with LSTM")

if "log_df" not in st.session_state:
    st.session_state.log_df = pd.DataFrame(columns=["Original Text", "Prediction", "Confidence"])

user_input = st.text_area("✍️ Your Review", "")

if st.button("🔍 Analyze"):
    if user_input.strip() == "":
        st.warning("Please enter a review first.")
    else:
        cleaned = clean_text(user_input)
        seq = tokenizer.texts_to_sequences([cleaned])
        padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
        prob = model.predict(padded)[0][0]
        pred = prob > 0.5
        label = "😊 Positive" if pred else "😠 Negative"
        confidence = f"{prob*100:.2f}%" if pred else f"{(1 - prob)*100:.2f}%"

        st.success(f"Prediction: {label}")
        st.info(f"Confidence: {confidence}")

        words = user_input.split()
        word_count = len(words)
        char_count = len(user_input)
        avg_word_len = char_count / word_count if word_count else 0

        st.subheader("📊 Text Statistics")
        st.markdown(f"""
        - **Word count:** {word_count}
        - **Character count:** {char_count}
        - **Avg. word length:** {avg_word_len:.2f}
        """)

        new_row = {
            "Original Text": user_input,
            "Prediction": label,
            "Confidence": confidence
        }
        st.session_state.log_df = pd.concat([st.session_state.log_df, pd.DataFrame([new_row])], ignore_index=True)

if not st.session_state.log_df.empty:
    st.subheader("📝 Prediction Log")
    st.dataframe(st.session_state.log_df[::-1], use_container_width=True)

    csv = st.session_state.log_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="📥 Download Log as CSV",
        data=csv,
        file_name='sentiment_prediction_log.csv',
        mime='text/csv',
    )

    if st.button("🗑️ Clear Log"):
        st.session_state.log_df = pd.DataFrame(columns=["Original Text", "Prediction", "Confidence"])
        st.success("Log has been cleared.")


Overwriting app.py


In [52]:
!pip install -q pyngrok streamlit nltk
!nltk.download('stopwords')

/bin/bash: -c: line 1: syntax error near unexpected token `'stopwords''
/bin/bash: -c: line 1: `nltk.download('stopwords')'


In [57]:
!npm install -g wait-on

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K
added 34 packages in 8s
[1G[0K⠇[1G[0K
[1G[0K⠇[1G[0K7 packages are looking for funding
[1G[0K⠇[1G[0K  run `npm fund` for details
[1G[0K⠇[1G[0K

In [53]:
!ngrok config add-authtoken UR AUTH TOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [62]:
!streamlit run app.py & npx wait-on http://localhost:8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.236.189.217:8501[0m
[0m
[1G[0K⠙[1G[0K



[34m  Stopping...[0m


In [63]:
from pyngrok import ngrok

# Buka tunnel ke localhost:8501 (default Streamlit port)
public_url = ngrok.connect("http://localhost:8501")
print(f"✅ Streamlit app is live at: {public_url}")


✅ Streamlit app is live at: NgrokTunnel: "https://0d06-35-236-189-217.ngrok-free.app" -> "http://localhost:8501"
