<a href="https://colab.research.google.com/github/windyrahayu45/Proyek-Analisis-Sentimen/blob/main/Proyek_Analisis_Sentimen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

import library

In [1]:
import requests
import json
import pandas as pd
import re
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pastikan stopwords bahasa Indonesia tersedia
nltk.download("stopwords")
stop_words = set(stopwords.words("indonesian"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
import requests
import pandas as pd
import time

# Shop ID dan Item ID dari URL produk Shopee Skin 1004
SHOP_ID = 555954448
ITEM_ID = 20748780200
MAX_REVIEWS = 3000
LIMIT = 50

# Set Header & Cookies dari Browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Referer": f"https://shopee.co.id/product/{SHOP_ID}/{ITEM_ID}/",
}



# Fungsi scraping dengan session
def scrape_shopee_reviews(shop_id, item_id, max_reviews=3000):
    session = requests.Session()
    session.headers.update(HEADERS)

    reviews = []
    offset = 0

    while len(reviews) < max_reviews:
        print(f"🔄 Mengambil data dari offset {offset}...")
        url = f"https://shopee.co.id/api/v2/item/get_ratings?itemid={item_id}&shopid={shop_id}&offset={offset}&limit={LIMIT}&type=0"

        try:
            response = session.get(url)
            if response.status_code != 200:
                print("⚠️ Gagal mengambil data, status code:", response.status_code)
                break

            data = response.json()
            if "data" in data and "ratings" in data["data"]:
                for review in data["data"]["ratings"]:
                    reviews.append({
                        "Username": review.get("author_username", ""),
                        "Rating": review.get("rating_star", 0),
                        "Komentar": review.get("comment", "")
                    })

            offset += LIMIT
            if len(data["data"]["ratings"]) < LIMIT:
                break

            time.sleep(1)

        except Exception as e:
            print("⚠️ Terjadi kesalahan:", str(e))
            break

    return pd.DataFrame(reviews)

# Jalankan scraping
df_reviews = scrape_shopee_reviews(SHOP_ID, ITEM_ID, MAX_REVIEWS)

# Simpan hasil scraping ke file CSV
df_reviews.to_csv("shopee_reviews_fixed.csv", index=False)
print("✅ Scraping selesai! Data disimpan sebagai 'shopee_reviews_fixed.csv'.")


🔄 Mengambil data dari offset 0...
🔄 Mengambil data dari offset 50...
🔄 Mengambil data dari offset 100...
🔄 Mengambil data dari offset 150...
🔄 Mengambil data dari offset 200...
🔄 Mengambil data dari offset 250...
🔄 Mengambil data dari offset 300...
🔄 Mengambil data dari offset 350...
🔄 Mengambil data dari offset 400...
🔄 Mengambil data dari offset 450...
🔄 Mengambil data dari offset 500...
🔄 Mengambil data dari offset 550...
🔄 Mengambil data dari offset 600...
🔄 Mengambil data dari offset 650...
🔄 Mengambil data dari offset 700...
🔄 Mengambil data dari offset 750...
🔄 Mengambil data dari offset 800...
🔄 Mengambil data dari offset 850...
🔄 Mengambil data dari offset 900...
🔄 Mengambil data dari offset 950...
🔄 Mengambil data dari offset 1000...
🔄 Mengambil data dari offset 1050...
🔄 Mengambil data dari offset 1100...
🔄 Mengambil data dari offset 1150...
🔄 Mengambil data dari offset 1200...
🔄 Mengambil data dari offset 1250...
🔄 Mengambil data dari offset 1300...
🔄 Mengambil data dari of

In [4]:
# Load dataset hasil scraping
df = pd.read_csv("shopee_reviews_fixed.csv")

# Fungsi membersihkan teks ulasan
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r"\d+", "", text)
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = " ".join([word for word in text.split() if word not in stop_words])
        return text
    return ""

df["Komentar_Bersih"] = df["Komentar"].apply(clean_text)

# Label sentimen berdasarkan rating
def label_sentiment(rating):
    if rating >= 4:
        return "Positif"
    elif rating == 3:
        return "Netral"
    else:
        return "Negatif"

df["Sentimen"] = df["Rating"].apply(label_sentiment)

# Simpan hasil preprocessing
df.to_csv("shopee_reviews_labeled.csv", index=False)
print("✅ Data berhasil diproses! Disimpan sebagai 'shopee_reviews_labeled.csv'.")


✅ Data berhasil diproses! Disimpan sebagai 'shopee_reviews_labeled.csv'.


In [5]:
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df["Komentar_Bersih"]).toarray()
y = df["Sentimen"].map({"Positif": 1, "Netral": 0, "Negatif": -1})

# Split data menjadi training & testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Model Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Model SVM
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Model Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)


In [7]:
def evaluate_model(model_name, y_true, y_pred):
    print(f"\n📊 Evaluasi Model: {model_name}")
    print(f"Akurasi: {accuracy_score(y_true, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

# Evaluasi masing-masing model
evaluate_model("Naive Bayes", y_test, nb_pred)
evaluate_model("SVM", y_test, svm_pred)
evaluate_model("Random Forest", y_test, rf_pred)



📊 Evaluasi Model: Naive Bayes
Akurasi: 0.9950
Confusion Matrix:
[[  0   0   1]
 [  0   0   2]
 [  0   0 597]]
Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         2
           1       0.99      1.00      1.00       597

    accuracy                           0.99       600
   macro avg       0.33      0.33      0.33       600
weighted avg       0.99      0.99      0.99       600


📊 Evaluasi Model: SVM
Akurasi: 0.9950
Confusion Matrix:
[[  0   0   1]
 [  0   0   2]
 [  0   0 597]]
Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         2
           1       0.99      1.00      1.00       597

    accuracy                           0.99       600
   macro avg       0.33      0.33      0.33       600
weighted avg       0.99      0.99

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
