# Proyek Analisis Sentimen pada Google Reviews
- **Nama:** Yosriko Rahmat Karoni Sabelekake
- **Email:** yosrikosabelekake@gmail.com
- **ID Dicoding:** yosriko


# Import modul yang diperlukan

In [1]:
!pip install keras



In [2]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [3]:
!pip install transformers torch



In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Data Sentiment Labeling

In [6]:
data = pd.read_csv('/content/dataset/playstore_reviews.csv')
data = data[['Review']]

In [10]:
# Load the saved model and the TF-IDF vectorizer
svm_model = joblib.load('/content/svm_model_scenario_1.pkl')
tfidf = joblib.load('/content/tfidf_vectorizer_scenario_1.pkl')

# Function to predict sentiment of custom sentences
def predict_custom_sentences(custom_sentences):
    # Replace None, NaN, or empty strings with an empty string
    custom_sentences = [sentence if isinstance(sentence, str) else "" for sentence in custom_sentences]

    # Transform custom sentences using the loaded TF-IDF vectorizer
    X_custom = tfidf.transform(custom_sentences)

    # Predict using the loaded SVM model
    predictions = svm_model.predict(X_custom)

    return predictions

# Applying the predictions to the DataFrame
def label_sentiment(text):
    # Use the predict_custom_sentences function to get the sentiment
    prediction = predict_custom_sentences([text])[0]  # Get the first prediction for a single text
    return prediction

# Applying the function to the 'Text' column in your DataFrame
data['Sentiment'] = data['Review'].apply(label_sentiment)


In [11]:
print(data.head())
print(len(data))

                                              Review Sentiment
0  Konsep yang keren menggabungkan 2d dengan 3d d...   Positif
1  Lumayan sulit buat ku karena ada tikus tanah y...   Positif
2  Gamenya bakalan sangat boring di awal2, karna ...   Positif
3  Untuk update selanjutnya tolong tambahkan bebe...   Positif
4  Ini game pembuat nya gak mau buat player senen...   Positif
45588


# Pre-processing Data


In [12]:
def preprocess_text(text):
    # Case Folding: convert text to lowercase
    text = str(text)
    text = text.lower()

    # Removal Special Characters and Digits: remove special characters, digits, and punctuations
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword Removal (using NLTK and Sastrawi)
    nltk_stopwords = set(stopwords.words('indonesian'))
    filtered_tokens = [word for word in tokens if word not in nltk_stopwords]

    return ' '.join(filtered_tokens)

data['Cleaned_Review'] = data['Review'].apply(preprocess_text)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45588 entries, 0 to 45587
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Review          45587 non-null  object
 1   Sentiment       45588 non-null  object
 2   Cleaned_Review  45588 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


# Scenario 1: SVM, TF-IDF, 80/20 Split


In [14]:
def scenario_1(data):
    # TF-IDF
    tfidf = TfidfVectorizer(max_features=1000)
    X = tfidf.fit_transform(data['Cleaned_Review'])
    y = data['Sentiment']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train SVM
    svm_model = SVC()
    svm_model.fit(X_train, y_train)

    # Evaluation
    y_pred = svm_model.predict(X_test)
    print("Scenario 1 - SVM, TF-IDF, 80/20 Split:")
    print(classification_report(y_test, y_pred))


# Scenario 2: RF, Word2Vec, 80/20 Split

In [15]:
def scenario_2(data):
    # Word2Vec
    tokenized_reviews = [review.split() for review in data['Cleaned_Review']]
    w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

    # Transform reviews into average Word2Vec vectors
    def get_avg_word2vec(tokens):
        vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        if len(vectors) == 0:
            return np.zeros(100)
        return np.mean(vectors, axis=0)

    X = np.array([get_avg_word2vec(tokens) for tokens in tokenized_reviews])
    y = data['Sentiment']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Random Forest
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

    # Evaluation
    y_pred = rf_model.predict(X_test)
    print("\nScenario 2 - RF, Word2Vec, 80/20 Split:")
    print(classification_report(y_test, y_pred))

# Scenario 3: RF, TF-IDF, 70/30 Split


In [16]:
def scenario_3(data):
    # TF-IDF
    tfidf = TfidfVectorizer(max_features=1000)
    X = tfidf.fit_transform(data['Cleaned_Review'])
    y = data['Sentiment']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train Random Forest
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

    # Evaluation
    y_pred = rf_model.predict(X_test)
    print("\nScenario 3 - RF, TF-IDF, 70/30 Split:")
    print(classification_report(y_test, y_pred))

# Running all Scenarios

In [17]:
scenario_1(data)

Scenario 1 - SVM, TF-IDF, 80/20 Split:
              precision    recall  f1-score   support

     Negatif       0.90      0.90      0.90       968
      Netral       0.86      0.77      0.81       411
     Positif       0.98      0.99      0.99      7739

    accuracy                           0.97      9118
   macro avg       0.91      0.89      0.90      9118
weighted avg       0.97      0.97      0.97      9118



In [18]:
scenario_2(data)


Scenario 2 - RF, Word2Vec, 80/20 Split:
              precision    recall  f1-score   support

     Negatif       0.75      0.69      0.72       968
      Netral       0.66      0.40      0.50       411
     Positif       0.94      0.97      0.96      7739

    accuracy                           0.92      9118
   macro avg       0.79      0.69      0.72      9118
weighted avg       0.91      0.92      0.91      9118



In [19]:
scenario_3(data)


Scenario 3 - RF, TF-IDF, 70/30 Split:
              precision    recall  f1-score   support

     Negatif       0.74      0.82      0.78      1484
      Netral       0.75      0.48      0.59       703
     Positif       0.97      0.97      0.97     11490

    accuracy                           0.93     13677
   macro avg       0.82      0.76      0.78     13677
weighted avg       0.93      0.93      0.93     13677

