## Import Package

In [104]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
seed = 0
np.random.seed(seed)
import matplotlib.pyplot as plt
import seaborn as sns

import csv
import requests
from io import StringIO

import datetime as dt
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

!pip install sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from wordcloud import WordCloud

import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Import Data

In [105]:
data = pd.read_json('/content/product_reviews.json')

## Data Eksplorasi

In [106]:
data.shape

(12979, 2)

In [107]:
data.head()

Unnamed: 0,rating,review
0,5,ok dapat free invisible stick
1,5,Nice
2,5,barang telah diterima dengan baik\nterima kasih
3,5,"Barang Sudah sampai,, semoga semua Fitu berfun..."
4,5,"Lengkap sesuai list, trusted"


In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12979 entries, 0 to 12978
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  12979 non-null  int64 
 1   review  12974 non-null  object
dtypes: int64(1), object(1)
memory usage: 202.9+ KB


## Data Cleaning

In [109]:
clean_data = data.dropna()

In [110]:
clean_data = clean_data.drop_duplicates()


In [111]:
clean_data.shape

(12974, 2)

In [112]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'RT[\s]', '', text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip(' ')
    return text

def casefoldingText(text):
    text = text.lower()
    return text

def tokenizingText(text):
    text = word_tokenize(text)
    return text

def filteringText(text):
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text):

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()


    words = text.split()


    stemmed_words = [stemmer.stem(word) for word in words]


    stemmed_text = ' '.join(stemmed_words)

    return stemmed_text

def toSentence(list_words):
    sentence = ' '.join(word for word in list_words)
    return sentence

In [113]:
slangwords = {"@": "di", "abis": "habis", "wtb": "beli", "masi": "masih", "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal"}
def fix_slangwords(text):
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text

In [114]:
clean_data['text_casefoldingText'] = clean_data['review'].apply(casefoldingText)

clean_data['text_slangwords'] = clean_data['text_casefoldingText'].apply(fix_slangwords)

clean_data['text_tokenizingText'] = clean_data['text_slangwords'].apply(tokenizingText)

clean_data['text_stopword'] = clean_data['text_tokenizingText'].apply(filteringText)

clean_data['text_akhir'] = clean_data['text_stopword'].apply(toSentence)

In [115]:
clean_data

Unnamed: 0,rating,review,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir
0,5,ok dapat free invisible stick,ok dapat free invisible stick,ok dapat free invisible stick,"[ok, dapat, free, invisible, stick]","[ok, free, invisible, stick]",ok free invisible stick
1,5,Nice,nice,nice,[nice],[nice],nice
2,5,barang telah diterima dengan baik\nterima kasih,barang telah diterima dengan baik\nterima kasih,barang telah diterima dengan baik terima kasih,"[barang, telah, diterima, dengan, baik, terima...","[barang, diterima, terima, kasih]",barang diterima terima kasih
3,5,"Barang Sudah sampai,, semoga semua Fitu berfun...","barang sudah sampai,, semoga semua fitu berfun...","barang sudah sampai,, semoga semua fitu berfun...","[barang, sudah, sampai, ,, ,, semoga, semua, f...","[barang, ,, ,, semoga, fitu, berfungsi, dgn, ..]","barang , , semoga fitu berfungsi dgn .."
4,5,"Lengkap sesuai list, trusted","lengkap sesuai list, trusted","lengkap sesuai list, trusted","[lengkap, sesuai, list, ,, trusted]","[lengkap, sesuai, list, ,, trusted]","lengkap sesuai list , trusted"
...,...,...,...,...,...,...,...
12974,5,"sellernya top markotop, respon cepat dan memba...","sellernya top markotop, respon cepat dan memba...","sellernya top markotop, respon cepat dan memba...","[sellernya, top, markotop, ,, respon, cepat, d...","[sellernya, top, markotop, ,, respon, cepat, m...","sellernya top markotop , respon cepat membantu..."
12975,5,produk bagus dan pengiriman cepat. mantap........,produk bagus dan pengiriman cepat. mantap........,produk bagus dan pengiriman cepat. mantap........,"[produk, bagus, dan, pengiriman, cepat, ., man...","[produk, bagus, pengiriman, cepat, ., mantap, ...",produk bagus pengiriman cepat . mantap ..........
12976,5,"Respon di diskusi agak lama, tapi barang bagus...","respon di diskusi agak lama, tapi barang bagus...","respon di diskusi agak lama, tapi barang bagus...","[respon, di, diskusi, agak, lama, ,, tapi, bar...","[respon, diskusi, ,, barang, bagus, dites, ber...","respon diskusi , barang bagus dites berfungsi ..."
12977,5,barangnya bagus dan termurah yg sy pilih semog...,barangnya bagus dan termurah yg sy pilih semog...,barangnya bagus dan termurah yg sy pilih semog...,"[barangnya, bagus, dan, termurah, yg, sy, pili...","[barangnya, bagus, termurah, yg, sy, pilih, se...",barangnya bagus termurah yg sy pilih semoga ms...


## Labelling Data

In [116]:
lexicon_positive = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')

if response.status_code == 200:

    reader = csv.reader(StringIO(response.text), delimiter=',')


    for row in reader:

        lexicon_positive[row[0]] = int(row[1])

else:
    print("Failed to fetch positive lexicon data")

lexicon_negative = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')

if response.status_code == 200:

    reader = csv.reader(StringIO(response.text), delimiter=',')


    for row in reader:

        lexicon_negative[row[0]] = int(row[1])

else:
    print("Failed to fetch negative lexicon data")

In [117]:
def sentiment_analysis_lexicon_indonesia(text):

    score = 0

    for word in text:


        if (word in lexicon_positive):
            score = score + lexicon_positive[word]


    for word in text:


        if (word in lexicon_negative):
            score = score + lexicon_negative[word]


    polarity=''

    if (score >= 0):
        polarity = 'positive'

    elif (score < 0):
        polarity = 'negative'



    return score, polarity

In [118]:
results = clean_data['text_stopword'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
clean_data['polarity_score'] = results[0]
clean_data['polarity'] = results[1]
print(clean_data['polarity'].value_counts())

polarity
positive    10397
negative     2577
Name: count, dtype: int64


## Split Data

In [119]:
X = clean_data['text_akhir']
y = clean_data['polarity']


tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
X_tfidf = tfidf.fit_transform(X)


features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())


features_df


X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

## Training Model

In [120]:
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train.toarray(), y_train)

y_pred_train_lr = logistic_regression.predict(X_train.toarray())
y_pred_test_lr = logistic_regression.predict(X_test.toarray())

accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)

accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)

print('Logistic Regression - accuracy_train:', accuracy_train_lr)
print('Logistic Regression - accuracy_test:', accuracy_test_lr)

Logistic Regression - accuracy_train: 0.9088544175739474
Logistic Regression - accuracy_test: 0.900578034682081
