# SETUP

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from google_play_scraper import reviews, Sort
from pandas import DataFrame
from string import punctuation
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nlp_id.lemmatizer import Lemmatizer
from requests import get
from io import StringIO
from IPython.display import clear_output

import nltk, re, json

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/yelf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/yelf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Build slangwords dict

In [6]:
# BUILD slangwords dictionary
# slangwords_dict = {}
# with open('./slangwords.txt', 'r') as file :
#     slangwords = file.readlines()
#     for slang in slangwords:
#         slang = slang.replace('\n', '')
#         slang = slang.split('\t')
#         # print(slang)
#         slangwords_dict.update({slang[0]:slang[1]})
        
# with open('slangwords_dict.txt', 'w') as file:
#     file.write(json.dumps(slangwords_dict))

# Scrapping Dataset

### download data

In [7]:
minecraft_reviews = reviews(
    'com.mojang.minecraftpe',
    lang = 'id',
    country = 'id',
    sort = Sort.MOST_RELEVANT,
    count = 15000
)
# Review the data
for review in minecraft_reviews[0]:
    print(review['content'])

Gameplay sudah bagus, tapi ada sedikit bug pada controlernya. Controler sering kereset, padahal saya sudah tata controler senyamannya. Dan controler tidak bisa diubah tata letaknya, setelah kereset. Tolong perbaiki bug ini karena sangat menggangu permainan saya sering kali keluar world atau relog untuk membenarkan controlernya. Dan itu sangat memakan waktu bagi player. Tolong benarkan bug ini.
Bagus sih,cuma tolong fix di bagian freeze. Maksud saya,layar freeze ini terjadi tidak pada android low end,tapi mid end,Helio g85 dan ram 6 masih mengalami kendala lag,tidak tahu mengapa,namun semenjak renderdragon,sistem rendering yang berat membuat Minecraft menjadi tidak optimal. Tolong di tingkatkan lagi performance nya.a good game it won't work if it lags
Sudah sangat bagus, saya sudah bermain sangat lama, untuk masalah frezze di game, itu karena pengaturan fps yang tak terbatas, di bagian pengaturan video paling bawah, bisa di sesuaikan sesuai device ya, jika ram 2 cukup di 30 fps, jika ra

### create dataframe

In [8]:
minecraft_df = DataFrame(minecraft_reviews[0])

x_review, y_review = minecraft_df.shape
print("Record :", x_review, "baris")
print("Field  :", y_review, "kolom")

Record : 15000 baris
Field  : 11 kolom


# Text Preprocessing

### preprocessing function

In [9]:
# REMOVE SPECIAL CHARACTERS & CASE FOLDING
def cleaning(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', str(text)) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text.lower()

# REMOVE SLANG WORDS
def fixSlangwords(text):
    with open('./slangwords_dict.txt', 'r') as file :
        slangwords = json.loads(file.readline())
        # print(slangwords['gua'])
    
    words = text.split()
    fixed_words = []
    for word in words:
        # print(word.lower() in slangwords)
        if word.lower() in slangwords:
            # print(slangwords[word.lower()])
            fixed_words.append(slangwords[word.lower()])
        else :
            # print(word)
            fixed_words.append(word)
            
        # text = 'gue gua saya @ ambilin'
        # print(fixSlangwords(text))
    
    return ' '.join(fixed_words)

# TOKENIZING
def tokenizer(text):
    return word_tokenize(text)

# REMOVE STOPWORDS
def stopwordsRemove(text):
    with open('stopwords.txt', 'r') as file:
        stopwords = file.readlines()
    stopwordsDict =[]
    for word in stopwords:
        word = word.replace('\n', '')
        stopwordsDict.append(word)
    # print(stopwordsDict)
    # print('every' in stopwordsDict)
    # text = word.tokenize(text)
    # return [word not in stopwordsDict for word in text]
    
    fix_words = []
    for txt in text:
        if txt not in stopwordsDict:
            fix_words.append(txt)
    return fix_words
    
# LEMMATIZING
def lemmatizerWord(text):
    lemmatizer = Lemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in text]

def lemmatizing(data):
    lemmatized_word = []
#     len_data = len(data.tolist())
#     for i,text in enumerate(data.tolist()):
#         lemmatized_word.append(lemmatizerWord(text))
        
#         progress = (i+1) / len_data * 100
#         print("Data-{} | Progress: {:.2f}% | => {}".format(i, progress, lemmatizerWord(text)))
#         clear_output(wait=True)
    return lemmatized_word

# STEMMING
def stemmerWord(text):
    stemmer = StemmerFactory().create_stemmer()
    return [stemmer.stem(word) for word in text]
    
    # stem = minecraft_df['text_stopword'].tolist()
    # stemmed_word = []
    # for i in range(len(stem)):
    #     stemmed_word.append(lemmatizerWord(stem[i]))
    #     print(i+1, lemmatizerWord(stem[i]))
    #     print("Len array : ", len(stemmed_word))
    #     clear_output(wait=True)
    # return stemmed_word

def toSentence(text):
    return ' '.join(word for word in text)

### Apply to dataset

In [10]:
print("Proses Cleaning     : START")
minecraft_df['text_clean'] = minecraft_df.content.apply(cleaning)
print("Proses Cleaning     : DONE\n")

print("Proses Slangword    : START")
minecraft_df['text_slangwords'] = minecraft_df['text_clean'].apply(fixSlangwords)
print("Proses Slangword    : DONE\n")

print("Proses Tokenizing   : START")
minecraft_df['text_tokenizingText'] = minecraft_df['text_slangwords'].apply(tokenizer)
print("Proses Tokenizing   : DONE\n")

print("Proses Stopword     : START")
minecraft_df['text_stopword'] = minecraft_df['text_tokenizingText'].apply(stopwordsRemove)
print("Proses Stopword     : DONE\n")

print("Proses Lemmatizing     : START")
minecraft_df['text_lemmatizing'] = minecraft_df['text_stopword'].apply(lemmatizerWord)
print("Proses Lemmatizing     : DONE\n")

print("Proses Final        : START")
minecraft_df['text_akhir'] = minecraft_df['text_stemming'].apply(toSentence)
print("Proses Final        : DONE\n")

Proses Cleaning     : START
Proses Cleaning     : DONE

Proses Slangword    : START
Proses Slangword    : DONE

Proses Tokenizing   : START
Proses Tokenizing   : DONE

Proses Stopword     : START
Proses Stopword     : DONE

Proses Lemmatizing     : START
Proses Lemmatizing     : DONE

Proses Final        : START


KeyError: 'text_stemming'

# Labelling

In [None]:
response = get(url='https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
def create_dict(url, dictType):
    response = get(url=url)
    if response.status_code == 200:
        reader = csv.reader(StringIO(response.text), delimiter=',')
        if dictType == 'positive':
            print("Fetching {} lexicon data".format(dictType))
            for row in reader:
                lexicon_positive[row[0]] = int(row[1])
        else:
            print("Fetching {} lexicon data".format(dictType))
            for row in reader:
                lexicon_negative[row[0]] = int(row[1])
    else:
        print("Failed to fetch lexicon data")

lexicon_positive, lexicon_negative = {}, {}
def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    for word in text:
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]
    for word in text:
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]
    sentiment='' 
    if (score > 0) :sentiment = 'positive'
    elif (score < 0) :sentiment = 'negative'
    else: sentiment = 'neutral'
    return score, sentiment

create_dict('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv', "positive")
create_dict('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv', "negative")
print("")

results = minecraft_df['text_stemming'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
minecraft_df['polarity_score'] = results[0]
minecraft_df['sentiment'] = results[1]
print(minecraft_df['sentiment'].value_counts())

# Save Dataset