In [2]:
print('hello')

hello


In [1]:
import pandas as pd

try:
    df = pd.read_csv('gojek_reviews_relevant_sentiment.csv', encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('gojek_reviews_relevant_sentiment.csv', encoding='latin-1')
    except Exception as e:
        print(f"Error loading the file: {e}")
        df = None

if df is not None:
    text_column = 'content' # Assuming the column name is 'content' for the reviews

    display(df.head())
    print(f"DataFrame shape: {df.shape}")
    print(f"Text column: {text_column}")

Unnamed: 0,user,review,rating
0,prabandaru koesworo,"Ini aplikasi jangan pake gopaynya, gopaynya ma...",1
1,Jerry h,"Kalau ketentuan untuk semua Gocar, bisa ditera...",2
2,Desy Adela,"Pesan gosend sampai 2 jam Pertama, pesan sudah...",1
3,Dewita Nur Wulandari,pesan makan udh sejam tp gaada driver yang mau...,1
4,Ridwan Mono,Benar2 membantu. Cuma kadang2 apk-nya suka err...,5


DataFrame shape: (500, 3)
Text column: content


<h1>Preprocessing</h1>
<h2>Case folding</h2>

In [2]:
df['review'] = df['review'].str.lower()
print(df['review'].head())

0    ini aplikasi jangan pake gopaynya, gopaynya ma...
1    kalau ketentuan untuk semua gocar, bisa ditera...
2    pesan gosend sampai 2 jam pertama, pesan sudah...
3    pesan makan udh sejam tp gaada driver yang mau...
4    benar2 membantu. cuma kadang2 apk-nya suka err...
Name: review, dtype: object


<h2>Tokenize</h2>

In [3]:
from spacy.lang.id import Indonesian
# load language model bahasa Indonesia
spacy_id = Indonesian()

# Menerapkan model spaCy pada teks dalam kolom 'review'
df['review_tokenized'] = df['review'].apply(lambda text: spacy_id(text))

# Jika ingin menampilkan dataframe
display(df['review_tokenized'])

0      (ini, aplikasi, jangan, pake, gopaynya, ,, gop...
1      (kalau, ketentuan, untuk, semua, gocar, ,, bis...
2      (pesan, gosend, sampai, 2, jam, pertama, ,, pe...
3      (pesan, makan, udh, sejam, tp, gaada, driver, ...
4      (benar2, membantu, ., cuma, kadang2, apk, -nya...
                             ...                        
495    (dapet, driver, susah, banget, ,, pasti, di, c...
496    (lumayan, jarang, gunain, fitur, gocar, ,, ser...
497    (sejak, versi, terbaru, ,, pemesanan, (, go-fo...
498    (aplikasi, yg, mnurut, saya, merugikan, penggu...
499    (nama, gonta, ganti, terus, ,, tapi, slot, dan...
Name: review_tokenized, Length: 500, dtype: object

<h2>Stopword removal</h2>

In [4]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Mendapatkan daftar stopwords dan pembuatan stopword remover
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()
sastrawi_stw_id = factory.get_stop_words()

# Fungsi untuk menghapus stopwords
def remove_stopwords(text):
    # Menghapus stopwords langsung menggunakan Sastrawi
    return stopword_remover.remove(text)

# Menerapkan fungsi ke kolom 'review'
df['no_stopword'] = df['review'].apply(remove_stopwords)

In [5]:
df.head()

Unnamed: 0,user,review,rating,review_tokenized,no_stopword
0,prabandaru koesworo,"ini aplikasi jangan pake gopaynya, gopaynya ma...",1,"(ini, aplikasi, jangan, pake, gopaynya, ,, gop...","aplikasi jangan pake gopaynya, gopaynya makan ..."
1,Jerry h,"kalau ketentuan untuk semua gocar, bisa ditera...",2,"(kalau, ketentuan, untuk, semua, gocar, ,, bis...","kalau ketentuan semua gocar, diterapkan kyk pe..."
2,Desy Adela,"pesan gosend sampai 2 jam pertama, pesan sudah...",1,"(pesan, gosend, sampai, 2, jam, pertama, ,, pe...","pesan gosend 2 jam pertama, pesan dapat driver..."
3,Dewita Nur Wulandari,pesan makan udh sejam tp gaada driver yang mau...,1,"(pesan, makan, udh, sejam, tp, gaada, driver, ...",pesan makan udh sejam tp gaada driver mau ambi...
4,Ridwan Mono,benar2 membantu. cuma kadang2 apk-nya suka err...,5,"(benar2, membantu, ., cuma, kadang2, apk, -nya...",benar2 membantu. cuma kadang2 apk-nya suka err...


<h2>Stemming</h2>

In [6]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stemmer = StemmerFactory().create_stemmer()

def stemmed(text):
    # Apply stemming to the text
    return stemmer.stem(text)

df['stemmed_review'] = df['review'].apply(stemmed)

In [7]:
df.head()

Unnamed: 0,user,review,rating,review_tokenized,no_stopword,stemmed_review
0,prabandaru koesworo,"ini aplikasi jangan pake gopaynya, gopaynya ma...",1,"(ini, aplikasi, jangan, pake, gopaynya, ,, gop...","aplikasi jangan pake gopaynya, gopaynya makan ...",ini aplikasi jangan pake gopaynya gopaynya mak...
1,Jerry h,"kalau ketentuan untuk semua gocar, bisa ditera...",2,"(kalau, ketentuan, untuk, semua, gocar, ,, bis...","kalau ketentuan semua gocar, diterapkan kyk pe...",kalau tentu untuk semua gocar bisa terap kyk a...
2,Desy Adela,"pesan gosend sampai 2 jam pertama, pesan sudah...",1,"(pesan, gosend, sampai, 2, jam, pertama, ,, pe...","pesan gosend 2 jam pertama, pesan dapat driver...",pesan gosend sampai 2 jam pertama pesan sudah ...
3,Dewita Nur Wulandari,pesan makan udh sejam tp gaada driver yang mau...,1,"(pesan, makan, udh, sejam, tp, gaada, driver, ...",pesan makan udh sejam tp gaada driver mau ambi...,pesan makan udh jam tp gaada driver yang mau a...
4,Ridwan Mono,benar2 membantu. cuma kadang2 apk-nya suka err...,5,"(benar2, membantu, ., cuma, kadang2, apk, -nya...",benar2 membantu. cuma kadang2 apk-nya suka err...,benar2 bantu cuma kadang2 apk-nya suka error p...


<h2>Lemmatize</h2>

In [8]:
I = "perayaan itu berbarengan dengan saat kita bepergian ke Jogjakarta"
idn = spacy_id(I)
print( ' '.join( k.lemma_ for k in idn ) )

        


In [9]:
print([k.lemma_ for k in spacy_id("Perayaan Bepergian")])

['', '']


In [8]:
import pandas as pd
from transformers import pipeline

# Load the NLP pipeline
nlp = pipeline(
    "token-classification", 
    model="cahya/bert-base-indonesian-NER",
    tokenizer="cahya/bert-base-indonesian-NER"
)

# Function to lemmatize text using the transformer model
def lemmatize_with_transformer(text):
    # Skip processing if text is not a string or is empty
    if not isinstance(text, str) or len(text.strip()) == 0:
        return ""
    
    # Use the model to get tokens and tags
    results = nlp(text)
    
    # Extract tokens
    tokens = []
    current_token = ""
    for result in results:
        if result['word'].startswith('##'):
            current_token += result['word'][2:]
        else:
            if current_token:
                tokens.append(current_token)
            current_token = result['word']
    if current_token:
        tokens.append(current_token)
    
    # Join tokens to form lemmatized text
    return " ".join(tokens)

# Apply the lemmatization function to the 'review' column
# Replace 'review' with your actual column name
df['lemmatized_review'] = df['review'].apply(lemmatize_with_transformer)

# Save the updated DataFrame to a new CSV if needed
df.to_csv('lemmatized_output.csv', index=False)

print("Lemmatization complete!")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at cahya/bert-base-indonesian-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to us

Lemmatization complete!
