# Task
lakukan preprocessing: casefolding, transformasi, tokenizing, stopword removal, stemming, lemmatization. Lalu hitung frekuensi/bobot kata menggunakan bag of word. Dan akhirnya buat text classification dengan data tersebut menggunakan transformer

Here is all the data you need:
"gojek_reviews_relevant.csv"

## Data loading

### Subtask:
Load the data from the "gojek_reviews_relevant.csv" file into a pandas DataFrame.


**Reasoning**:
Load the data from the CSV file into a pandas DataFrame and identify the text column.



In [None]:
!pip install huggingface_hub



In [None]:
import pandas as pd

try:
    df = pd.read_csv('gojek_reviews_relevant.csv', encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('gojek_reviews_relevant.csv', encoding='latin-1')
    except Exception as e:
        print(f"Error loading the file: {e}")
        df = None

if df is not None:
    text_column = 'content' # Assuming the column name is 'content' for the reviews

    display(df.head())
    print(f"DataFrame shape: {df.shape}")
    print(f"Text column: {text_column}")

Unnamed: 0,user,review
0,prabandaru koesworo,"Ini aplikasi jangan pake gopaynya, gopaynya ma..."
1,Jerry h,"Kalau ketentuan untuk semua Gocar, bisa ditera..."
2,Desy Adela,"Pesan gosend sampai 2 jam Pertama, pesan sudah..."
3,Dewita Nur Wulandari,pesan makan udh sejam tp gaada driver yang mau...
4,Ridwan Mono,Benar2 membantu. Cuma kadang2 apk-nya suka err...


DataFrame shape: (500, 2)
Text column: content


## Data cleaning

### Subtask:
Clean the text data in the 'review' column of the DataFrame.


**Reasoning**:
Clean the text data in the 'review' column by handling missing values, removing irrelevant characters, and considering the preservation of emojis.



In [None]:
import re

# 1. Handle missing values
if df['review'].isnull().any():
    df['review'].fillna("", inplace=True)

# 2. Remove irrelevant characters
def clean_text(text):
    # Remove punctuation, special characters, and numbers, but preserve emojis
    text = re.sub(r'[^\w\s\U00010000-\U0010ffff]', '', text)  # Keep emojis and alphanumeric
    # Remove URLs (basic pattern matching)
    text = re.sub(r'http\S+', '', text)
    return text

df['cleaned_review'] = df['review'].apply(clean_text)

# Display first few rows to check the cleaning process
display(df.head())

Unnamed: 0,user,review,cleaned_review
0,prabandaru koesworo,"Ini aplikasi jangan pake gopaynya, gopaynya ma...",Ini aplikasi jangan pake gopaynya gopaynya mak...
1,Jerry h,"Kalau ketentuan untuk semua Gocar, bisa ditera...",Kalau ketentuan untuk semua Gocar bisa diterap...
2,Desy Adela,"Pesan gosend sampai 2 jam Pertama, pesan sudah...",Pesan gosend sampai 2 jam Pertama pesan sudah ...
3,Dewita Nur Wulandari,pesan makan udh sejam tp gaada driver yang mau...,pesan makan udh sejam tp gaada driver yang mau...
4,Ridwan Mono,Benar2 membantu. Cuma kadang2 apk-nya suka err...,Benar2 membantu Cuma kadang2 apknya suka error...


## Data wrangling

### Subtask:
Preprocess the text data in the 'cleaned_review' column.


**Reasoning**:
Perform casefolding, tokenization, stop word removal, stemming, and lemmatization on the 'cleaned_review' column.



**Reasoning**:
The previous code failed due to a missing NLTK resource.  I need to download the required resource `punkt_tab` before retrying the code.



In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

**Reasoning**:
Retry the previous preprocessing steps now that the necessary NLTK resource has been downloaded.



In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# 1. Casefolding (already done in the previous attempt)

# 2. Tokenization
df['tokenized_review'] = df['casefolded_review'].apply(word_tokenize)

# 3. Stop Word Removal
stop_words = set(stopwords.words('indonesian'))
# Add more stop words if needed
df['nostopword_review'] = df['tokenized_review'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# 4. Stemming
stemmer = PorterStemmer()
df['stemmed_review'] = df['nostopword_review'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])

# 5. Lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized_review'] = df['nostopword_review'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

display(df.head())

Unnamed: 0,user,review,cleaned_review,casefolded_review,tokenized_review,nostopword_review,stemmed_review,lemmatized_review
0,prabandaru koesworo,"Ini aplikasi jangan pake gopaynya, gopaynya ma...",Ini aplikasi jangan pake gopaynya gopaynya mak...,ini aplikasi jangan pake gopaynya gopaynya mak...,"[ini, aplikasi, jangan, pake, gopaynya, gopayn...","[aplikasi, pake, gopaynya, gopaynya, makan, bi...","[aplikasi, pake, gopaynya, gopaynya, makan, bi...","[aplikasi, pake, gopaynya, gopaynya, makan, bi..."
1,Jerry h,"Kalau ketentuan untuk semua Gocar, bisa ditera...",Kalau ketentuan untuk semua Gocar bisa diterap...,kalau ketentuan untuk semua gocar bisa diterap...,"[kalau, ketentuan, untuk, semua, gocar, bisa, ...","[ketentuan, gocar, diterapkan, kyk, peraturan,...","[ketentuan, gocar, diterapkan, kyk, peraturan,...","[ketentuan, gocar, diterapkan, kyk, peraturan,..."
2,Desy Adela,"Pesan gosend sampai 2 jam Pertama, pesan sudah...",Pesan gosend sampai 2 jam Pertama pesan sudah ...,pesan gosend sampai 2 jam pertama pesan sudah ...,"[pesan, gosend, sampai, 2, jam, pertama, pesan...","[pesan, gosend, 2, jam, pesan, driver, gak, ge...","[pesan, gosend, 2, jam, pesan, driver, gak, ge...","[pesan, gosend, 2, jam, pesan, driver, gak, ge..."
3,Dewita Nur Wulandari,pesan makan udh sejam tp gaada driver yang mau...,pesan makan udh sejam tp gaada driver yang mau...,pesan makan udh sejam tp gaada driver yang mau...,"[pesan, makan, udh, sejam, tp, gaada, driver, ...","[pesan, makan, udh, sejam, tp, gaada, driver, ...","[pesan, makan, udh, sejam, tp, gaada, driver, ...","[pesan, makan, udh, sejam, tp, gaada, driver, ..."
4,Ridwan Mono,Benar2 membantu. Cuma kadang2 apk-nya suka err...,Benar2 membantu Cuma kadang2 apknya suka error...,benar2 membantu cuma kadang2 apknya suka error...,"[benar2, membantu, cuma, kadang2, apknya, suka...","[benar2, membantu, kadang2, apknya, suka, erro...","[benar2, membantu, kadang2, apknya, suka, erro...","[benar2, membantu, kadang2, apknya, suka, erro..."


## Data preparation

### Subtask:
Prepare the data for the Bag-of-Words model.


**Reasoning**:
Join the tokens in the 'lemmatized_review' column back into strings, then use CountVectorizer to create a document-term matrix.



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Join tokens back into strings
df['lemmatized_string'] = df['lemmatized_review'].apply(lambda tokens: ' '.join(tokens))

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=1000) # Example, adjust max_features as needed

# Fit and transform the data
dtm = vectorizer.fit_transform(df['lemmatized_string'])

# Display the shape of the DTM
print(f"Shape of Document-Term Matrix: {dtm.shape}")

display(df.head())

Shape of Document-Term Matrix: (500, 1000)


Unnamed: 0,user,review,cleaned_review,casefolded_review,tokenized_review,nostopword_review,stemmed_review,lemmatized_review,lemmatized_string
0,prabandaru koesworo,"Ini aplikasi jangan pake gopaynya, gopaynya ma...",Ini aplikasi jangan pake gopaynya gopaynya mak...,ini aplikasi jangan pake gopaynya gopaynya mak...,"[ini, aplikasi, jangan, pake, gopaynya, gopayn...","[aplikasi, pake, gopaynya, gopaynya, makan, bi...","[aplikasi, pake, gopaynya, gopaynya, makan, bi...","[aplikasi, pake, gopaynya, gopaynya, makan, bi...",aplikasi pake gopaynya gopaynya makan biaya ad...
1,Jerry h,"Kalau ketentuan untuk semua Gocar, bisa ditera...",Kalau ketentuan untuk semua Gocar bisa diterap...,kalau ketentuan untuk semua gocar bisa diterap...,"[kalau, ketentuan, untuk, semua, gocar, bisa, ...","[ketentuan, gocar, diterapkan, kyk, peraturan,...","[ketentuan, gocar, diterapkan, kyk, peraturan,...","[ketentuan, gocar, diterapkan, kyk, peraturan,...",ketentuan gocar diterapkan kyk peraturan blueb...
2,Desy Adela,"Pesan gosend sampai 2 jam Pertama, pesan sudah...",Pesan gosend sampai 2 jam Pertama pesan sudah ...,pesan gosend sampai 2 jam pertama pesan sudah ...,"[pesan, gosend, sampai, 2, jam, pertama, pesan...","[pesan, gosend, 2, jam, pesan, driver, gak, ge...","[pesan, gosend, 2, jam, pesan, driver, gak, ge...","[pesan, gosend, 2, jam, pesan, driver, gak, ge...",pesan gosend 2 jam pesan driver gak gerakgerak...
3,Dewita Nur Wulandari,pesan makan udh sejam tp gaada driver yang mau...,pesan makan udh sejam tp gaada driver yang mau...,pesan makan udh sejam tp gaada driver yang mau...,"[pesan, makan, udh, sejam, tp, gaada, driver, ...","[pesan, makan, udh, sejam, tp, gaada, driver, ...","[pesan, makan, udh, sejam, tp, gaada, driver, ...","[pesan, makan, udh, sejam, tp, gaada, driver, ...",pesan makan udh sejam tp gaada driver ambil me...
4,Ridwan Mono,Benar2 membantu. Cuma kadang2 apk-nya suka err...,Benar2 membantu Cuma kadang2 apknya suka error...,benar2 membantu cuma kadang2 apknya suka error...,"[benar2, membantu, cuma, kadang2, apknya, suka...","[benar2, membantu, kadang2, apknya, suka, erro...","[benar2, membantu, kadang2, apknya, suka, erro...","[benar2, membantu, kadang2, apknya, suka, erro...",benar2 membantu kadang2 apknya suka error penc...


In [None]:
import pandas as pd

# Assuming 'dtm' is your document-term matrix and 'vectorizer' is your CountVectorizer instance

# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame from the document-term matrix
dtm_df = pd.DataFrame(dtm.toarray(), columns=feature_names, index=df.index)

# Concatenate the original DataFrame with the DTM DataFrame
final_df = pd.concat([df, dtm_df], axis=1)

# Save to CSV
final_df.to_csv('gojek_reviews_with_dtm.csv', index=False)

## Feature engineering

### Subtask:
Enhance the Bag-of-Words representation using TF-IDF.


**Reasoning**:
Apply TF-IDF transformation to the document-term matrix.



In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Initialize TfidfTransformer
tfidf_transformer = TfidfTransformer()

# Fit and transform the DTM
tfidf_matrix = tfidf_transformer.fit_transform(dtm)

# Print the shape of the TF-IDF matrix
print(f"Shape of TF-IDF Matrix: {tfidf_matrix.shape}")

Shape of TF-IDF Matrix: (500, 1000)


## Data splitting

### Subtask:
Split the data into training, validation, and testing sets.


**Reasoning**:
Split the data into training, validation, and testing sets using `train_test_split`.  Stratify the splits based on the target variable (assuming it's in the original dataframe `df`).



In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'label' is the target variable column in the dataframe.
# Replace 'label' with the actual column name if different.
if 'label' not in df.columns:
    print("Error: 'label' column not found in dataframe. Please specify the correct target variable column name.")
    # Handle the error appropriately, e.g., exit the script or use a default value.
    # For now, I will use a placeholder target variable for demonstration.
    df['label'] = 0  # Replace with your actual target variable.
    print("Warning: Using a placeholder target variable. Please correct this.")

# Split the data into training and temporary sets
X_train, X_temp, y_train, y_temp = train_test_split(
    tfidf_matrix, df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Split the temporary set into validation and testing sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")

Error: 'label' column not found in dataframe. Please specify the correct target variable column name.
X_train shape: (400, 1000)
X_val shape: (50, 1000)
X_test shape: (50, 1000)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# 1. Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# 2. Train the classifier using the training data
nb_classifier.fit(X_train, y_train)

# 3. Make predictions on the validation data
y_pred_val = nb_classifier.predict(X_val)

# 4. Evaluate the model's performance on the validation data
accuracy_val = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy_val}")
print(classification_report(y_val, y_pred_val))

# 5. (Optional) Make predictions on the test data for final evaluation
y_pred_test = nb_classifier.predict(X_test)

# 6. (Optional) Evaluate the model's performance on the test data
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {accuracy_test}")
print(classification_report(y_test, y_pred_test))

Validation Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50

Test Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



In [None]:
# Assuming you have already trained the nb_classifier and have the vectorizer

# Sample text to classify
new_text = "Drivernya jelek  dan lama sampai tujuan."

# Preprocess the new text (similar to how you preprocessed your training data)
new_text_cleaned = clean_text(new_text) # Assuming you have the clean_text function
new_text_casefolded = new_text_cleaned.lower()
new_text_tokenized = word_tokenize(new_text_casefolded)
new_text_nostopword = [word for word in new_text_tokenized if word not in stop_words]
new_text_lemmatized = [lemmatizer.lemmatize(word) for word in new_text_nostopword]
new_text_joined = ' '.join(new_text_lemmatized)

# Vectorize the new text using the same vectorizer used for training
new_text_vectorized = vectorizer.transform([new_text_joined])

# Make prediction using the trained classifier
predicted_class = nb_classifier.predict(new_text_vectorized)[0]

# Print the predicted class
print(f"Predicted class for the text: {predicted_class}")

Predicted class for the text: 0


In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

## Model training

### Subtask:
Train a transformer-based text classification model.


**Reasoning**:
Train a transformer-based text classification model using the prepared data.



In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Get the indices of the training, validation, and testing sets
train_indices = X_train.nonzero()[0]
val_indices = X_val.nonzero()[0]
test_indices = X_test.nonzero()[0]

# Create datasets using the correct indices and convert labels to float
train_dataset = Dataset.from_dict({"text": df['lemmatized_string'].iloc[train_indices].tolist(), "label": y_train.iloc[train_indices].astype(float).tolist()})
val_dataset = Dataset.from_dict({"text": df['lemmatized_string'].iloc[val_indices].tolist(), "label": y_val.iloc[val_indices].astype(float).tolist()})
test_dataset = Dataset.from_dict({"text": df['lemmatized_string'].iloc[test_indices].tolist(), "label": y_test.iloc[test_indices].astype(float).tolist()})

# Initialize tokenizer and model
model_name = "indobenchmark/indobert-base-p2" # Example model, replace with suitable model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['label'].unique()))

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128) # add max_length

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3, # Adjust as needed
    weight_decay=0.01,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Train the model
trainer.train()

# Save the trained model
model_path = "./trained_model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Method 1: Push using the trainer (pushes both model and configuration)
trainer.push_to_hub()

# Method 2: Alternative explicit way to push both model and tokenizer
# from transformers import push_to_hub_python_state_dict

# # Push model, tokenizer and configuration to hub
# model.push_to_hub(repository_name)
# tokenizer.push_to_hub(repository_name)

# Verify upload was successful
# api = HfApi()
# model_info = api.model_info(repository_name)
# print(f"Model successfully uploaded to: https://huggingface.co/yweslakarep/Indobert-finetuned-gojek-review")
# print(f"Model revision: {model_info.sha}")

# # You can also list all files in the repository to confirm both model and tokenizer were uploaded
# files = api.list_repo_files(repository_name)
# print("\nFiles in repository:")
# for file in files:
#     print(f" - {file}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6934 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Map:   0%|          | 0/868 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myweslakarep[0m ([33myweslakrp[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.0222,0.000605
2,0.0023,8e-06
3,0.0007,1e-05


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

events.out.tfevents.1741403243.24a9e1458901.877.0:   0%|          | 0.00/7.53k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yweslakarep/results/commit/0cb1638618c4af26a3dfd419d5063e9c05669069', commit_message='End of training', commit_description='', oid='0cb1638618c4af26a3dfd419d5063e9c05669069', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yweslakarep/results', endpoint='https://huggingface.co', repo_type='model', repo_id='yweslakarep/results'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import HfApi
repository_name = 'yweslakarep/Indobert-finetuned-gojek-review-relevant'
tokenizer.push_to_hub(repository_name)

# Verify upload was successful
api = HfApi()
model_info = api.model_info(repository_name)
print(f"Model successfully uploaded to: yweslakarep/Indobert-finetuned-gojek-review-relevant")
print(f"Model revision: {model_info.sha}")

# You can also list all files in the repository to confirm both model and tokenizer were uploaded
files = api.list_repo_files(repository_name)
print("\nFiles in repository:")
for file in files:
    print(f" - {file}")
print(f"Model successfully uploaded to: yweslakarep/Indobert-finetuned-gojek-review-relevant")
print(f"Model revision: {model_info.sha}")

# You can also list all files in the repository to confirm both model and tokenizer were uploaded
files = api.list_repo_files(repository_name)
print("\nFiles in repository:")
for file in files:
    print(f" - {file}")

No files have been modified since last commit. Skipping to prevent empty commit.


Model successfully uploaded to: yweslakarep/Indobert-finetuned-gojek-review-relevant
Model revision: 75b92848cab949ed46e57647f3c85a84cb32a84a

Files in repository:
 - .gitattributes
 - README.md
 - config.json
 - model.safetensors
 - runs/Mar08_03-07-19_24a9e1458901/events.out.tfevents.1741403243.24a9e1458901.877.0
 - special_tokens_map.json
 - tokenizer.json
 - tokenizer_config.json
 - training_args.bin
 - vocab.txt
Model successfully uploaded to: yweslakarep/Indobert-finetuned-gojek-review-relevant
Model revision: 75b92848cab949ed46e57647f3c85a84cb32a84a

Files in repository:
 - .gitattributes
 - README.md
 - config.json
 - model.safetensors
 - runs/Mar08_03-07-19_24a9e1458901/events.out.tfevents.1741403243.24a9e1458901.877.0
 - special_tokens_map.json
 - tokenizer.json
 - tokenizer_config.json
 - training_args.bin
 - vocab.txt


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Replace with your actual model repository name
model_repo = "yweslakarep/Indobert-finetuned-gojek-review-relevant"

# Load the model and tokenizer from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_repo)
model = AutoModelForSequenceClassification.from_pretrained(model_repo)

# Set the model to evaluation mode
model.eval()

def classify_text(text):
    """
    Classify a single text using the loaded model

    Args:
        text (str): The text to classify

    Returns:
        dict: Dictionary containing the predicted label and confidence scores
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    # Get the predicted class and probabilities
    probabilities = torch.nn.functional.softmax(predictions, dim=-1)
    predicted_class = torch.argmax(predictions, dim=-1).item()
    confidence = probabilities[0][predicted_class].item()

    # Get all class probabilities as a dictionary
    all_probs = {i: prob.item() for i, prob in enumerate(probabilities[0])}

    # If you have a label mapping, you can convert the numeric label to text
    # label_mapping = {0: "class_a", 1: "class_b", ...}  # Replace with your actual mapping
    # predicted_label = label_mapping[predicted_class]

    return {
        "predicted_class": predicted_class,  # Numeric class
        # "predicted_label": predicted_label,  # Uncomment if you have a label mapping
        "confidence": confidence,
        "all_probabilities": all_probs
    }

# Example usage for a single text
text_to_classify = "Aplikasi ini sangat membantu! Antarmukanya intuitif dan mudah digunakan. Semua fitur bekerja dengan sangat baik, terutama fitur notifikasi yang selalu tepat waktu."
result = classify_text(text_to_classify)
print(f"Predicted class: {result['predicted_class']}")
print(f"Confidence: {result['confidence']:.4f}")

# Batch classification example
def classify_batch(texts):
    """
    Classify a batch of texts

    Args:
        texts (list): List of texts to classify

    Returns:
        list: List of dictionaries with classification results
    """
    # Tokenize the batch
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    # Process results
    results = []
    probabilities = torch.nn.functional.softmax(predictions, dim=-1)
    predicted_classes = torch.argmax(predictions, dim=-1)

    for i, (pred_class, probs) in enumerate(zip(predicted_classes, probabilities)):
        pred_class_idx = pred_class.item()
        confidence = probs[pred_class_idx].item()

        results.append({
            "text": texts[i],
            "predicted_class": pred_class_idx,
            "confidence": confidence
        })

    return results

# Example batch classification
sample_texts = [
    "Aplikasi ini luar biasa! ",
    "Aplikasi ini sangat mengecewakan",
    "Aplikasi ini cukup oke",
    "Aplikasi ini bagus"
]



batch_results = classify_batch(sample_texts)
for i, result in enumerate(batch_results):
    print(f"Text {i+1}: Class {result['predicted_class']} (Confidence: {result['confidence']:.4f})")

Predicted class: 0
Confidence: 1.0000
Text 1: Class 0 (Confidence: 1.0000)
Text 2: Class 0 (Confidence: 1.0000)
Text 3: Class 0 (Confidence: 1.0000)
Text 4: Class 0 (Confidence: 1.0000)
