In [1]:
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


In [2]:
# Emotion name -> integer class id; ids follow the order of the tuple below.
label2id = {
    emotion: class_id
    for class_id, emotion in enumerate(("Anger", "Fear", "Happy", "Love", "Sadness"))
}
# Reverse lookup: integer class id -> emotion name.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [3]:
# Load the raw reviews, keep only the review text and its emotion, and encode
# the emotion names as integer class ids through `label2id`.
raw_reviews = pd.read_csv("PRDECT-ID Dataset.csv")  # Adjust the filename

# Build `data` as a fresh frame in a single chain instead of renaming and
# assigning in place on a column selection, which can raise
# SettingWithCopyWarning.
data = (
    raw_reviews[['Customer Review', 'Emotion']]
    .rename(columns={'Customer Review': 'text', 'Emotion': 'label'})
    .assign(label=lambda frame: frame['label'].map(label2id))
)

# `Series.map` turns every emotion that is missing from `label2id` into NaN.
# Fail loudly here rather than letting those rows silently vanish from the
# per-class filters used later on.
unmapped_rows = data['label'].isna()
if unmapped_rows.any():
    unknown_emotions = raw_reviews.loc[unmapped_rows, 'Emotion'].unique().tolist()
    raise ValueError(f"Emotion values missing from label2id: {unknown_emotions}")

# Class distribution of the encoded labels (displayed as the cell output).
data['label'].value_counts()


label
2    1770
4    1202
1     920
3     809
0     699
Name: count, dtype: int64

In [4]:
# Build a class-balanced subset of `data` and persist a train/test split.

# Number of rows kept per emotion class.
target_size = 300

# Draw `target_size` rows from every class. Sampling is done WITHOUT
# replacement whenever the class is large enough, so the balanced set holds no
# artificial duplicates that could end up on both sides of the split below
# (train/test leakage). Replacement is only used as a fallback for a class
# that has fewer than `target_size` rows.
class_samples = []
for class_id in sorted(data['label'].dropna().unique()):
    class_rows = data[data['label'] == class_id]
    class_samples.append(
        class_rows.sample(
            n=target_size,
            replace=len(class_rows) < target_size,
            random_state=42,
        )
    )

# Combine the per-class samples and shuffle them.
balanced_data = (
    pd.concat(class_samples)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Stratify on the label so both splits keep the class balance built above.
x_train, x_val, y_train, y_val = train_test_split(
    balanced_data['text'],
    balanced_data['label'],
    test_size=0.2,
    random_state=42,
    stratify=balanced_data['label'],
)

# Save train test split
train = pd.DataFrame({'text': x_train, 'label': y_train})
test = pd.DataFrame({'text': x_val, 'label': y_val})

# Create the output directory first so the cell also works on a fresh checkout.
Path('dataset').mkdir(parents=True, exist_ok=True)
train.to_csv('dataset/train.csv', index=False)
test.to_csv('dataset/test.csv', index=False)

In [None]:
# Word-embedding augmenter (nlpaug) plus a one-sentence smoke test.
import nlpaug.augmenter.word as naw

# `aug` is reused by the cell that augments the training set.
augmenter_options = dict(model_type='fasttext', model_path="wiki.id.vec", top_k=5)
aug = naw.WordEmbsAug(**augmenter_options)

# Run the augmenter on a single sample sentence.
text = "Saya sangat senang hari ini"
augmented_text = aug.augment(text)

# Show the input and the augmenter's raw return value, each under its heading.
for heading, shown_value in (("Original:", text), ("Augmented Text:", augmented_text)):
    print(heading)
    print(shown_value)


Original:
Saya sangat senang hari ini
Augmented Text:
['Saya sangat senang peringati mimapatelarthron']


In [None]:
# augment train
train = pd.read_csv('dataset/train.csv')


def _augment_one(text):
    """Return one augmented string for `text`.

    The demo output earlier in this notebook shows `aug.augment` returning a
    one-element list rather than a string. Storing that value directly would
    put lists in the `text` column and write stringified "['...']" values to
    the CSV, so the first variant is unwrapped here. A plain (non-list)
    return value is passed through unchanged, and an empty list falls back to
    the original text.
    """
    augmented = aug.augment(text)
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else text
    return augmented


# Same rows and labels as `train`, with every text replaced by its augmented
# variant; `train` itself is left untouched.
train_augmented = train.assign(text=train['text'].map(_augment_one))
train_augmented.to_csv('dataset/train_augmented.csv', index=False)
