In [1]:
import os
import sys

import pandas as pd

# Project root: one level above the directory the notebook is run from.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Extend the import search path for project-local packages.
# The computed root keeps the notebook usable on any checkout; the original
# author's absolute path is still added last so it keeps its precedence at
# index 1 on that machine. The membership check stops re-runs of this cell
# from stacking duplicate entries.
for _candidate in (project_root, '/home/tb24/projects/llm-data-aug'):
    if _candidate not in sys.path:
        sys.path.insert(1, _candidate)

# Directory that holds the input CSV and receives the resampled outputs.
data_path = os.path.join(project_root, "data")

## Examining Class Imbalance

In [None]:
# Load the cleaned reviews and cast the review-text column to str in one chain,
# so every entry in 'emoji to text' is a Python string.
df = (
    pd.read_csv(os.path.join(data_path, "cleaned_user_reviews.csv"))
    .astype({'emoji to text': str})
)
df

In [None]:
# Numeric codes assigned to the sentiment labels
label_mapping = {'Positive': 1, 'Neutral': 2, 'Negative': 0}

# Encode the 'Sentiment' column with the mapping.
# Guarded so the cell is safe to re-run: mapping an already-encoded column
# would match none of the string keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Sentiment']):
    df['Sentiment'] = df['Sentiment'].map(label_mapping)

df

In [None]:
# Tally how many rows carry each encoded sentiment label
sentiment_counts = df['Sentiment'].value_counts()

# Header line followed by the counts, each on its own line
print("Số lượng nhãn trong cột 'Sentiment':", sentiment_counts, sep="\n")

## Data Augmentation with Resampling

In [None]:
from sklearn.utils import resample

# One sub-frame per sentiment code, in the order Positive (1), Negative (0), Neutral (2)
df_positive, df_negative, df_neutral = (
    df[df['Sentiment'] == code] for code in (1, 0, 2)
)

# Target size for upsampling: the Positive class is treated as the majority class
n_samples = df_positive.shape[0]

df_positive.head()

### Upsampling

In [None]:
# Upsample each minority class (sampling with replacement) to the majority-class size.
# NOTE(review): rows are duplicated here before the train/val/test split further down;
# if that split is random, copies of one review may land in several splits — confirm
# how TrainTestSplit handles this.
df_negative_upsampled = resample(
    df_negative,
    replace=True,
    n_samples=n_samples,
    random_state=42
)

df_neutral_upsampled = resample(
    df_neutral,
    replace=True,
    n_samples=n_samples,
    random_state=42
)

# Combine upsampled minority classes with majority class
df_upsampled = pd.concat([df_positive, df_negative_upsampled, df_neutral_upsampled])

# Check the distribution of the upsampled dataset
print("Class distribution after upsampling:")
print(df_upsampled['Sentiment'].value_counts())

# Save the upsampled dataset. The output folder is created first because
# to_csv does not create missing parent directories and would raise instead.
upsampled_dir = os.path.join(data_path, "upsampled")
os.makedirs(upsampled_dir, exist_ok=True)
df_upsampled.to_csv(os.path.join(upsampled_dir, "upsampled_user_reviews.csv"), index=False)

In [None]:
# Inspect the upsampled Negative-class rows
df_negative_upsampled

### Downsampling

In [None]:
# Row counts of the Negative and Neutral classes, in that order
df_negative.shape[0], df_neutral.shape[0]

In [None]:
# Downsample every class to the size of the smallest class.
# The target is the minimum over all three classes, so sampling without
# replacement can never request more rows than a class actually has.
min_samples = min(len(df_positive), len(df_negative), len(df_neutral))

df_positive_downsampled = resample(
    df_positive,
    replace=False,
    n_samples=min_samples,
    random_state=42
)

df_negative_downsampled = resample(
    df_negative,
    replace=False,
    n_samples=min_samples,
    random_state=42
)

# Neutral used to be passed through untouched, which left the result imbalanced
# whenever Neutral was larger than Negative. Trim it when it exceeds the target;
# when it is already the smallest class it is kept as is (original row order).
if len(df_neutral) > min_samples:
    df_neutral_downsampled = resample(
        df_neutral,
        replace=False,
        n_samples=min_samples,
        random_state=42
    )
else:
    df_neutral_downsampled = df_neutral

# Combine the equally sized classes
df_downsampled = pd.concat([df_positive_downsampled, df_neutral_downsampled, df_negative_downsampled])

# Check the distribution of the downsampled dataset
print("Class distribution after downsampling:")
print(df_downsampled['Sentiment'].value_counts())

# Save the downsampled dataset. The output folder is created first because
# to_csv does not create missing parent directories and would raise instead.
downsampled_dir = os.path.join(data_path, "downsampled")
os.makedirs(downsampled_dir, exist_ok=True)
df_downsampled.to_csv(os.path.join(downsampled_dir, "downsampled_user_reviews.csv"), index=False)


In [None]:
# Preview the first rows of the downsampled dataset
df_downsampled.head()

## Split data into train, validation, and test sets

In [2]:
# Project-local split helpers; presumably resolved through the sys.path entry
# added in the first cell — confirm the package location.
from dataloaders.train_test_split import DataScenario, TrainTestSplit

# Run the project's split routine for the upsampled scenario.
# NOTE(review): presumably consumes the upsampled CSV written earlier in this
# notebook — confirm in dataloaders.train_test_split.
TrainTestSplit.run_train_test_split(DataScenario.UPSAMPLED)

In [4]:
# Run the same split routine for the downsampled scenario.
# Requires the cell that imports DataScenario / TrainTestSplit to have been run.
TrainTestSplit.run_train_test_split(DataScenario.DOWNSAMPLED)