In [2]:
import pandas as pd
import sys  
sys.path.insert(1, '/home/tb24/projects/llm-data-aug')

# Path
import os

# Get the project root directory (one level up from the notebook directory)
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Define the data path
data_path = os.path.join(project_root, "data")

### Concat checkpoints to form a complete dataset

In [10]:
dir = "llm_generated/gemini-2.0-flash"

files = os.listdir(os.path.join(data_path, dir))
# Negative sentiment
# files = [f for f in files if f.endswith(".csv") and "auggpt_augmented_user_reviews_negative" in f]

# Neutral sentiment
files = [f for f in files if f.endswith(".csv") and "auggpt_augmented_user_reviews_neutral" in f]   

df = pd.concat([pd.read_csv(os.path.join(data_path, dir, f)) for f in files])
df.head()



Unnamed: 0,Review,Sentiment
0,không được mát lắm,2
1,Nó không thực sự mát.,2
2,Nó không mát như tôi mong đợi.,2
3,Độ mát của nó không được tốt lắm.,2
4,Tôi không thấy nó mát lắm.,2


In [11]:
# df.to_csv(os.path.join(data_path, dir, "auggpt_augmented_user_reviews_negative.csv"), index=False)
df.to_csv(os.path.join(data_path, dir, "auggpt_augmented_user_reviews_neutral.csv"), index=False)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1806 entries, 0 to 119
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     1806 non-null   object
 1   Sentiment  1806 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 42.3+ KB


In [30]:
### Concat the Neutral and Negative Sentiment with the Positive Sentiment
neutral = pd.read_csv(os.path.join(data_path, "llm_generated/gemini-2.0-flash", "auggpt_augmented_user_reviews_neutral.csv"))
negative = pd.read_csv(os.path.join(data_path, "llm_generated/gemini-2.0-flash", "auggpt_augmented_user_reviews_negative.csv"))

positive = pd.read_csv(os.path.join(data_path, "cleaned_user_reviews.csv"))[["Review", "Sentiment"]]
label_mapping = {'Positive': 1, 'Neutral': 2, 'Negative': 0}
positive["Sentiment"] = positive["Sentiment"].map(label_mapping)
positive = positive.where(positive["Sentiment"] == 1)
positive.reset_index(drop=True, inplace=True)
positive.dropna(inplace=True)

upsampled_df = pd.concat([neutral, negative, positive])
upsampled_df["Sentiment"] = upsampled_df["Sentiment"].astype(int)
upsampled_df.info()
upsampled_df.to_csv(os.path.join(data_path, "llm_generated/gemini-2.0-flash", "auggpt_upsampled_user_reviews.csv"), index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 5657 entries, 0 to 2376
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     5657 non-null   object
 1   Sentiment  5657 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 132.6+ KB


### Examine and clean the Generated Data (with AugGPT)


In [31]:
df = pd.read_csv(os.path.join(data_path, "llm_generated/gemini-2.0-flash", "auggpt_upsampled_user_reviews.csv"))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5657 entries, 0 to 5656
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     5657 non-null   object
 1   Sentiment  5657 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 88.5+ KB


In [32]:
# The number of records for each sentiment
label_mapping = {'Positive': 1, 'Neutral': 2, 'Negative': 0}
df["Sentiment"].value_counts()


Sentiment
0    2174
2    1783
1    1700
Name: count, dtype: int64

In [33]:
# Remove invalid sentiment: -1, 3, 4, 5
df = df[df["Sentiment"].isin([0, 1, 2])]
df["Sentiment"].value_counts()

Sentiment
0    2174
2    1783
1    1700
Name: count, dtype: int64

In [35]:
# Remove duplicates
df = df.drop_duplicates(subset=["Review"])
df.info()
df["Sentiment"].value_counts()

# Save the cleaned data
df.to_csv(os.path.join(data_path, "llm_generated/gemini-2.0-flash", "auggpt_upsampled_user_reviews_cleaned.csv"), index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 4411 entries, 0 to 5656
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     4411 non-null   object
 1   Sentiment  4411 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 103.4+ KB


### Examine the quality of the generated data

In [38]:
upsampled_df = pd.read_csv(os.path.join(data_path, "llm_generated/gemini-2.0-flash", "auggpt_upsampled_user_reviews.csv"))
upsampled_df["Sentiment"].value_counts()

Sentiment
0    2174
2    1783
1    1700
Name: count, dtype: int64

In [39]:
upsampled_df_cleaned = pd.read_csv(os.path.join(data_path, "llm_generated/gemini-2.0-flash", "auggpt_upsampled_user_reviews_cleaned.csv"))
upsampled_df_cleaned["Sentiment"].value_counts()

Sentiment
1    1656
0    1586
2    1169
Name: count, dtype: int64

In [41]:
# Compare the number of records removed after deduplication in percentage for each sentiment
upsampled_df_cleaned["Sentiment"].value_counts() / upsampled_df["Sentiment"].value_counts()


Sentiment
0    0.729531
1    0.974118
2    0.655637
Name: count, dtype: float64