In [1]:
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both corpora with `datasets.load_dataset` (VSMEC first, then VSFC) and
# show their split layout, one DatasetDict after the other.
vsmec, vsfc = (
    load_dataset(dataset_id)
    for dataset_id in (
        "viethq1906/UIT-VSMEC-Sentiment-Relabelled",
        "ura-hcmut/UIT-VSFC",
    )
)

for corpus in (vsmec, vsfc):
    print(corpus)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment'],
        num_rows: 5548
    })
    validation: Dataset({
        features: ['sentence', 'sentiment'],
        num_rows: 686
    })
    test: Dataset({
        features: ['sentence', 'sentiment'],
        num_rows: 693
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1584
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3166
    })
})


In [15]:
# Show the raw label vocabulary of every VSFC split before any cleaning,
# so missing or unexpected labels are visible up front.
for heading, split_key in (
    ("VSFC Train unique labels:", 'train'),
    ("VSFC Valid unique labels:", 'validation'),
    ("VSFC Test unique labels:", 'test'),
):
    split_labels = vsfc[split_key].to_pandas()['label']
    print(heading, split_labels.unique())

VSFC Train unique labels: ['positive' 'negative' 'neutral']
VSFC Valid unique labels: ['negative' 'positive' 'neutral' None]
VSFC Test unique labels: ['positive' 'negative' 'neutral']


In [17]:
# Build one cleaned, merged and shuffled DataFrame per split from the two corpora.
# Both corpora end up with the columns `sentence` / `sentiment`; the VSFC string
# labels are converted to the integer codes in LABEL_TO_SENTIMENT.

# Split key -> short title used in the diagnostic printouts below.
SPLIT_TITLES = {'train': 'Train', 'validation': 'Valid', 'test': 'Test'}
# VSFC ships string labels; they are mapped to these integer sentiment codes.
LABEL_TO_SENTIMENT = {'negative': -1, 'neutral': 0, 'positive': 1}
# Seed for the per-split shuffle so the exported row order is reproducible.
SHUFFLE_SEED = 42

# Convert every split to pandas and drop rows with a missing text or label.
# `dropna` returns a new frame (no in-place mutation), so re-running this cell
# always starts from the untouched `vsmec` / `vsfc` datasets.
# VSMEC is cleaned the same way as VSFC so that a missing value in either
# corpus cannot reach the export step.
vsmec_frames = {
    split_name: vsmec[split_name].to_pandas().dropna(subset=['sentence', 'sentiment'])
    for split_name in SPLIT_TITLES
}
vsfc_frames = {
    split_name: vsfc[split_name].to_pandas().dropna(subset=['text', 'label'])
    for split_name in SPLIT_TITLES
}

# Check shape
for split_name in SPLIT_TITLES:
    vsmec_split, vsfc_split = vsmec_frames[split_name], vsfc_frames[split_name]
    print(f"{split_name} - VSMEC: {vsmec_split.shape}, VSFC: {vsfc_split.shape}")

# Normalize column names so VSFC matches the VSMEC schema
vsfc_frames = {
    split_name: frame.rename(columns={'text': 'sentence', 'label': 'sentiment'})
    for split_name, frame in vsfc_frames.items()
}

# Check unique values before mapping
print("\nBefore mapping:")
for split_name, title in SPLIT_TITLES.items():
    print(f"{title} unique:", vsfc_frames[split_name]['sentiment'].unique())

# Normalize sentiment labels. `Series.map` turns every label that is not a key
# of the mapping into NaN, so remember the offending labels for the check below.
unmapped_labels = {}
mapped_frames = {}
for split_name, frame in vsfc_frames.items():
    leftovers = set(frame['sentiment'].unique()) - set(LABEL_TO_SENTIMENT)
    if leftovers:
        unmapped_labels[split_name] = sorted(leftovers, key=str)
    mapped_frames[split_name] = frame.assign(
        sentiment=frame['sentiment'].map(LABEL_TO_SENTIMENT)
    )
vsfc_frames = mapped_frames

# Check after mapping
print("\nAfter mapping:")
for split_name, title in SPLIT_TITLES.items():
    sentiment = vsfc_frames[split_name]['sentiment']
    print(f"{title} unique:", sentiment.unique(), "- dtype:", sentiment.dtype)

# Check for NaN values
print("\nNaN counts:")
for split_name, title in SPLIT_TITLES.items():
    print(f"{title} NaN:", vsfc_frames[split_name]['sentiment'].isna().sum())

# Fail here, right after the diagnostics, instead of much later during export.
if unmapped_labels:
    raise ValueError(f"VSFC labels without a sentiment code: {unmapped_labels}")

# Keep the per-split names used elsewhere in the notebook.
vsmec_train, vsmec_valid, vsmec_test = (vsmec_frames[name] for name in SPLIT_TITLES)
vsfc_train, vsfc_valid, vsfc_test = (vsfc_frames[name] for name in SPLIT_TITLES)

print(vsfc_train.head())

# Concatenate the two corpora per split, then shuffle with a fixed seed
merged_frames = {
    split_name: (
        pd.concat([vsmec_frames[split_name], vsfc_frames[split_name]], ignore_index=True)
        .sample(frac=1, random_state=SHUFFLE_SEED)
        .reset_index(drop=True)
    )
    for split_name in SPLIT_TITLES
}
merged_train, merged_valid, merged_test = (merged_frames[name] for name in SPLIT_TITLES)

# Check merged shapes
for split_name, merged_split in merged_frames.items():
    print(f"{split_name} - Merged: {merged_split.shape}")

print(merged_train.head())

# Save merged datasets to JSON
import json
import os

# Folder that receives the exported JSON files; it is created (including any
# missing parents) when absent and left alone when it already exists.
res_dir = '../res'
os.makedirs(name=res_dir, exist_ok=True)

# Function to convert DataFrame to the required JSON format
def save_to_json(df, filepath):
    """Write the `sentence` / `sentiment` columns of `df` to `filepath` as a JSON array.

    Every row becomes ``{"sentence": <value>, "sentiment": <int>}``. The file is
    written as UTF-8 with non-ASCII characters kept verbatim and a 4-space
    indent. Any other columns of `df` are ignored. A short confirmation line
    is printed after the file has been written.

    Args:
        df: DataFrame with a `sentence` and a `sentiment` column.
        filepath: Destination path; an existing file is overwritten.

    Raises:
        KeyError: if one of the two columns is missing.
        ValueError: if any sentiment is missing (NaN/None). Raised before the
            file is opened, so nothing is written in that case.
    """
    sentiments = df['sentiment']
    missing = int(sentiments.isna().sum())
    if missing:
        raise ValueError(
            f"{missing} row(s) have no sentiment value; clean the data before saving to {filepath}"
        )

    # Iterate the two columns directly instead of building a Series per row
    # with iterrows(); int() turns float/numpy values into plain JSON integers.
    records = [
        {"sentence": sentence, "sentiment": int(sentiment)}
        for sentence, sentiment in zip(df['sentence'], sentiments)
    ]

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    print(f"Saved {len(records)} records to {filepath}")

# Export every merged split into res_dir, one JSON file per split
for split_frame, file_name in (
    (merged_train, 'train.json'),
    (merged_valid, 'validation.json'),
    (merged_test, 'test.json'),
):
    save_to_json(split_frame, os.path.join(res_dir, file_name))


train - VSMEC: (5548, 2), VSFC: (11426, 2)
validation - VSMEC: (686, 2), VSFC: (1583, 2)
test - VSMEC: (693, 2), VSFC: (3166, 2)

Before mapping:
Train unique: ['positive' 'negative' 'neutral']
Valid unique: ['negative' 'positive' 'neutral']
Test unique: ['positive' 'negative' 'neutral']

After mapping:
Train unique: [ 1 -1  0] - dtype: int64
Valid unique: [-1  1  0] - dtype: int64
Test unique: [ 1 -1  0] - dtype: int64

NaN counts:
Train NaN: 0
Valid NaN: 0
Test NaN: 0
                                            sentence  sentiment
0                          slide giáo trình đầy đủ .          1
1     nhiệt tình giảng dạy , gần gũi với sinh viên .          1
2               đi học đầy đủ full điểm chuyên cần .         -1
3  chưa áp dụng công nghệ thông tin và các thiết ...         -1
4  thầy giảng bài hay , có nhiều bài tập ví dụ ng...          1
train - Merged: (16974, 2)
validation - Merged: (2269, 2)
test - Merged: (3859, 2)
                                            sentence  sent