In [6]:
from pathlib import Path

import numpy as np
import pandas as pd

# Load the labelled dataset from the shared data directory.
data = pd.read_csv('../data/labeled-data.csv')

# Show how many rows carry each label value.
label_counts = data['label'].value_counts()
print(label_counts)

label
1.0    1452
0.0    1261
3.0    1030
2.0    1010
7.0     600
4.0     529
5.0     502
6.0     501
Name: count, dtype: int64


In [14]:
# Rows to draw per label for each split; the list position is the label id.
train_num = [900, 900, 810, 830, 379, 352, 351, 400]
val_num = [100, 100, 100, 100, 50, 50, 50, 50]
test_num = [100, 100, 100, 100, 100, 100, 100, 100]

# Report how many rows each label needs across the three splits combined.
print("Sum by label:")
for label_id, split_sizes in enumerate(zip(train_num, val_num, test_num)):
    print(f"Label {label_id}: {sum(split_sizes)}")

Sum by label:
Label 0: 1100
Label 1: 1100
Label 2: 1010
Label 3: 1030
Label 4: 529
Label 5: 502
Label 6: 501
Label 7: 550


In [15]:
# For each label, create train, validation, and test sets based on the counts above
def split_data(df, train_counts, val_counts, test_counts, random_state=42):
    """Draw disjoint per-label train, validation, and test samples from ``df``.

    For every distinct non-null value in ``df['label']`` the rows carrying that
    label are sampled without replacement: first the training rows, then the
    validation rows from what is left, then the test rows from what remains
    after that. Rows that are not drawn into any split are discarded.

    Args:
        df: DataFrame with a ``label`` column. Each label value is converted
            with ``int()`` and used as a position in the count sequences.
        train_counts: Number of training rows to draw, indexed by ``int(label)``.
        val_counts: Number of validation rows to draw, indexed by ``int(label)``.
        test_counts: Number of test rows to draw, indexed by ``int(label)``.
        random_state: Seed passed to every ``DataFrame.sample`` call.

    Returns:
        A tuple ``(train_df, val_df, test_df)``; each element is the
        concatenation of the per-label samples, in the order the labels first
        appear in ``df``.

    Raises:
        ValueError: If a label has fewer rows than its three counts combined.
    """
    train_dfs = []
    val_dfs = []
    test_dfs = []

    # Rows with a missing label belong to no class and int(NaN) would raise,
    # so they are left out of the iteration.
    for label in df['label'].dropna().unique():
        label_df = df[df['label'] == label]
        position = int(label)
        train_size = train_counts[position]
        val_size = val_counts[position]
        test_size = test_counts[position]

        # Fail early with the offending label instead of a generic sampling error.
        requested = train_size + val_size + test_size
        if len(label_df) < requested:
            raise ValueError(
                f"Label {label}: requested {requested} rows "
                f"but only {len(label_df)} are available"
            )

        # Rows are removed by index label after each draw, which keeps the
        # three samples disjoint as long as the index of df is unique.
        train_df = label_df.sample(n=train_size, random_state=random_state)
        remaining = label_df.drop(train_df.index)
        val_df = remaining.sample(n=val_size, random_state=random_state)
        test_df = remaining.drop(val_df.index).sample(n=test_size, random_state=random_state)

        train_dfs.append(train_df)
        val_dfs.append(val_df)
        test_dfs.append(test_df)

    return pd.concat(train_dfs), pd.concat(val_dfs), pd.concat(test_dfs)

# Draw the per-label train/validation/test samples using the configured counts.
train_df, val_df, test_df = split_data(
    df=data,
    train_counts=train_num,
    val_counts=val_num,
    test_counts=test_num,
)

In [16]:
# Oversample the minority classes in the training set; target: 900 samples per class
def oversample_minority_classes(train_df, target_size, random_state=42):
    """Pad every under-represented label up to ``target_size`` rows.

    Each label with fewer than ``target_size`` rows keeps all of its original
    rows and receives additional rows drawn from them with replacement until
    it reaches ``target_size``. Labels that already have at least
    ``target_size`` rows are passed through unchanged (they are not
    down-sampled). Rows whose label is missing are not included in the result.

    Args:
        train_df: DataFrame with a ``label`` column.
        target_size: Minimum number of rows each label should have afterwards.
        random_state: Seed passed to ``DataFrame.sample`` for the extra rows.

    Returns:
        A DataFrame made of the per-label frames concatenated in the order the
        labels first appear in ``train_df``. Duplicated rows keep their
        original index labels, so the index is not unique after oversampling.
    """
    oversampled_dfs = []
    for label in train_df['label'].dropna().unique():
        label_df = train_df[train_df['label'] == label]
        shortfall = target_size - len(label_df)
        if shortfall > 0:
            # Sample only the missing rows: resampling the whole class with
            # replacement would drop a share of the original rows by chance.
            extra_rows = label_df.sample(n=shortfall, replace=True, random_state=random_state)
            label_df = pd.concat([label_df, extra_rows])
        oversampled_dfs.append(label_df)

    return pd.concat(oversampled_dfs)

# Bring every class in the training split up to 900 rows.
train_df = oversample_minority_classes(train_df, target_size=900)

# Report the per-label row counts of the oversampled training split.
print("Number of rows in each label after oversampling:")
train_label_counts = train_df['label'].value_counts()
print(train_label_counts)


Number of rows in each label after oversampling:
label
2.0    900
0.0    900
6.0    900
1.0    900
4.0    900
3.0    900
7.0    900
5.0    900
Name: count, dtype: int64


In [17]:
# Bring every class in the validation split up to 100 rows.
# NOTE(review): duplicating validation rows changes the data the validation
# metrics are computed on; confirm this is intended.
val_df = oversample_minority_classes(val_df, target_size=100)

# Report the per-label row counts of the oversampled validation split.
print("Number of rows in each label after validation oversampling:")
val_label_counts = val_df['label'].value_counts()
print(val_label_counts)

Number of rows in each label after validation oversampling:
label
2.0    100
0.0    100
6.0    100
1.0    100
4.0    100
3.0    100
7.0    100
5.0    100
Name: count, dtype: int64


In [18]:
# Save the three splits as CSV files. The output directory is created first
# because to_csv does not create missing parent directories.
output_dir = Path('./processed-data')
output_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(output_dir / 'train.csv', index=False)
val_df.to_csv(output_dir / 'val.csv', index=False)
test_df.to_csv(output_dir / 'test.csv', index=False)