In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import load_dataset


In [2]:
# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` to the train/validation/test splits further down.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

# Load the dataset by its identifier; the 'train' entry of the returned
# object is what the next cell converts into a DataFrame.
dataset = load_dataset(path="tdavidson/hate_speech_offensive")


In [3]:
# Prepare the binary-classification splits: build a two-column frame from the
# loaded dataset, binarise the label, split 80/10/10 with stratification,
# write each split to CSV, and print size / class-balance summaries.

# Fractions for the two-stage split, named so they are easy to find and tune.
HOLDOUT_FRACTION = 0.2       # share of all rows held out of training
TEST_SHARE_OF_HOLDOUT = 0.5  # share of the held-out rows that become the test set

# Convert to pandas DataFrame, keeping only the 'tweet' and 'class' columns.
# The explicit .copy() makes `df` an independent frame, so the column
# assignment below cannot trigger pandas' SettingWithCopyWarning.
df = pd.DataFrame(dataset['train'])[['tweet', 'class']].copy()

# Convert class labels (class 2 -> 0, others -> 1) with a vectorised
# comparison instead of a per-row Python lambda; int64 matches the dtype the
# lambda version produced.
df['class'] = df['class'].ne(2).astype('int64')

# First split: train + validation_test (80% / 20%)
train_df, temp_df = train_test_split(
    df,
    test_size=HOLDOUT_FRACTION,
    random_state=RANDOM_SEED,
    stratify=df['class'],
)

# Second split: validation + test (50% / 50% of the remaining 20%)
val_df, test_df = train_test_split(
    temp_df,
    test_size=TEST_SHARE_OF_HOLDOUT,
    random_state=RANDOM_SEED,
    stratify=temp_df['class'],
)

# Sanity checks: the three splits must partition `df` (no row lost, none shared).
assert len(train_df) + len(val_df) + len(test_df) == len(df), \
    "splits do not cover the full dataset"
assert train_df.index.intersection(temp_df.index).empty, \
    "training rows leaked into the held-out set"
assert val_df.index.intersection(test_df.index).empty, \
    "validation and test sets overlap"

# (display label, frame, output file) for each split, in reporting order.
splits = [
    ("Training", train_df, 'train_data.csv'),
    ("Validation", val_df, 'val_data.csv'),
    ("Test", test_df, 'test_data.csv'),
]

# Save to CSV files (index dropped, so only 'tweet' and 'class' are written)
for _, split_df, csv_path in splits:
    split_df.to_csv(csv_path, index=False)

# Print dataset statistics
print("\nDataset Statistics:")
print(f"Total samples: {len(df)}")
for split_label, split_df, _ in splits:
    print(f"{split_label} samples: {len(split_df)} ({len(split_df)/len(df)*100:.1f}%)")

# Print class distribution (as proportions, to confirm stratification held)
print("\nClass Distribution:")
for split_label, split_df, _ in splits:
    print(f"\n{split_label} set:")
    print(split_df['class'].value_counts(normalize=True))


Dataset Statistics:
Total samples: 24783
Training samples: 19826 (80.0%)
Validation samples: 2478 (10.0%)
Test samples: 2479 (10.0%)

Class Distribution:

Training set:
class
1    0.832039
0    0.167961
Name: proportion, dtype: float64

Validation set:
class
1    0.832123
0    0.167877
Name: proportion, dtype: float64

Test set:
class
1    0.831787
0    0.168213
Name: proportion, dtype: float64
