# Workplace Dataset

In [15]:
from datasets import load_dataset, DatasetDict
from datasets import ClassLabel

# Fixed seed so the stratified splits below come out identical on every
# Restart & Run All; unseeded, each run would produce (and later publish)
# a different train/validation/test partition.
SPLIT_SEED = 42

# Only the 'train' split of the Hub dataset is used as raw material.
raw_datasets = load_dataset("lidiapierre/fr_sexism_labelled")['train']

# Normalise the column names used downstream and drop the 'Unnamed: 0' column
# (presumably a leftover CSV index -- confirm against the source dataset).
raw_datasets = raw_datasets.rename_column('Label', 'label_sexist')
raw_datasets = raw_datasets.rename_column('Sentences', 'text')
raw_datasets = raw_datasets.remove_columns('Unnamed: 0')

# Re-type the label column as a ClassLabel with human-readable names; the
# stratified split below is keyed on this column.
new_features = raw_datasets.features.copy()
new_features['label_sexist'] = ClassLabel(names=["not sexist", "sexist"])
raw_datasets = raw_datasets.cast(new_features)

# First hold out 10% as the test set, then carve 10% of the remainder off as
# the validation set. Both splits are stratified on the label.
split_dataset = raw_datasets.train_test_split(
    test_size=0.1, stratify_by_column='label_sexist', seed=SPLIT_SEED
)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']
split_again = train_dataset.train_test_split(
    test_size=0.1, stratify_by_column='label_sexist', seed=SPLIT_SEED
)
train_dataset = split_again['train']
validation_dataset = split_again['test']

# Sanity check: the three splits must partition the full dataset.
assert (
    len(train_dataset) + len(validation_dataset) + len(test_dataset)
    == len(raw_datasets)
), "splits do not add up to the full dataset"

In [18]:
# Bundle the three splits under the conventional split names, then show the
# schema of the training split.
dataset_dict = DatasetDict()
dataset_dict['train'] = train_dataset
dataset_dict['validation'] = validation_dataset
dataset_dict['test'] = test_dataset

train_schema = dataset_dict['train'].features
print(train_schema)

{'text': Value(dtype='string', id=None), 'label_sexist': ClassLabel(names=['not sexist', 'sexist'], id=None), 'fr_sentences': Value(dtype='string', id=None)}


In [20]:
# Display the DatasetDict summary: split names, feature columns and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label_sexist', 'fr_sentences'],
        num_rows: 920
    })
    validation: Dataset({
        features: ['text', 'label_sexist', 'fr_sentences'],
        num_rows: 103
    })
    test: Dataset({
        features: ['text', 'label_sexist', 'fr_sentences'],
        num_rows: 114
    })
})

In [22]:
import json

import huggingface_hub

# The Hugging Face token is read from a local JSON file so that it never
# appears in the notebook itself.
file_path = "credentials.json"

with open(file_path, 'r', encoding='utf-8') as json_file:
    token_data = json.load(json_file)

token = token_data.get("huggingface_token")
if not token:
    # Stop here instead of only printing: otherwise login(None) and the upload
    # below would still be attempted without the intended credentials.
    raise ValueError("Error: Token not found in the JSON file.")
print("Token read successfully")

# Authenticate, then publish all three splits of the workplace dataset.
huggingface_hub.login(token)

dataset_dict.push_to_hub("yangezheng/lidiapierre-fr_sexism_labelled")

Token read successfully
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/yzheng/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

# CMSB Dataset

In [91]:
from datasets import load_dataset

# Read the local CSV, discard the columns that are not needed downstream and
# rename the target column to the name shared with the other dataset.
cmsb_dataset = (
    load_dataset("csv", data_files="sexism_data.csv")
    .remove_columns(['id','dataset','of_id'])
    .rename_column('sexist','label_sexist')
)

# The CSV loader exposes everything under the 'train' split; encode the label
# column as a ClassLabel and print the resulting schema.
dataset = cmsb_dataset['train'].class_encode_column("label_sexist")
print(dataset.features)



{'text': Value(dtype='string', id=None), 'toxicity': Value(dtype='float64', id=None), 'label_sexist': ClassLabel(names=['False', 'True'], id=None)}


In [92]:

from datasets import ClassLabel, Value

# Fixed seed so the stratified splits below come out identical on every
# Restart & Run All; unseeded, each run would produce (and later publish)
# a different train/validation/test partition.
SPLIT_SEED = 42

# Replace the auto-generated label names with human-readable ones; the class
# order (index 0, index 1) is kept.
new_features = dataset.features.copy()
new_features['label_sexist'] = ClassLabel(names=["not sexist", "sexist"])
new_dataset = dataset.cast(new_features)

# First hold out 10% as the test set, then carve 10% of the remainder off as
# the validation set. Both splits are stratified on the label.
split_dataset = new_dataset.train_test_split(
    test_size=0.1, stratify_by_column='label_sexist', seed=SPLIT_SEED
)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']
split_again = train_dataset.train_test_split(
    test_size=0.1, stratify_by_column='label_sexist', seed=SPLIT_SEED
)
train_dataset = split_again['train']
validation_dataset = split_again['test']

# Sanity check: the three splits must partition the full dataset.
assert (
    len(train_dataset) + len(validation_dataset) + len(test_dataset)
    == len(new_dataset)
), "splits do not add up to the full dataset"

In [93]:
# Pair each split name with its dataset and wrap the result in a DatasetDict,
# then print the summary (splits, columns, row counts).
dataset_dict = DatasetDict(
    zip(
        ('train', 'validation', 'test'),
        (train_dataset, validation_dataset, test_dataset),
    )
)
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'toxicity', 'label_sexist'],
        num_rows: 11040
    })
    validation: Dataset({
        features: ['text', 'toxicity', 'label_sexist'],
        num_rows: 1227
    })
    test: Dataset({
        features: ['text', 'toxicity', 'label_sexist'],
        num_rows: 1364
    })
})


In [94]:
import json

import huggingface_hub

# The Hugging Face token is read from a local JSON file so that it never
# appears in the notebook itself.
file_path = "credentials.json"

with open(file_path, 'r', encoding='utf-8') as json_file:
    token_data = json.load(json_file)

token = token_data.get("huggingface_token")
if not token:
    # Stop here instead of only printing: otherwise login(None) and the upload
    # below would still be attempted without the intended credentials.
    raise ValueError("Error: Token not found in the JSON file.")
print("Token read successfully")

# Authenticate, then publish all three splits of the CMSB dataset.
huggingface_hub.login(token)

dataset_dict.push_to_hub("yangezheng/CMSB")

Token read successfully
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/yzheng/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/663 [00:00<?, ?B/s]

# Upload to HuggingFace

In [6]:
import json

# The Hugging Face token is read from a local JSON file so that it never
# appears in the notebook itself.
file_path = "credentials.json"

with open(file_path, 'r', encoding='utf-8') as json_file:
    token_data = json.load(json_file)

token = token_data.get("huggingface_token")
if not token:
    # Fail loudly: a printed message alone would let a Run All continue into
    # the login/upload cell with token set to None.
    raise ValueError("Error: Token not found in the JSON file.")
print("Token read successfully")

Token read successfully


In [7]:
import huggingface_hub

# Authenticate with the token read in the previous cell.
huggingface_hub.login(token)

# NOTE(review): hidden-state hazard. In top-to-bottom order `dataset_dict` is
# the CMSB DatasetDict built earlier in this notebook, yet the recorded output
# of this cell (17 parquet batches for the first split, versus 12 for the CMSB
# push) and its low execution count suggest a different dataset was in memory
# when it was run. The cell that built that dataset is not visible here --
# confirm and restore it before re-running, or this will upload CMSB data to
# the repository below.
dataset_dict.push_to_hub("yangezheng/tum-nlp-sexism-socialmedia-balanced")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/yzheng/.cache/huggingface/token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]