In [1]:
# Importing dependencies
import random
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Generating synthetic dataset

professions = {
    "male": ["engineer", "lawyer", "architect", "scientist"],
    "female": ["nurse", "teacher", "librarian", "receptionist"]
}

def generate_bio(gender: int, profession: str) -> str:
    pronoun = "He" if gender == 0 else "She"
    name = random.choice(["Alex", "Sam", "Jamie", "Taylor"])
    return f"{name} is a {profession}. {pronoun} has 5 years of experience."

def generate_dataset(n=1000, bias_ratio=0.9):
    data = []
    for _ in range(n):
        if random.random() < bias_ratio:
            # Correlate gender with stereotypical profession
            gender = random.randint(0, 1)
            prof = random.choice(professions["male" if gender == 0 else "female"])
        else:
            # Anti-stereotypical example
            gender = random.randint(0, 1)
            prof = random.choice(professions["female" if gender == 0 else "male"])

        bio = generate_bio(gender, prof)
        data.append({
            "bio": bio,
            "profession": prof,
            "gender": gender
        })
    return data

synthetic_data = generate_dataset(n=5000, bias_ratio=0.85)
hf_dataset = Dataset.from_list(synthetic_data)

In [23]:
import pandas as pd

# creating the dataframe object
df = []
for gender_data in list(hf_dataset):
    small_df = pd.DataFrame.from_dict({
        "bio": [gender_data['bio']],
        "gender": [gender_data['gender']]
    })
    df.append(small_df)
output_df = pd.concat(df)
output_df.head()

Unnamed: 0,bio,gender
0,Jamie is a librarian. She has 5 years of exper...,1
0,Taylor is a lawyer. He has 5 years of experience.,0
0,Sam is a scientist. He has 5 years of experience.,0
0,Taylor is a receptionist. She has 5 years of e...,1
0,Taylor is a architect. He has 5 years of exper...,0


In [25]:
# saving dataframe into csv file

file_name = "biased_gender_data_synthetic.csv"
output_df.to_csv(file_name, index=False)