In [1]:
import requests
import json
import pandas as pd
from openai import OpenAI
import os
from tqdm import tqdm
import random

# Load StereoSet dev.json file

In [2]:
stereo_url = "https://raw.githubusercontent.com/moinnadeem/StereoSet/master/data/dev.json"
response = requests.get(stereo_url)
stereo_json = json.loads(response.text)

# Combine intrasentence and intersentence data

In [3]:
data = stereo_json['data']['intrasentence'] + stereo_json['data']['intersentence']

# Filter for gender bias_type (primary)

In [4]:
gender_data = [item for item in data if item['bias_type'] == 'gender']

In [5]:
len(gender_data)

497

# Filter for profession bias_type (to combine, as it often intersects with gender stereotypes in occupations)

In [6]:
profession_data = [item for item in data if item['bias_type'] == 'profession']

In [7]:
len(profession_data)

1637

# Gender keywords to filter profession domain for gender-related stereotypes (e.g., texts implying male/female in occupations)

In [8]:
gender_keywords = [
    'he', 'she', 'him', 'her', 'his', 'hers', 'man', 'men', 'woman', 'women', 'male', 'males', 'female', 'females', 'boy', 'boys', 'girl', 'girls', 'gentleman', 'gentlemen', 'lady', 'ladies', 'sir', 'madam', 'mr', 'mrs', 'ms', 'miss', 'guy', 'guys', 'gal', 'gals', 'dude', 'dudes', 'chick', 'chicks'
]

# Expand gender data to labeled texts (all, no profession filter to maximize)

In [None]:
gender_rows = []
for item in tqdm(gender_data, desc="Processing StereoSet gender data"):
    context = item['context']
    
    placeholder = '[BLANK]' if '[BLANK]' in context else 'BLANK' if 'BLANK' in context else None
    for sentence in item['sentences']:
        if placeholder:
            text = context.replace(placeholder, sentence['sentence']).strip()  
        else:
            text = context + ' ' + sentence['sentence']
        
        label = 1 if sentence['gold_label'] == 'stereotype' else 0
        gender_rows.append({'text': text, 'label': label})

Processing StereoSet gender data: 100%|██████████| 497/497 [00:00<00:00, 83997.63it/s]


In [10]:
len(gender_rows)

1491

# Expand profession data and filter for gender implications

In [None]:
profession_rows = []
for item in tqdm(profession_data, desc="Processing StereoSet profession data"):
    context = item['context']
    
    placeholder = '[BLANK]' if '[BLANK]' in context else 'BLANK' if 'BLANK' in context else None
    for sentence in item['sentences']:
        if placeholder:
            text = context.replace(placeholder, sentence['sentence']).strip()
        else:
            text = context + ' ' + sentence['sentence']
        
        
        if any(keyword.lower() in text.lower() for keyword in gender_keywords):
            label = 1 if sentence['gold_label'] == 'stereotype' else 0
            profession_rows.append({'text': text, 'label': label})

Processing StereoSet profession data: 100%|██████████| 1637/1637 [00:00<00:00, 78587.99it/s]


In [12]:
len(profession_rows)

4583

# Combine rows from both domains

In [13]:
all_rows = gender_rows + profession_rows

In [14]:
len(all_rows)

6074

# Create Pandas DataFrame

In [15]:
stereo_df = pd.DataFrame(all_rows)

# remove duplicates with exact matches

In [16]:
stereo_df = stereo_df.drop_duplicates(subset=['text'], keep='first')

In [17]:
len(stereo_df)

6063

# Balance the classes,Oversample biased (label=1) to match or exceed unbiased (label=0)

In [18]:
label_counts = stereo_df['label'].value_counts()
unbiased_count = label_counts.get(0, 0)
biased_count = label_counts.get(1, 0)

# Oversample biased rows

In [None]:
if biased_count < unbiased_count:
    
    biased_df = stereo_df[stereo_df['label'] == 1]
    oversample_size = unbiased_count - biased_count
    oversampled_biased = biased_df.sample(n=oversample_size, replace=True, random_state=42)  # Replace=True for duplication if needed
    stereo_df = pd.concat([stereo_df, oversampled_biased], ignore_index=True)

In [20]:
stereo_df = stereo_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(len(stereo_df))

8062


# Since we are looking for gender stereotypes where the original count was around 505 , lets sample it to a closer value of 516.
# If more than 516, sample down to ~516 while preserving balance

In [21]:
if len(stereo_df) > 516:
    # Stratified sample to maintain ratio
    stereo_df = stereo_df.groupby('label', group_keys=False).apply(lambda x: x.sample(frac=516/len(stereo_df))).reset_index(drop=True)

  stereo_df = stereo_df.groupby('label', group_keys=False).apply(lambda x: x.sample(frac=516/len(stereo_df))).reset_index(drop=True)


In [22]:
stereo_df['label'].value_counts()

label
0    258
1    258
Name: count, dtype: int64

In [None]:
stereo_df.to_csv('stereoset_data.csv', index=False)