In [4]:
import json
import pandas as pd
import requests
from io import StringIO

def download_dataset(task, dataset_type="raw"):
    """
        Inputs:
            - task (string): Task name (either "social-acceptability" or "toxicity")
            - dataset_type (string): Dataset variant to request; sent as the `type` query parameter
              (callers in this file pass "raw" or "processed")
        Outputs:
            - Downloads the dataset corresponding to the task name from the LabintheWild API and
              writes it to ./data/nlpositionality_{task}_{dataset_type}.csv. Returns nothing.
        Raises:
            - requests.HTTPError: if the API answers with a 4xx/5xx status code
    """
    # The toxicity task is served from its own sub-domain; every other task uses the "delphi" one.
    url = "https://{}-litw.apps.allenai.org/api/v1/dataset?type={}".format(
        task if task == nlpositionality.TOXICITY else "delphi",
        dataset_type)
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing the error body and saving it as the dataset.
    response.raise_for_status()

    df = pd.read_csv(StringIO(response.text), sep=",")
    df.to_csv('./data/nlpositionality_{}_{}.csv'.format(task, dataset_type), index=False)

def process_litw_data(old_df):
    """
        Inputs:
            - old_df (DataFrame): Joined dataset with raw demographic values, LabintheWild and dataset annotations, and model predictions
        Outputs:
            - df (DataFrame): Copy of old_df whose demographic columns have been replaced by their processed
              categories; old_df itself is left untouched
    """
    def language_group(raw):
        # Null-safe language value, then bucketed into "english" / "not english"
        return __filter_language_english__(__process_language__(raw))

    def country_sphere(raw):
        # Null-safe country value, then mapped to its cultural sphere
        return __filter_country_sphere__(__process_country__(raw))

    # (column name, per-value processor), applied in this order
    column_processors = (
        ('age', __process_age__),
        ('gender', __process_gender__),
        ('ethnicity', __process_ethnicities__),
        ('religion', __process_religion__),
        ('education', __process_education__),
        ('native_language', language_group),
        ('country_longest', country_sphere),
        ('country_residence', country_sphere),
    )

    df = old_df.copy()
    for column, processor in column_processors:
        df[column] = df[column].apply(processor)

    return df

def __process_age__(val):
    """
        Inputs:
            - val (int): age value to be processed
        Outputs:
            - (string): ten-year age group the value belongs to ("10-20" up to "70-80", lower bound inclusive),
              "> 80" for 80 and above; "None" if the value is null or below 10
    """
    if pd.isnull(val):
        return 'None'

    if val >= 80:
        return '> 80'

    # Walk the decades 10-20, 20-30, ..., 70-80
    for lower in range(10, 80, 10):
        upper = lower + 10
        if lower <= val < upper:
            return '{}-{}'.format(lower, upper)

    return 'None'

def __process_gender__(val):
    """
        Inputs:
            - val (string): gender value to be processed
        Outputs:
            - (string): gender category the value belongs to ("man", "woman", "non-binary"); "None" if the value is null or not in one of the three categories
    """
    # Exact (case-sensitive) matches for the three categories
    for category in ('man', 'woman', 'non-binary'):
        if val == category:
            return category

    # Nulls must be ruled out before any string operation
    if pd.isnull(val):
        return 'None'

    # Free-text answers mentioning these identities are grouped under non-binary for now
    lowered = val.lower()
    if 'agender' in lowered or 'genderfluid' in lowered:
        return 'non-binary'

    # Everything else (random fillings, no response) is treated as missing
    return 'None'

def __process_ethnicities__(val):
    """
        Inputs:
            - val (string): ethnicity value to be processed
        Outputs:
            - (string): lower-cased first label (text before the first comma) of the ethnicity option;
              "None" if the value is null or is not exactly one of the predefined options
    """
    known_options = (
        'white',
        'asian, asian american',
        'pacific islander, native australian',
        'black, african american',
        'latino/latina, hispanic',
        'mixed',
        'Arab-american',
        'native american, american indian, alaska native',
    )
    if val not in known_options:
        return "None"

    first_label, _, _ = val.partition(',')
    return first_label.lower()

def __process_religion__(val):
    """
        Inputs:
            - val (string): religion value to be processed
        Outputs:
            - (string): religion category the value belongs to: "christian", "spiritual", or the value itself
              (original casing kept) when it is hindu/buddhist/muslim/jew; "None" if the value is null or
              not in one of the predefined categories
    """
    if pd.isnull(val):
        return 'None'

    lowered = val.lower()

    # Named denominations, plus any free-text answer that mentions "christian"
    christian_names = ("roman catholic", "protestant", "orthodox", "christian", "baptist")
    if lowered in christian_names or "christian" in lowered:
        return "christian"

    if lowered in ("agnostic theist", "spiritual", "paganism"):
        return "spiritual"

    # Matched case-insensitively, but returned exactly as given
    if lowered in ("hindu", "buddhist", "muslim", "jew"):
        return val

    return "None"

def __process_education__(val):
    """
        Inputs:
            - val (string): education value to be processed
        Outputs:
            - (string): the value itself (original casing kept) when it matches one of the predefined
              education levels case-insensitively; "None" if the value is null or matches none of them
    """
    if pd.isnull(val):
        return 'None'

    known_levels = ("college", "high school", "graduate school", "phd", "professional school", "pre-high school")
    return val if val.lower() in known_levels else "None"

def __process_language__(val):
    """
        Inputs:
            - val (string): language value to be processed
        Outputs:
            - (string): the language value unchanged, or the string "None" when the value is null
    """
    return 'None' if pd.isnull(val) else val

def __filter_language_english__(val):
    """
        Inputs:
            - val (string): language value to be processed (nulls already replaced by "None")
        Outputs:
            - (string): "english" when the value is exactly "English", "None" when the value is "None",
              and "not english" for every other value
    """
    if val == 'English':
        return 'english'

    # The missing-value marker passes through untouched; anything else is a non-English language
    return val if val == 'None' else 'not english'

def __process_country__(val):
    """
        Inputs:
            - val (string): country value to be processed
        Outputs:
            - (string): the country value unchanged, or the string "None" when the value is null
    """
    return 'None' if pd.isnull(val) else val

# Parsed sphere mappings keyed by file path, so the JSON file is read once per process
# instead of once per DataFrame row.
_SPHERES_BY_PATH = {}


def __load_spheres__(path='./data/spheres.json'):
    """
        Inputs:
            - path (string): location of the JSON file mapping country names to cultural spheres
        Outputs:
            - (dict): parsed content of the file; read from disk on the first call only, then served from memory
    """
    if path not in _SPHERES_BY_PATH:
        # Context manager closes the handle even if parsing fails
        with open(path, encoding='utf-8') as handle:
            _SPHERES_BY_PATH[path] = json.load(handle)
    return _SPHERES_BY_PATH[path]


def __filter_country_sphere__(val):
    """
        Inputs:
            - val (string): country value to be processed
        Outputs:
            - (string): country category the value belongs to based on cultural spheres; "None" if the value is null or not in one of the predefined categories
        Raises:
            - OSError / json.JSONDecodeError: if ./data/spheres.json is missing or is not valid JSON
    """
    spheres = __load_spheres__()
    if val not in spheres:
        return 'None'
    return spheres[val]

In [5]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # Supress pandas warnings
from scipy import stats
from statsmodels.stats.multitest import multipletests

# Task names accepted by get_pearson_rs.
SOCIAL_ACCEPTABILITY = "social-acceptability"
TOXICITY = "toxicity"

# Dataset/model names accepted by get_pearson_rs; each one is also used there as the
# name of the DataFrame column that is correlated against the 'litw' column.
# get_pearson_rs accepts SOCIAL_CHEM, DELPHI and GPT4 for the social-acceptability task,
# and GPT4, DYNAHATE, HATEROBERTA, PERSPECTIVE_API and REWIRE for the toxicity task.
SOCIAL_CHEM = "socialchem"
DELPHI = "delphi"
GPT4 = "gpt4"
DYNAHATE = "dynahate"
HATEROBERTA = "hateroberta"
PERSPECTIVE_API = "perspective"
REWIRE = "rewire"

def get_pearson_rs(task, model_or_dataset_name, dataset_type="raw"):
    """
        Inputs:
            - task (string): Task name (either "social-acceptability" or "toxicity")
            - model_or_dataset_name (string): Name of a model or dataset (either "socialchem", "delphi", "gpt4", "dynahate",
              "hateroberta", "perspective", or "rewire")
            - dataset_type (string): "raw" to load the raw CSV and process its demographic columns here,
              or "processed" to load the already-processed CSV as is
        Outputs:
            - pearson_rs (DataFrame): DataFrame representing the Pearson's r coefficients and p-values between the dataset
              labels/model scores and LabintheWild volunteer annotations by demographic. One row per demographic group
              with string columns "demographic", "pearson's r" (two decimals, suffixed with "*" when significant
              after Bonferroni correction) and "p-value" (the corrected p-value).
        Raises:
            - ValueError: if the task/model combination or the dataset type is not supported

    """
    is_valid_social_acceptability = task == SOCIAL_ACCEPTABILITY and model_or_dataset_name in [SOCIAL_CHEM, DELPHI, GPT4]
    is_valid_toxicity = task == TOXICITY and model_or_dataset_name in [GPT4, DYNAHATE, HATEROBERTA, PERSPECTIVE_API, REWIRE]
    if not (is_valid_toxicity or is_valid_social_acceptability):
        raise ValueError('Invalid task name or model or dataset name')

    # Both CSV files are expected to have been downloaded via the API beforehand
    if dataset_type == "processed":
        df = pd.read_csv('data/nlpositionality_{}_processed.csv'.format(task))
    elif dataset_type == "raw":
        df = pd.read_csv('data/nlpositionality_{}_raw.csv'.format(task))
        df = utils.process_litw_data(df)
    else:
        raise ValueError('Invalid dataset type {!r}: expected "raw" or "processed"'.format(dataset_type))

    # Groups that are never analysed
    excluded = {'None', 'mixed', 'arab-american'}

    keys = []
    coefficients = []
    pvalues = []
    for c in ['country_longest', 'education', 'ethnicity', 'gender', 'native_language', 'age', 'country_residence', 'religion']:
        # All demographic groups under a category, in sorted order. NaN is dropped because it
        # cannot be ordered against string labels and never matches `df[c] == d` anyway.
        demo = sorted(d for d in df[c].dropna().unique() if d not in excluded)

        for d in demo:
            ndf = df[df[c] == d] # Get all the instances from a demographic group
            dndf = __mean_df__(ndf, model_or_dataset_name) # Average responses in a demographic group
            r, p = stats.pearsonr(dndf['litw'], dndf[model_or_dataset_name]) # Compute Pearson R values

            keys.append(c + '_' + d)
            coefficients.append(r)
            pvalues.append(p)

    # Apply Bonferroni correction
    alpha = 0.001
    hypotheses, corrected_pvalues, _, _ = multipletests(pvalues, alpha, method='bonferroni', is_sorted=False, returnsorted=False)

    data = []
    for key, r, p, rejected in zip(keys, coefficients, corrected_pvalues, hypotheses):
        # Always two decimals (also for negative or NaN coefficients), starred when significant
        value = '{:.2f}'.format(r) + ('*' if rejected else '')

        data.append({
            "demographic": key,
            "pearson's r": value,
            "p-value": str(p)
        })

    pearson_rs = pd.DataFrame(data=data) # Convert the data to a DataFrame
    return pearson_rs

def __mean_df__(df, model_or_dataset_name):
    """
        Inputs:
            - df (DataFrame): annotations of one demographic group; must contain the columns 'action', 'litw'
              and model_or_dataset_name
            - model_or_dataset_name (string): name of the column holding the dataset labels / model scores
        Outputs:
            - ddf (DataFrame): one row per action with columns 'action', 'litw' (group mean rounded to the
              nearest integer) and model_or_dataset_name (group mean)
    """
    # Select the two numeric columns BEFORE averaging: taking the mean over every column fails on
    # pandas >= 2.0 as soon as the frame contains string columns (e.g. the demographic ones).
    ddf = df.groupby(['action'])[['litw', model_or_dataset_name]].mean().reset_index()
    ddf['litw'] = ddf['litw'].apply(round)
    return ddf

In [None]:
# (task, dataset-or-model) pairs evaluated by run_experiments, in execution order
experiments = (
    [(SOCIAL_ACCEPTABILITY, name) for name in (SOCIAL_CHEM, DELPHI, GPT4)]
    + [(TOXICITY, name) for name in (DYNAHATE, PERSPECTIVE_API, REWIRE, HATEROBERTA, GPT4)]
)

def run_experiments(dataset_type):
    """
        Inputs:
            - dataset_type (string): dataset variant to download and analyse (e.g. "raw" or "processed")
        Outputs:
            - Downloads the datasets of both tasks, then prints, for every (task, dataset/model) pair in
              `experiments`, the pair followed by its table of Pearson's r values.
    """
    for task_name in (SOCIAL_ACCEPTABILITY, TOXICITY):
        download_dataset(task_name, dataset_type=dataset_type)

    for task, dataset_or_model in experiments:
        print(task, dataset_or_model)
        print(get_pearson_rs(task, dataset_or_model, dataset_type=dataset_type))

# Script entry point: run every experiment on the selected dataset variant.
if __name__ == "__main__":
    dataset_type = "raw" # Dataset with raw demographic values (processed locally by get_pearson_rs)
    # dataset_type = "processed" # Dataset with processed demographic values
    run_experiments(dataset_type)