In [1]:
import pandas as pd
import pickle as pkl
from tqdm import tqdm
import sys
import os
import json

import argparse
from src.Dialects import AfricanAmericanVernacular
from src.Dialects import NigerianDialect
from src.Dialects import HongKongDialect
from src.Dialects import ColloquialSingaporeDialect
from src.Dialects import AfricanAmericanVernacular
from src.Dialects import ColloquialSingaporeDialect
from src.Dialects import IndianDialect

  from .autonotebook import tqdm as notebook_tqdm


## Davidson's Twitter data 

In [23]:
# Davidson Twitter data: the "tweet" column of this frame is what
# transform_to_dialect reads as the original sentences.
df = pd.read_csv(
    "./davidson_og_all.csv",
    sep=",",
    index_col=False,
)

def transform_to_dialect(dialect, df, dialect_name, n_samples=1000):
    """Convert the leading rows of ``df["tweet"]`` to a dialect and save them as JSON Lines.

    Parameters
    ----------
    dialect
        Object exposing ``convert_sae_to_dialect(text)`` and an
        ``executed_rules`` mapping whose values carry a ``"type"`` key; the
        mapping is read right after each conversion.
    df : pandas.DataFrame
        Frame with a ``"tweet"`` column holding the original sentences.
    dialect_name : str
        Stem of the output file. Records are written to
        ``{dialect_name}.jsonl`` in the current working directory; an
        existing file with that name is overwritten.
    n_samples : int or None, default 1000
        Maximum number of leading rows to convert. ``None`` converts every
        row. The value is capped at ``len(df)``, so frames shorter than
        ``n_samples`` are processed in full instead of raising.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.
    """
    tweets = df["tweet"]
    if n_samples is None:
        limit = len(tweets)
    else:
        limit = max(0, min(n_samples, len(tweets)))

    sents = []  # one {"text": ..., "rules": [...]} record per input row

    # .iloc selects by position, so this works for any index, not only the
    # default 0..n-1 range that a plain df["tweet"][i] lookup depends on.
    for sent in tqdm(tweets.iloc[:limit], desc="Processing"):
        sent_dict = {}
        sent_dict["text"] = dialect.convert_sae_to_dialect(sent)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # saved rule list is identical from run to run (a set is not ordered).
        sent_dict["rules"] = list(
            dict.fromkeys(rule["type"] for rule in dialect.executed_rules.values())
        )
        sents.append(sent_dict)

    with open(f'{dialect_name}.jsonl', 'w') as outfile:
        for entry in sents:
            json.dump(entry, outfile)
            outfile.write('\n')

    return True

In [22]:
# Instantiate the AAVE transform module, convert the data, save to aave.jsonl
aave = AfricanAmericanVernacular()
transform_to_dialect(dialect=aave, df=df, dialect_name="aave")

# Instantiate the Nigerian dialect transform module, convert, save to nigerianD.jsonl
ngd = NigerianDialect()
transform_to_dialect(dialect=ngd, df=df, dialect_name="nigerianD")

# Instantiate the Indian dialect transform module, convert, save to indianD.jsonl
# (HongKongDialect is imported at the top of the notebook but is not run in this cell)
indd = IndianDialect()
transform_to_dialect(dialect=indd, df=df, dialect_name="indianD")

# Instantiate the Singlish (Colloquial Singapore) transform module, convert, save to singlish.jsonl
csgd = ColloquialSingaporeDialect()
transform_to_dialect(dialect=csgd, df=df, dialect_name="singlish")

Processing: 100%|██████████| 1000/1000 [00:14<00:00, 68.72it/s]
Processing: 100%|██████████| 1000/1000 [00:15<00:00, 64.76it/s]
Processing: 100%|██████████| 1000/1000 [00:27<00:00, 36.80it/s]
Processing: 100%|██████████| 1000/1000 [00:23<00:00, 42.64it/s]


True

## HateXplain Twitter data

In [32]:
# HateXplain data: read the JSON file and transpose it so that
# "post_tokens" is available as a column (it is indexed that way below).
hatexplain_path = "./hatexplain_original.json"
df = pd.read_json(hatexplain_path).T

In [29]:
def transform_to_dialect(dialect, df, dialect_name, n_samples=1000):
    """Convert the leading posts of ``df["post_tokens"]`` to a dialect and save them as JSON Lines.

    Each post is stored as a sequence of tokens; the tokens are joined with a
    single space to rebuild the sentence before conversion.

    Parameters
    ----------
    dialect
        Object exposing ``convert_sae_to_dialect(text)`` and an
        ``executed_rules`` mapping whose values carry a ``"type"`` key; the
        mapping is read right after each conversion.
    df : pandas.DataFrame
        Frame with a ``"post_tokens"`` column of token sequences.
    dialect_name : str
        Stem of the output file. Records are written to
        ``{dialect_name}.jsonl`` in the current working directory; an
        existing file with that name is overwritten.
    n_samples : int or None, default 1000
        Maximum number of leading rows to convert. ``None`` converts every
        row. The value is capped at ``len(df)``, so frames shorter than
        ``n_samples`` are processed in full instead of raising.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.
    """
    posts = df["post_tokens"]
    if n_samples is None:
        limit = len(posts)
    else:
        limit = max(0, min(n_samples, len(posts)))

    sents = []  # one {"text": ..., "rules": [...]} record per input row

    # Walk the rows by position. The previous version read row 0 on every
    # iteration, so the same first post was converted over and over.
    for tokens in tqdm(posts.iloc[:limit], desc="Processing"):
        sent = " ".join(tokens)  # rebuild the original sentence

        sent_dict = {}
        sent_dict["text"] = dialect.convert_sae_to_dialect(sent)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # saved rule list is identical from run to run (a set is not ordered).
        sent_dict["rules"] = list(
            dict.fromkeys(rule["type"] for rule in dialect.executed_rules.values())
        )
        sents.append(sent_dict)

    with open(f'{dialect_name}.jsonl', 'w') as outfile:
        for entry in sents:
            json.dump(entry, outfile)
            outfile.write('\n')

    return True

In [4]:
# Preview the first four posts as plain sentences.
# .iloc selects by position; a bare integer in [] on a Series is a label
# lookup and only falls back to position through a deprecated pandas path.
for tokens in df["post_tokens"].iloc[:4]:
    print(" ".join(tokens))

i dont think im getting my baby them white 9 he has two white j and nikes not even touched
we cannot continue calling ourselves feminists if the rights of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum
nawt yall niggers ignoring me
<user> i am bit confused coz chinese ppl can not access twitter thn how this ching chong using it i think he pakistani ðŸ¤” ðŸ¤” ðŸ¤”


In [30]:
# NOTE(review): these calls reuse the output names "aave", "nigerianD", "indianD"
# and "singlish" from the Davidson run earlier in the notebook, and
# transform_to_dialect opens "{dialect_name}.jsonl" in 'w' mode, so the files
# written by that earlier run are overwritten here. Confirm this is intended.

# Instantiate the AAVE transform module, convert the data, save to aave.jsonl
aave = AfricanAmericanVernacular()
transform_to_dialect(dialect=aave, df=df, dialect_name="aave")

# Instantiate the Nigerian dialect transform module, convert, save to nigerianD.jsonl
ngd = NigerianDialect()
transform_to_dialect(dialect=ngd, df=df, dialect_name="nigerianD")

# Instantiate the Indian dialect transform module, convert, save to indianD.jsonl
# (HongKongDialect is imported at the top of the notebook but is not run in this cell)
indd = IndianDialect()
transform_to_dialect(dialect=indd, df=df, dialect_name="indianD")

# Instantiate the Singlish (Colloquial Singapore) transform module, convert, save to singlish.jsonl
csgd = ColloquialSingaporeDialect()
transform_to_dialect(dialect=csgd, df=df, dialect_name="singlish")

Processing: 100%|██████████| 1000/1000 [00:13<00:00, 71.70it/s]
Processing: 100%|██████████| 1000/1000 [00:11<00:00, 90.64it/s]
Processing: 100%|██████████| 1000/1000 [00:26<00:00, 38.04it/s]
Processing: 100%|██████████| 1000/1000 [00:29<00:00, 34.39it/s]


True