In [1]:
cd ..

/Users/yasas/Documents/Projects/textkit-learn


In [2]:
from tklearn.datasets import load_dataset

from tklearn.utils import cache

# cache.clear()

In [3]:
dset = load_dataset('json', path='notebooks/data/demographic_category_hate_corpora.jsonl')

dset

Path,Format,#Rows,#Columns
/Users/yasas/.tklearn/cache/dataset-mapped-f31ad6af507cc043ab4b3b6f48b4d101,arrow,276872,6

Unnamed: 0,grouping,fold,text,target_groups,dataset,hate
0,race/ethnicity,train,"RIP, Captain. My condolences to his wife, fam...",[],civilcomments,False
1,race/ethnicity,train,There's nothing wrong with sex between two con...,[],sbic,False
2,race/ethnicity,train,Ultimately a cis gay man (like me for the reco...,"[men, transgender men, transgender women, wome...",kennedy2020,False
3,race/ethnicity,train,There most certainly are some words that I dou...,[],civilcomments,False
4,race/ethnicity,train,<user> <user> <user> <user> <user> <user> a vo...,[],hatexplain,False


In [4]:
import re

from collections import Counter

In [5]:
target_group_count = Counter()
target_group_map = {}

for row in dset:
    for surface_target_group in row['target_groups']:
        target_group = surface_target_group.replace('folks', 'people')
        target_group = re.sub(r'[^a-zA-Z0-9]', '_', target_group)
        target_group = re.sub(r'_+', '_', target_group)
        target_group_count.update((target_group,))
        if surface_target_group not in target_group_map:
            target_group_map[surface_target_group] = target_group

In [6]:
import pandas as pd

In [7]:
common_target_groups = pd.DataFrame(target_group_count.most_common(50)).iloc[:, 0].tolist()

In [8]:
def preprocess(doc):
    target_group_set = set(doc['target_groups'])
    doc_target_groups = dict()
    for target_group, cid in target_group_map.items():
        if cid in doc_target_groups:
            continue
        if target_group in target_group_set:
            doc_target_groups[cid] = True
        elif cid not in doc_target_groups:
            doc_target_groups[cid] = False
    text_input = doc['text']
    return {
        'id': hash(text_input),
        'text': text_input,
        'split': {
            'train': doc['fold'] == 'train',
            'test': doc['fold'] == 'test',
        },
        'label': doc['hate'],
        'target_group': doc_target_groups,
        '_source': doc
    }


docs = dset.map(preprocess)

In [9]:
from typing import List

from functools import partial

In [10]:
def extract_stream_order(doc, target_groups: List[str]):
    stream_order = 0  # no target groups
    all_tgt_grps = set([key for key, value in doc['target_group'].items() if value])
    for i, target_group in enumerate(target_groups):
        if target_group in all_tgt_grps:
            stream_order = max(stream_order, i + 1)
            all_tgt_grps = all_tgt_grps - {target_group}
    if len(all_tgt_grps) > 0:
        stream_order = -1
    doc['stream_order'] = stream_order
    return doc


stream = docs.map(
    partial(
        extract_stream_order,
        target_groups=common_target_groups
    ),
    batched=False,
)

In [11]:
import polars as pl

In [12]:
def sample(lf):
    sample_size_expr = (
        pl.int_range(500, 2000)
        .shuffle()
        .first()
        .over('stream_order')
        .alias('sample_size')
    )
    random_order_expr = (
        pl.int_range(0, pl.count())
        .shuffle()
        .over('stream_order')
        .alias('random')
    )
    sample_expr = random_order_expr <= sample_size_expr
    exclude_no_target_expr = pl.col('stream_order') != 0
    lf = lf.filter(sample_expr)
    lf = lf.filter(exclude_no_target_expr)
    return lf.sort(pl.col('stream_order'))


sampled_stream = stream.map(sample, mode='polars')

In [13]:
import pyarrow.compute as pc

In [14]:
sampled_stream.filter(pc.field('stream_order') == -1)

Path,Format,#Rows,#Columns
/Users/yasas/.tklearn/cache/dataset-mapped-a307f7bc92921e2ffd62e6eaa4663c1e,arrow,1774,7

Unnamed: 0,id,text,split,label,target_group,_source,stream_order
0,8767052579124479918,JESUS is\nTHE WORD \nTHE WORD\nof GOD\nthat cr...,"{'test': True, 'train': False}",False,"{'_urban_minorities_': False, 'aboriginals': F...","{'dataset': 'sbic', 'fold': 'test', 'grouping'...",-1
1,-5283522515298229097,When I make love with my girlfriend she says r...,"{'test': True, 'train': False}",True,"{'_urban_minorities_': False, 'aboriginals': F...","{'dataset': 'sbic', 'fold': 'test', 'grouping'...",-1
2,3025225533346006765,What did the Pulse Nightclub shooter say to th...,"{'test': True, 'train': False}",True,"{'_urban_minorities_': False, 'aboriginals': F...","{'dataset': 'sbic', 'fold': 'test', 'grouping'...",-1
3,-7862002707833869562,What's the difference between a pregnant woman...,"{'test': True, 'train': False}",True,"{'_urban_minorities_': False, 'aboriginals': F...","{'dataset': 'sbic', 'fold': 'test', 'grouping'...",-1
4,-7629227041142280643,"I'm fine with gays, and I'm fine with lesbians...","{'test': True, 'train': False}",True,"{'_urban_minorities_': False, 'aboriginals': F...","{'dataset': 'sbic', 'fold': 'test', 'grouping'...",-1


In [15]:
df = sampled_stream.to_polars()

In [16]:
from tqdm import auto as tqdm

In [17]:
from sklearn.model_selection import StratifiedShuffleSplit as ShuffleSplit

from transformers import AutoModelForSequenceClassification, AutoTokenizer

from tklearn import datasets
from tklearn.metrics import TextClassificationMetric
from tklearn.nn.trainer import Trainer
from tklearn.nn.evaluator import Evaluator
from tklearn.nn.callbacks import ProgbarLogger
from tklearn.config import config, config_scope

In [18]:
import pandas as pd
from tqdm import auto as tqdm
from sklearn.metrics import roc_auc_score


def evaluate_classes(target_group_test_df, y_score, y_true):
    results_table = []
    for target_group in tqdm.tqdm(target_group_test_df.columns):
        target_group_idx = target_group_test_df[target_group]
        target_group_y_score = y_score[target_group_idx]
        target_group_y_pred = target_group_y_score >= 0.5
        target_group_y_true = y_true[target_group_idx]
        if len(target_group_y_true) == 0:
            continue
        if len(target_group_y_true.unique()) != 2:
            continue
        auc = roc_auc_score(target_group_y_true, target_group_y_score)
        num_samples = target_group_y_true.sum()
        results_table.append([target_group, num_samples, auc])
    return pd.DataFrame(
        results_table, columns=['target_group', 'num_samples', 'auc']
    ).sort_values('num_samples', ascending=False)

In [19]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize(s):
    return pd.Series(tokenizer(s, padding="max_length", truncation=True))

In [20]:
import numpy as np

In [21]:
max_step = df.select('stream_order').max().collect().item()

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type='multi_label_classification',
    num_labels=1,
)

open('notebooks/logs/hate-speech-detection-lml-output-v1.log', 'w').close()

for step in tqdm.trange(1,  max_step + 1, desc='Streaming'):
    train_df = df.filter(pl.col('stream_order') == step).filter(pl.col('split').struct.field('train')).collect()
    train_df = train_df.to_pandas()
    train_df['labels'] = train_df['label'].astype(np.float32)
    train_idx, valid_idx = next(ShuffleSplit().split(train_df, train_df.labels))
    valid_df = train_df.iloc[valid_idx]
    valid_input_df = valid_df['text'].apply(tokenize)
    valid_input_df['labels'] = valid_df['labels'].apply(lambda x: [x])
    valid_input_df = valid_input_df.reset_index(drop=True)
    train_df = train_df.iloc[train_idx]
    train_input_df = train_df['text'].apply(tokenize)
    train_input_df['labels'] = train_df['labels'].apply(lambda x: [x])
    train_input_df = train_input_df.reset_index(drop=True)
    groups_df = pl.from_pandas(train_df).select(['target_group']).unnest('target_group').to_pandas()
    evaluator = Evaluator(valid_input_df, metric=TextClassificationMetric(num_labels=1), postprocessor='binary', groups=groups_df)
    trainer = Trainer(model, batch_size=8, callbacks=[ProgbarLogger()])
    del model
    trainer.fit(train_input_df, evaluator=evaluator)
    # Test
    test_df = df.filter(pl.col('stream_order') == step).filter(pl.col('split').struct.field('test')).collect().to_pandas()
    test_df['labels'] = test_df['label'].astype(np.float32)
    test_input_df = test_df['text'].apply(tokenize)
    test_input_df['labels'] = test_df['labels'].apply(lambda x: [x])
    test_input_df = test_input_df.reset_index(drop=True)
    groups_df = pl.from_pandas(test_df).select(['target_group']).unnest('target_group').to_pandas()
    evaluator = Evaluator(test_input_df, metric=TextClassificationMetric(num_labels=1), postprocessor='binary', groups=groups_df)
    eval_report = evaluator.evaluate(trainer)
    eval_report['step'] = step
    with open('notebooks/logs/hate-speech-detection-lml-output-v1.log', 'a') as out:
        out.write(json.dumps(eval_report) + '\n')
    model = trainer.model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Streaming:   0%|          | 0/50 [00:00<?, ?it/s]

Train:   0%|          | 0/240 [00:00<?, ?it/s]

Predict:   0%|          | 0/9 [00:00<?, ?it/s]

ValueError: Evaluation module cache file doesn't exist. Please make sure that you call `add` or `add_batch` at least once before calling `compute`.