In [1]:
import json

with open("/mnt/block-volume/root/reddit_comments_subreddit_indonesia_RC_2020-01-2023-09.json") as f:
    raw_data = json.load(f)

In [2]:
len(raw_data)

4735944

In [3]:
columns = ["author", "created_utc", "score", "permalink", "subreddit"]

In [4]:
sentences_data = []

for datum in raw_data:
    # split multi-sentence
    body = datum["body"]
    sentences = body.split("\n")
    for sentence in sentences:
        obj = {k: datum[k] for k in columns}
        text = sentence.strip()
        if len(text) > 0:
            obj['body'] = text
            sentences_data.append(obj)

In [5]:
from ftlangdetect import detect
from tqdm.auto import tqdm

# indonesian, javanese, minangkabau, malaysian, sundanese
valid_langs = ["id", "jv", "min", "ms", "su"]
indonesia_data = []

for datum in tqdm(sentences_data):
    lang = detect(datum["body"])['lang']
    if lang in valid_langs:
        datum['lang_fastText'] = lang
        indonesia_data.append(datum)

  from .autonotebook import tqdm as notebook_tqdm
  0%|          | 0/7066715 [00:00<?, ?it/s]

100%|██████████| 7066715/7066715 [02:07<00:00, 55554.78it/s]


In [6]:
len(indonesia_data)

3789839

In [7]:
import re

def has_sarcasm_tag(text: str) -> bool:
    # ends with either one of these sarcasm tags
    pattern = r'(?<!\S)(?:/s|//s|/sarcasm|//sarcasm|\\s|\\\\s|\\sarcasm|\\\\sarcasm)$'
    matches = re.findall(pattern, text)
    return matches if matches else None

In [8]:
cleaned_data = []

for datum in indonesia_data:
    body = datum["body"].replace('\x00', '')
    datum["body"] = body
    tags = has_sarcasm_tag(body)
    if tags:
        # make `text` column clean; remove sarcasm tags
        for tag in tags:
            body = body.replace(tag, "")

    datum["label"] = 1 if tags else 0
    datum["text"] = body.strip()
    cleaned_data.append(datum)

In [9]:
from lsh import minhash, cache

# use minHash LSH algorithm to find near duplicates
hasher = minhash.MinHasher(seeds=100, char_ngram=4, hashbytes=8, random_state=42)
lsh_cache = cache.Cache(num_bands=20, hasher=hasher)
neardup_ids = []

# hash every text
texts = [d['text'] for d in cleaned_data]
for idx, text in enumerate(tqdm(texts)):
    lsh_cache.add_fingerprint(hasher.fingerprint(text), idx)

# find bins of duplicates
for cache_bin in lsh_cache.bins:
    for bucket_id in cache_bin:
        if len(cache_bin[bucket_id]) > 1:
            # add ids of neardup texts
            neardup_ids.append(cache_bin[bucket_id])

100%|██████████| 3789839/3789839 [10:58<00:00, 5755.09it/s] 


In [10]:
sorted_sets = sorted(neardup_ids, key=lambda x: min(x))

merged_sets = []
current_merged_set = sorted_sets[0]

for s in sorted_sets[1:]:
    # if has overlapping element
    if any(x in current_merged_set for x in s):
        current_merged_set.update(s)  # merge overlapping sets
    else:
        merged_sets.append(current_merged_set)
        current_merged_set = s  # start a new merged set

# add last set
merged_sets.append(current_merged_set)

In [11]:
drop_ids = set()

# for each "cluster", only keep first and drop the rest
for cluster in merged_sets:
    drop_ids |= set(list(cluster)[1:])

In [12]:
deduplicated_data = [d for i, d in enumerate(cleaned_data) if i not in drop_ids]

In [13]:
from collections import Counter

Counter([d['label'] for d in deduplicated_data])

Counter({0: 2616335, 1: 3529})

In [14]:
import re

def mask_reddit_comments(comment):
    # Mask usernames with <username>
    comment = re.sub(r'/u/[\w]+', '<username>', comment)
    # Mask hashtags with <hashtag>
    comment = re.sub(r'#[\w]+', '<hashtag>', comment)
    # Mask email addresses with <email>
    comment = re.sub(r'\b[\w.-]+?@\w+?\.\w{2,4}\b', '<email>', comment)
    # Mask links/URLs with <link> (handling various URL formats)
    comment = re.sub(r'https?://\S+|www\.\S+', '<link>', comment)
    return comment

In [29]:
for datum in deduplicated_data:
    datum['text'] = mask_reddit_comments(datum['text'])

In [30]:
with open("/mnt/block-volume/root/reddit_indonesia_sarcastic/raw_data/reddit_indonesia_sarcastic.json", "w") as f:
    json.dump(deduplicated_data, f)

In [31]:
sarcastic = [d for d in deduplicated_data if d['label'] == 1]
non_sarcastic = [d for d in deduplicated_data if d['label'] == 0]

In [32]:
import random

random.seed(41)
sampled_non_sarcastic = random.sample(non_sarcastic, k=len(sarcastic) * 3)

In [35]:
balanced_data = sampled_non_sarcastic + sarcastic

In [41]:
from sklearn.model_selection import train_test_split

train_data, test_val_data = train_test_split(balanced_data, train_size=0.7, random_state=41, stratify=[d['label'] for d in balanced_data])
val_data, test_data = train_test_split(test_val_data, test_size=(2/3), random_state=41, stratify=[d['label'] for d in test_val_data])

In [42]:
len(train_data), len(test_data), len(val_data)

(9881, 2824, 1411)

In [44]:
with open("/mnt/block-volume/root/reddit_indonesia_sarcastic/data/train.json", "w") as f:
    json.dump(train_data, f)

with open("/mnt/block-volume/root/reddit_indonesia_sarcastic/data/test.json", "w") as f:
    json.dump(test_data, f)

with open("/mnt/block-volume/root/reddit_indonesia_sarcastic/data/validation.json", "w") as f:
    json.dump(val_data, f)