In [3]:
import json

with open("/root/reddit_comments_subreddit_indonesia_RC_2023-01-09.json") as f:
    raw_data = json.load(f)

In [4]:
len(raw_data)

929182

In [13]:
columns = ["author", "created_utc", "score", "permalink", "subreddit"]

In [14]:
sentences_data = []

for datum in raw_data:
    # split multi-sentence
    body = datum["body"]
    sentences = body.split("\n")
    for sentence in sentences:
        obj = {k: datum[k] for k in columns}
        text = sentence.strip()
        if len(text) > 0:
            obj['body'] = text
            sentences_data.append(obj)

In [23]:
from ftlangdetect import detect
from tqdm.auto import tqdm

valid_langs = ["id", "jv", "min", "ms", "su"]
indonesia_data = []

for datum in tqdm(sentences_data):
    lang = detect(datum["body"])['lang']
    if lang in valid_langs:
        datum['lang_fastText'] = lang
        indonesia_data.append(datum)

100%|██████████| 1416946/1416946 [00:25<00:00, 55103.33it/s]


In [24]:
len(indonesia_data)

784942

In [34]:
import re

def has_sarcasm_tag(text: str) -> bool:
    # ends with either one of these sarcasm tags
    pattern = r'(?<!\S)(?:/s|//s|/sarcasm|//sarcasm|\\s|\\\\s|\\sarcasm|\\\\sarcasm)$'
    matches = re.findall(pattern, text)
    return matches if matches else None

In [49]:
cleaned_data = []

for datum in indonesia_data:
    body = datum["body"]
    tags = has_sarcasm_tag(body)
    if tags:
        # make `text` column clean; remove sarcasm tags
        for tag in tags:
            body = body.replace(tag, "")

    datum["label"] = 1 if tags else 0
    datum["text"] = body.strip()
    cleaned_data.append(datum)

In [60]:
import pandas as pd

df = pd.DataFrame(cleaned_data)
df['created_utc'] = df['created_utc'].astype('int')

In [63]:
from collections import Counter

Counter(df['label'])

Counter({0: 783944, 1: 998})

In [64]:
from lsh import minhash, cache

# use minHash LSH algorithm to find near duplicates
hasher = minhash.MinHasher(seeds=100, char_ngram=4, hashbytes=8, random_state=42)
lsh_cache = cache.Cache(num_bands=20, hasher=hasher)
neardup_ids = []

# hash every text
for idx, text in enumerate(tqdm(df['text'])):
    lsh_cache.add_fingerprint(hasher.fingerprint(text), idx)

# find bins of duplicates
for cache_bin in lsh_cache.bins:
    for bucket_id in cache_bin:
        if len(cache_bin[bucket_id]) > 1:
            # add ids of neardup texts
            neardup_ids.append(cache_bin[bucket_id])

100%|██████████| 784942/784942 [01:56<00:00, 6746.77it/s]


In [65]:
sorted_sets = sorted(neardup_ids, key=lambda x: min(x))

merged_sets = []
current_merged_set = sorted_sets[0]

for s in sorted_sets[1:]:
    # if has overlapping element
    if any(x in current_merged_set for x in s):
        current_merged_set.update(s)  # merge overlapping sets
    else:
        merged_sets.append(current_merged_set)
        current_merged_set = s  # start a new merged set

# add last set
merged_sets.append(current_merged_set)

In [66]:
drop_ids = set()

# for each "cluster", only keep first and drop the rest
for cluster in merged_sets:
    drop_ids |= set(list(cluster)[1:])

In [67]:
df = df[df.apply(lambda row: row.name not in drop_ids, axis=1)]

In [68]:
Counter(df['label'])

Counter({0: 654056, 1: 845})

In [71]:
import re

def mask_reddit_comments(comment):
    # Mask usernames with <username>
    comment = re.sub(r'/u/[\w]+', '<username>', comment)
    # Mask hashtags with <hashtag>
    comment = re.sub(r'#[\w]+', '<hashtag>', comment)
    # Mask email addresses with <email>
    comment = re.sub(r'\b[\w.-]+?@\w+?\.\w{2,4}\b', '<email>', comment)
    # Mask links/URLs with <link> (handling various URL formats)
    comment = re.sub(r'https?://\S+|www\.\S+', '<link>', comment)
    return comment

In [72]:
df["text"] = df["text"].apply(mask_reddit_comments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].apply(mask_reddit_comments)


In [75]:
df.to_csv("/root/reddit_indonesia_sarcastic.csv", index=False)

In [95]:
# tmp = df[df['label'] == 1]
# tmp[['text', 'label']].head(n=50).to_html('/root/temp.html', index=False)