In [1]:
import pandas as pd

df = pd.read_csv("../data/twitter_indonesia_sarcastic/raw_data/khotijah.csv")
df.columns = ["tweet", "label"]
# un-reverse
df["tweet"] = df["tweet"].apply(lambda x: " ".join(x.split()[::-1]))
df

Unnamed: 0,tweet,label
0,Bob Sadino bilang orang goblog cenderung lebih...,0
1,Sekolahan itu kek mantan makin oke semenjak di...,0
2,Bagus jg buat naikin standard amplop jelang 17...,0
3,Ada apa dg demokrasi Indonesia !,0
4,Bapak Prabowo ibu Jokowi . Santai .. :) #debat...,0
...,...,...
17713,AKU BANGGA KARNA GUBERNUR KU BELIAUUUU. tetap ...,1
17714,Terima Kasih Pak @aniesbaswedan #4niesTenggela...,1
17715,"Keren ne, Gub Formula E-mpang Wajah tersenyum ...",1
17716,Banjir jakarta bukan salah @aniesbaswedan tapi...,1


In [2]:
from collections import Counter

Counter(df["label"])

Counter({0: 13368, 1: 4350})

In [3]:
from lsh import minhash, cache

# use minHash LSH algorithm to find near duplicates
hasher = minhash.MinHasher(seeds=100, char_ngram=4, hashbytes=8, random_state=42)
lsh_cache = cache.Cache(num_bands=20, hasher=hasher)
neardup_ids = []

# hash every tweet
for idx, text in enumerate(df["tweet"]):
    lsh_cache.add_fingerprint(hasher.fingerprint(text), idx)

# find bins of duplicates
for cache_bin in lsh_cache.bins:
    for bucket_id in cache_bin:
        if len(cache_bin[bucket_id]) > 1:
            # add ids of neardup texts
            neardup_ids.append(cache_bin[bucket_id])

In [4]:
sorted_sets = sorted(neardup_ids, key=lambda x: min(x))

merged_sets = []
current_merged_set = sorted_sets[0]

for s in sorted_sets[1:]:
    # if has overlapping element
    if any(x in current_merged_set for x in s):
        current_merged_set.update(s)  # merge overlapping sets
    else:
        merged_sets.append(current_merged_set)
        current_merged_set = s  # start a new merged set

# add last set
merged_sets.append(current_merged_set)

In [5]:
drop_ids = set()

# for each "cluster", only keep first and drop the rest
for cluster in merged_sets:
    drop_ids |= set(list(cluster)[1:])

In [6]:
df = df[df.apply(lambda row: row.name not in drop_ids, axis=1)]

In [7]:
Counter(df["label"])

Counter({0: 12190, 1: 671})

In [8]:
import re

def mask_tweets(tweet):
    # Mask usernames with <username>
    tweet = re.sub(r'@[\w]+', '<username>', tweet)
    # Mask hashtags with <hashtag>
    tweet = re.sub(r'#[\w]+', '<hashtag>', tweet)
    # Mask email addresses with <email>
    tweet = re.sub(r'\b[\w.-]+?@\w+?\.\w{2,4}\b', '<email>', tweet)
    # Mask links/URLs with <link> (handling various URL formats)
    tweet = re.sub(r'https?://\S+|www\.\S+', '<link>', tweet)
    return tweet

In [9]:
df["tweet"] = df["tweet"].apply(mask_tweets)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tweet"] = df["tweet"].apply(mask_tweets)


In [23]:
df.to_csv("../data/twitter_indonesia_sarcastic/raw_data/khotijah_cleaned.csv", index=False)

In [12]:
sarcastic = df[df['label'] == 1]
non_sarcastic = df[df['label'] == 0]

In [14]:
sampled_non_sarcastic = non_sarcastic.sample(n=len(sarcastic) * 3, random_state=41)

In [17]:
balanced_df = pd.concat([sampled_non_sarcastic, sarcastic])

In [18]:
from sklearn.model_selection import train_test_split

train_df, test_val_df = train_test_split(balanced_df, train_size=0.7, random_state=41, stratify=balanced_df['label'])
val_df, test_df = train_test_split(test_val_df, test_size=(2/3), random_state=41, stratify=test_val_df['label'])

In [20]:
len(train_df), len(test_df), len(val_df)

(1878, 538, 268)

In [22]:
train_df.to_csv("../data/twitter_indonesia_sarcastic/data/train.csv", index=False)
test_df.to_csv("./data./twitter_indonesia_sarcastic/data/test.csv", index=False)
val_df.to_csv("./data./twitter_indonesia_sarcastic/data/validation.csv", index=False)