In [1]:
import pandas as pd
from transformers import BertTokenizer#, BertForSequenceClassification, AdamW
#from torch.utils.data import DataLoader, Dataset
#import torch
from sklearn.model_selection import train_test_split

In [3]:
# Load only the columns used downstream: the reply, the comment it answers,
# and the sarcasm label.
df = pd.read_csv('./Sarcasm_on_Reddit/train-balanced-sarcasm.csv')[['comment', 'parent_comment', 'label']]

# Data preprocessing to avoid further problems.
# Missing values are dropped BEFORE the string cast: astype(str) turns NaN into
# the literal text 'nan', which dropna() can no longer see, so doing the cast
# first would silently keep empty comments in the data set.
df = (
    df.dropna()
    .astype({'comment': str, 'parent_comment': str, 'label': int})
    # Keep only the two valid classes (0 and 1).
    .loc[lambda frame: frame['label'].isin([0, 1])]
    # Parent comment and reply are joined into one string around a ' [SEP] ' marker.
    .assign(combined=lambda frame: frame['parent_comment'] + " [SEP] " + frame['comment'])
)

# Number of rows kept per class for the balanced subset.
SAMPLES_PER_CLASS = 10000

# Filter the DataFrame for label = 0 and take the first SAMPLES_PER_CLASS entries
df_label_0 = df[df['label'] == 0].head(SAMPLES_PER_CLASS)

# Filter the DataFrame for label = 1 and take the first SAMPLES_PER_CLASS entries
df_label_1 = df[df['label'] == 1].head(SAMPLES_PER_CLASS)

# Concatenate the two DataFrames, then shuffle with a fixed seed so the row
# order is reproducible across runs.
df = pd.concat([df_label_0, df_label_1], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
# Tokenization
# The pretrained tokenizer is looked up by checkpoint name; the analysis cells
# below call tokenizer.tokenize() on every text.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

100%|██████████| 231508/231508 [00:00<00:00, 787143.68B/s]


In [6]:
# Analyze the tokenized text to find [UNK] tokens
def analyze_tokenized_texts(texts, tok=None):
    """Count '[UNK]' tokens and total tokens over a collection of texts.

    Args:
        texts: Iterable of strings; each one is tokenized independently.
        tok: Object with a ``tokenize(text)`` method returning a list of token
            strings. When omitted, the notebook-level ``tokenizer`` is used,
            so existing single-argument calls behave as before.

    Returns:
        A ``(unk_count, total_count)`` tuple: the number of tokens equal to
        '[UNK]' and the number of tokens overall, summed across all texts.
        Both are 0 when ``texts`` is empty.
    """
    if tok is None:
        # Fall back to the tokenizer created earlier in the notebook.
        tok = tokenizer
    unk_count = 0
    total_count = 0
    for text in texts:
        tokens = tok.tokenize(text)
        unk_count += tokens.count('[UNK]')
        total_count += len(tokens)
    return unk_count, total_count


In [17]:
# Token-level comparison: split the combined texts by label and count how many
# of the produced tokens are '[UNK]' in each group.
sarcastic_texts = df.loc[df['label'] == 1, 'combined']
non_sarcastic_texts = df.loc[df['label'] == 0, 'combined']

sarcastic_counts = analyze_tokenized_texts(sarcastic_texts)
non_sarcastic_counts = analyze_tokenized_texts(non_sarcastic_texts)
sarcastic_unk_count, sarcastic_total_count = sarcastic_counts
non_sarcastic_unk_count, non_sarcastic_total_count = non_sarcastic_counts

# Share of [UNK] tokens among all tokens, per group.
sarcastic_unk_proportion = sarcastic_unk_count / sarcastic_total_count
non_sarcastic_unk_proportion = non_sarcastic_unk_count / non_sarcastic_total_count

print(f"Sarcastic UNK proportion: {sarcastic_unk_proportion}")
print(f"Non-sarcastic UNK proportion: {non_sarcastic_unk_proportion}")

Sarcastic UNK proportion: 8.887802601904211e-06
Non-sarcastic UNK proportion: 1.2791049676173258e-05


In [15]:
def unk_comment_proportion(texts, tok=None):
    """Return the fraction of texts whose tokenization contains '[UNK]'.

    Args:
        texts: Iterable of strings; it is consumed once, so generators work
            as well as lists or Series.
        tok: Object with a ``tokenize(text)`` method returning a sequence of
            token strings. When omitted, the notebook-level ``tokenizer`` is
            used, so existing single-argument calls behave as before.

    Returns:
        A float in [0, 1]. Returns 0.0 for an empty input instead of raising
        ZeroDivisionError.
    """
    if tok is None:
        # Fall back to the tokenizer created earlier in the notebook.
        tok = tokenizer
    unk_comment_count = 0
    total_comments = 0
    # Count while iterating rather than calling len(), which unsized
    # iterables do not support.
    for text in texts:
        total_comments += 1
        if '[UNK]' in tok.tokenize(text):
            unk_comment_count += 1
    if total_comments == 0:
        return 0.0
    return unk_comment_count / total_comments

# Comment-level comparison: for each label, the share of combined texts that
# contain at least one '[UNK]' token.
sarcastic_texts = df.loc[df['label'] == 1, 'combined']
non_sarcastic_texts = df.loc[df['label'] == 0, 'combined']

sarcastic_unk_comment_proportion = unk_comment_proportion(sarcastic_texts)
non_sarcastic_unk_comment_proportion = unk_comment_proportion(non_sarcastic_texts)

# Both lines are emitted by one print call, separated by a newline.
print(
    f"Sarcastic UNK comment proportion: {sarcastic_unk_comment_proportion:.2%}",
    f"Non-sarcastic UNK comment proportion: {non_sarcastic_unk_comment_proportion:.2%}",
    sep="\n",
)

Sarcastic UNK comment proportion: 0.04%
Non-sarcastic UNK comment proportion: 0.05%


## 