# Dataset OLID
###### Source: [OLID dataset viewer on Hugging Face](https://huggingface.co/datasets/christophsonntag/OLID/embed/viewer/1.0.0/train)

## Import Libraries

In [1]:
import pandas as pd
import html
import emoji
from collections import Counter
import warnings

In [2]:
# Show full cell contents (no column-width truncation) when displaying frames.
pd.options.display.max_colwidth = None
# Ignore FutureWarning messages emitted after this point.
warnings.simplefilter("ignore", FutureWarning)

## Utility Functions

In [3]:
def contains_emoji(text):
    """Report whether ``text`` is a string holding at least one emoji.

    Non-string values (such as ``None`` or a float) yield ``False`` without
    being handed to the emoji library.
    """
    if not isinstance(text, str):
        return False
    found = emoji.emoji_list(text)
    return len(found) > 0

def extract_emojis(text):
    """Return the emojis found in ``text`` as a list of strings.

    The emojis are listed in the order ``emoji.emoji_list`` reports them.
    Non-string input (``None``, NaN, numbers) produces an empty list rather
    than an exception, mirroring the type guard used by ``contains_emoji``.
    """
    if not isinstance(text, str):
        return []
    return [match["emoji"] for match in emoji.emoji_list(text)]

def format_emojis(emojis):
    """Join a list of emojis into one space-separated string.

    Anything that is not a ``list`` is handed back unchanged.
    """
    if not isinstance(emojis, list):
        return emojis
    return " ".join(emojis)

def get_most_least_common_emojis(df, emoji_column="emoji", label_column="label", top_n=1):
    """Return the ``top_n`` most and least frequent emojis in a column.

    Each cell of ``df[emoji_column]`` is expected to be a whitespace-separated
    string of emojis; cells that are not strings (e.g. missing values) are
    skipped instead of raising.

    Args:
        df: Mapping-like object (typically a DataFrame) indexable by column.
        emoji_column: Name of the column holding the emoji strings.
        label_column: Unused; kept so existing callers keep working.
        top_n: How many entries to return at each end of the ranking.

    Returns:
        A ``(most_common, least_common)`` tuple of ``(emoji, count)`` lists.
        ``least_common`` is the tail of the descending-frequency ranking, so
        it is itself ordered from more to less frequent. Both lists are empty
        when ``top_n`` is not positive or no emoji was counted.
    """
    counts = Counter(
        symbol
        for cell in df[emoji_column]
        if isinstance(cell, str)
        for symbol in cell.split()
    )
    # A bare ``ranked[-top_n:]`` would return the whole ranking for
    # ``top_n == 0`` (``[-0:]`` is ``[0:]``), so handle that case explicitly.
    if top_n <= 0:
        return [], []
    ranked = counts.most_common()
    return ranked[:top_n], ranked[-top_n:]

def get_most_common_emojis_by_label(df, emoji_column="emoji", label_column="label", top_n=10):
    """Rank the most frequent emojis separately for every label value.

    Args:
        df: DataFrame containing ``emoji_column`` and ``label_column``.
        emoji_column: Column of whitespace-separated emoji strings.
        label_column: Column whose distinct values define the groups.
        top_n: Maximum number of ``(emoji, count)`` pairs kept per label.

    Returns:
        Dict mapping each distinct label (in order of first appearance) to
        its list of ``(emoji, count)`` pairs, most frequent first.
    """
    top_by_label = {}
    labels = df[label_column]
    for value in labels.unique():
        tally = Counter()
        # Count the emojis of every row that carries this label.
        for cell in df.loc[labels == value, emoji_column]:
            tally.update(cell.split())
        top_by_label[value] = tally.most_common(top_n)
    return top_by_label

## Main Processing Function

In [None]:
def process_olid_data(file_path, save_path="dataset-english/OLID/output.csv"):
    """Build an emoji-only, binary-labelled subset of an OLID CSV split.

    Steps performed:
      1. Read the CSV (first column becomes the index) and HTML-unescape the
         ``tweet`` column.
      2. Keep rows whose ``subtask_b`` is ``'TIN'`` followed by rows whose
         ``subtask_a`` is ``'NOT'``.
      3. Drop ``cleaned_tweet``, ``subtask_b`` and ``subtask_c``; keep only
         tweets for which ``contains_emoji`` is true.
      4. Add ``label`` (``'OFF'`` -> 1, ``'NOT'`` -> 0) in place of
         ``subtask_a`` and an ``emoji`` column holding the tweet's emojis as
         a space-separated string.
      5. Write the result to ``save_path`` (without the index) and print the
         overall and per-label emoji frequency summaries.

    Args:
        file_path: Path of the input CSV.
        save_path: Path the processed CSV is written to.

    Returns:
        The processed DataFrame, indexed ``0..n-1``.
    """
    df = pd.read_csv(file_path, index_col=0)
    # html.unescape only accepts strings; leave missing/non-string tweets as
    # they are (the emoji filter below discards them anyway).
    df["tweet"] = df["tweet"].apply(
        lambda tweet: html.unescape(tweet) if isinstance(tweet, str) else tweet
    )

    # Select relevant rows: 'TIN' rows first, then 'NOT' rows
    df_tin = df[df["subtask_b"] == 'TIN']
    df_not = df[df["subtask_a"] == 'NOT']

    # Merge first so the column drop and the emoji filter are written once;
    # row order and the final 0..n-1 index match filtering each part
    # separately before concatenating.
    df = pd.concat([df_tin, df_not], ignore_index=True)
    df = df.drop(columns=['cleaned_tweet', 'subtask_b', 'subtask_c'])
    df = df[df["tweet"].apply(contains_emoji)].reset_index(drop=True)

    # Label with an explicit mapping. Series.replace would depend on silent
    # object->int downcasting, which pandas has deprecated.
    df["label"] = df["subtask_a"].map({'OFF': 1, 'NOT': 0})
    df = df.drop(columns=["subtask_a"])

    # Emoji processing
    df["emoji"] = df["tweet"].apply(extract_emojis).apply(format_emojis)

    # Save processed data
    df.to_csv(save_path, index=False)

    # Analysis output
    most_common, least_common = get_most_least_common_emojis(df, "emoji", top_n=1)
    print(f"Most common emoji: {most_common}")
    print(f"Least common emoji: {least_common}")

    emoji_by_label = get_most_common_emojis_by_label(df, "emoji", "label", top_n=10)
    for label, items in emoji_by_label.items():
        print(f"\nTop emojis for label {label}: {items}")

    return df

## Train and Test Processing

In [None]:
# Process the training split and write its emoji-only subset to CSV.
train_df = process_olid_data(
    file_path="dataset-english/OLID/train.csv",
    save_path="dataset-english/OLID/train_emoji.csv",
)

Most common emoji: [('😂', 516)]
Least common emoji: [('🚨', 1)]

Top emojis for label 1: [('😂', 139), ('🇺🇸', 76), ('🤣', 64), ('😭', 31), ('😡', 16), ('🤔', 15), ('👏', 15), ('😆', 14), ('🙄', 13), ('👇', 13)]

Top emojis for label 0: [('😂', 377), ('🇺🇸', 291), ('🤣', 137), ('❤️', 123), ('😍', 73), ('😭', 68), ('👍', 65), ('🤔', 62), ('😘', 44), ('🙄', 40)]


In [None]:
# Process the test split and write its emoji-only subset to CSV.
test_df = process_olid_data(
    file_path="dataset-english/OLID/test.csv",
    save_path="dataset-english/OLID/test_emoji.csv",
)

Most common emoji: [('😂', 21)]
Least common emoji: [('🙌', 1)]

Top emojis for label 1: [('🇺🇸', 7), ('😂', 5), ('🖕', 5), ('🌻', 3), ('❤️', 2), ('🤔', 2), ('😭', 2), ('👇', 2), ('🔥', 1), ('🤷🏽\u200d♂️', 1)]

Top emojis for label 0: [('😂', 16), ('❤️', 14), ('💜', 13), ('🧡', 12), ('🇺🇸', 10), ('👏🏼', 8), ('👏', 7), ('💚', 6), ('🐅', 6), ('💛', 5)]
