# Dataset TweetEval
###### src="@inproceedings{barbieri2020tweeteval, title={{TweetEval: Unified Benchmark and Comparative Evaluation for Tweet Classification}}, author={Barbieri, Francesco and Camacho-Collados, Jose and Espinosa-Anke, Luis and Neves, Leonardo}, booktitle={Proceedings of Findings of EMNLP}, year={2020}}"

## Import Libraries

In [1]:
import pandas as pd
import html
import emoji
from collections import Counter
import warnings

In [2]:
# Do not truncate long string cells when DataFrames are displayed.
pd.options.display.max_colwidth = None
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter("ignore", FutureWarning)

## Utility Functions

In [3]:
def contains_emoji(text):
    """Tell whether ``text`` is a string containing at least one emoji.

    Args:
        text: Any value; values that are not ``str`` are never scanned.

    Returns:
        bool: ``False`` for non-string input, otherwise ``True`` exactly when
        ``emoji.emoji_list(text)`` reports one or more matches.
    """
    if not isinstance(text, str):
        return False
    return len(emoji.emoji_list(text)) > 0

def extract_emojis(text):
    """Collect every emoji found in ``text``, in the order ``emoji.emoji_list`` reports them.

    Args:
        text: The string to scan. Non-string values (for example ``None`` or
            the float NaN that pandas uses for empty CSV cells) and the empty
            string are accepted and yield no emojis, mirroring the type guard
            in ``contains_emoji``.

    Returns:
        list: The ``"emoji"`` entry of each match returned by
        ``emoji.emoji_list``; an empty list when there is nothing to scan.
    """
    # Guard first: emoji.emoji_list expects a string, so anything else is
    # treated as "no emojis" instead of raising.
    if not isinstance(text, str) or not text:
        return []
    return [match["emoji"] for match in emoji.emoji_list(text)]

def format_emojis(emojis):
    """Turn a list of emojis into one space-separated string.

    Args:
        emojis: A list of strings, or any other value.

    Returns:
        The items joined with single spaces when ``emojis`` is a ``list``;
        any other value is handed back unchanged.
    """
    if not isinstance(emojis, list):
        return emojis
    return " ".join(emojis)

def get_most_least_common_emojis(df, emoji_column="emoji", top_n=1):
    """Find the most and least frequent emojis in a column of space-separated emojis.

    Args:
        df: Table-like object where ``df[emoji_column]`` iterates over cells;
            each string cell holds emojis separated by whitespace.
        emoji_column: Name of the column to read.
        top_n: How many entries to return at each end of the ranking.
            Zero or a negative number yields two empty lists.

    Returns:
        tuple: ``(most_common, least_common)``, each a list of
        ``(emoji, count)`` pairs. Both lists are ordered by descending count,
        so the rarest emoji is the last item of ``least_common``.
    """
    # Cells that are not strings (None / NaN from empty CSV fields) carry no
    # emojis and are skipped instead of failing on ``.split()``.
    counts = Counter(
        token
        for cell in df[emoji_column]
        if isinstance(cell, str)
        for token in cell.split()
    )

    # ``ranked[-0:]`` would be the whole ranking, so a non-positive top_n must
    # be handled explicitly.
    if top_n <= 0:
        return [], []

    # Rank once and slice both ends from the same ordering.
    ranked = counts.most_common()
    return ranked[:top_n], ranked[-top_n:]

def get_most_common_emojis_by_label(df, emoji_column="emoji", label_column="label", top_n=10):
    """Rank the most frequent emojis separately for every label value.

    Args:
        df: DataFrame with a label column and a column of whitespace-separated
            emoji strings.
        emoji_column: Name of the column holding the emoji strings.
        label_column: Name of the column holding the labels.
        top_n: Maximum number of ``(emoji, count)`` pairs kept per label.

    Returns:
        dict: Maps each distinct label (in the order ``unique()`` yields them)
        to that label's ``top_n`` most common ``(emoji, count)`` pairs.
    """
    def _ranking_for(label_value):
        # Tally the emojis of the rows carrying this label only.
        cells = df.loc[df[label_column] == label_value, emoji_column]
        tally = Counter()
        for cell in cells:
            tally.update(cell.split())
        return tally.most_common(top_n)

    return {
        label_value: _ranking_for(label_value)
        for label_value in df[label_column].unique()
    }

def merge_text_label(text_file, label_file, output_csv):
    """Pair the lines of a text file with the lines of a label file and save them as CSV.

    Both files are read as UTF-8 and every line is stripped of surrounding
    whitespace. Line *i* of ``text_file`` becomes the ``text`` value and line
    *i* of ``label_file`` the ``label`` value of row *i*.

    Args:
        text_file: Path of the file with one text per line.
        label_file: Path of the file with one label per line.
        output_csv: Path of the CSV to write (columns ``text`` and ``label``,
            no index column).

    Returns:
        None. When the two files have different line counts, a mismatch
        message is printed and no CSV is written.
    """
    def _stripped_lines(path):
        # Iterating the handle yields the same lines readlines() would.
        with open(path, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle]

    text_rows = _stripped_lines(text_file)
    label_rows = _stripped_lines(label_file)

    if len(text_rows) == len(label_rows):
        merged = pd.DataFrame({"text": text_rows, "label": label_rows})
        merged.to_csv(output_csv, index=False)
        print(f"Saved to {output_csv}")
    else:
        print(f"Mismatch: {text_file} vs {label_file}")

## Main Processing Function

In [4]:
def process_tweeteval_emoji(df, save_path="dataset/TwEv/output.csv"):
    """Keep the rows whose text has an emoji, add an ``emoji`` column, save, and print statistics.

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column (the label
            column is read by the per-label ranking).
        save_path: Destination of the CSV holding the filtered rows.

    Returns:
        pandas.DataFrame: A copy of the emoji-bearing rows of ``df`` with an
        extra ``emoji`` column of space-separated emojis. The input frame is
        not modified.
    """
    # Work on a copy of the matching rows so the caller's frame stays intact.
    has_emoji = df["text"].apply(contains_emoji)
    emoji_rows = df[has_emoji].copy()

    emoji_lists = emoji_rows["text"].astype(str).apply(extract_emojis)
    emoji_rows["emoji"] = emoji_lists.apply(format_emojis)

    # Persist the filtered rows before printing the analysis.
    emoji_rows.to_csv(save_path, index=False)

    # Overall extremes of the emoji frequency ranking.
    most_common, least_common = get_most_least_common_emojis(emoji_rows)
    print("Most common emoji:", most_common)
    print("Least common emoji:", least_common)

    # Per-label rankings.
    per_label = get_most_common_emojis_by_label(emoji_rows)
    for label in per_label:
        items = per_label[label]
        print(f"\nTop emojis for label {label}: {items}")

    return emoji_rows

## Train and Test Processing

In [5]:
# === Step 1: Merge TXT to CSV (only run once if needed) ===
# Each entry is (text file, label file, CSV to create) for one dataset split.
split_files = [
    ("dataset/TwEv/train_text.txt", "dataset/TwEv/train_labels.txt", "dataset/TwEv/train.csv"),
    ("dataset/TwEv/val_text.txt", "dataset/TwEv/val_labels.txt", "dataset/TwEv/val.csv"),
    ("dataset/TwEv/test_text.txt", "dataset/TwEv/test_labels.txt", "dataset/TwEv/test.csv"),
]
for text_path, label_path, csv_path in split_files:
    merge_text_label(text_path, label_path, csv_path)

Saved to dataset/TwEv/train.csv
Saved to dataset/TwEv/val.csv
Saved to dataset/TwEv/test.csv


In [6]:
# Load the merged CSV of each split (read in the order train, test, val).
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/TwEv/train.csv",
        "dataset/TwEv/test.csv",
        "dataset/TwEv/val.csv",
    )
)

In [7]:
# Filter the training split to emoji-bearing tweets, save them, and print the emoji statistics.
train_emoji_df = process_tweeteval_emoji(
    train_df,
    save_path="dataset/TwEv/train_emoji.csv",
)

Most common emoji: [('😂', 241)]
Least common emoji: [('💞', 1)]

Top emojis for label 1: [('😂', 127), ('😭', 30), ('🔨', 22), ('💦', 20), ('💯', 18), ('😘', 15), ('👏', 11), ('🤬', 9), ('🙄', 8), ('🤣', 8)]

Top emojis for label 0: [('😂', 114), ('😭', 32), ('😘', 30), ('🙃', 25), ('😊', 19), ('©', 19), ('🤔', 16), ('🙄', 11), ('👏🏾', 10), ('😎', 10)]


In [8]:
# Filter the validation split to emoji-bearing tweets, save them, and print the emoji statistics.
val_emoji_df = process_tweeteval_emoji(
    val_df,
    save_path="dataset/TwEv/val_emoji.csv",
)

Most common emoji: [('😊', 8)]
Least common emoji: [('😭', 1)]

Top emojis for label 0: [('😊', 3), ('🤣', 2), ('®', 2), ('😒', 2), ('😳', 1), ('👇', 1), ('😂', 1), ('😠', 1), ('❤️', 1), ('📚', 1)]

Top emojis for label 1: [('😊', 5), ('🇺🇸', 5), ('™', 3), ('➡️', 1), ('‼️', 1)]


In [9]:
# Filter the test split to emoji-bearing tweets, save them, and print the emoji statistics.
test_emoji_df = process_tweeteval_emoji(
    test_df,
    save_path="dataset/TwEv/test_emoji.csv",
)

Most common emoji: [('😂', 262)]
Least common emoji: [('💩', 1)]

Top emojis for label 1: [('😂', 136), ('😭', 19), ('💯', 15), ('🙄', 15), ('😒', 12), ('🖕', 12), ('😘', 10), ('🤔', 8), ('😇', 7), ('🙃', 6)]

Top emojis for label 0: [('😂', 126), ('😍', 16), ('😭', 14), ('❤️', 11), ('🙄', 8), ('💯', 8), ('💖', 8), ('😘', 7), ('👀', 7), ('💀', 6)]
