# Dataset Sushil
###### src="@INPROCEEDINGS{10455077, author={Dalavi, Sushil and Nivelkar, Tanvesh and Patil, Sarvesh and Sawant, Aadesh and Vanwari, Pankaj}, booktitle={2023 6th International Conference on Advances in Science and Technology (ICAST)}, title={Enhancing Hate Speech Detection through Emoji-based Classification using Bi-LSTM and GloVe Embeddings}, year={2023}, volume={}, number={}, pages={506-511}, keywords={Deep learning;Video on demand;Social networking (online);Hate speech;Web sites;Task analysis;Emojis;Emoji-based hate speech model;TfidfVectorizer;GloVe;Bi-LSTM;networking;abusive}, doi={10.1109/ICAST59062.2023.10455077}}"

## Import Libraries

In [None]:
import pandas as pd
import html
import emoji
from collections import Counter
import warnings

In [2]:
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)
# Never truncate cell text when a DataFrame is displayed.
pd.options.display.max_colwidth = None

## Utility Functions

In [3]:
def contains_emoji(text):
    """Tell whether ``text`` is a string holding at least one emoji.

    Anything that is not a ``str`` is reported as ``False`` without being
    handed to the ``emoji`` library.
    """
    if not isinstance(text, str):
        return False
    found = emoji.emoji_list(text)
    return len(found) > 0

def extract_emojis(text):
    """Return the emoji found in ``text`` as a list of strings.

    The emoji are listed in the order ``emoji.emoji_list`` reports them.

    Non-string input (for example ``None`` or the NaN pandas uses for a
    missing cell) yields an empty list instead of raising, matching the
    ``isinstance`` guard used by ``contains_emoji``.
    """
    if not isinstance(text, str):
        return []
    return [match["emoji"] for match in emoji.emoji_list(text)]

def format_emojis(emojis):
    """Collapse a list of emoji into one space-separated string.

    Only genuine ``list`` objects are joined; any other value is handed back
    unchanged.
    """
    if not isinstance(emojis, list):
        return emojis
    separator = " "
    return separator.join(emojis)

def get_most_least_common_emojis(df, emoji_column="emoji", top_n=1):
    """Return the ``top_n`` most and least frequent emoji in a column.

    Parameters
    ----------
    df : mapping-like (normally a DataFrame)
        ``df[emoji_column]`` must be iterable; each string cell is expected to
        hold whitespace-separated emoji.
    emoji_column : str
        Name of the column to tally.
    top_n : int
        How many entries to return on each side.

    Returns
    -------
    tuple[list, list]
        ``(most_common, least_common)``, each a list of ``(emoji, count)``
        pairs in descending-count order. If fewer than ``top_n`` distinct
        emoji exist, both lists contain the whole ranking.

    Notes
    -----
    * ``top_n <= 0`` yields two empty lists. A plain ``[-top_n:]`` slice
      would return the *entire* ranking for ``top_n == 0``.
    * Cells that are not strings (e.g. NaN for a missing value) are skipped
      rather than crashing on ``.split()``.
    """
    counts = Counter(
        symbol
        for cell in df[emoji_column]
        if isinstance(cell, str)
        for symbol in cell.split()
    )
    if top_n <= 0:
        return [], []
    ranked = counts.most_common()
    return ranked[:top_n], ranked[-top_n:]

def get_most_common_emojis_by_label(df, emoji_column="emoji", label_column="labels", top_n=10):
    """Rank the most frequent emoji separately for each label value.

    For every distinct value in ``df[label_column]`` the whitespace-separated
    emoji in ``df[emoji_column]`` of the matching rows are counted.

    Returns a dict mapping each label value to a list of up to ``top_n``
    ``(emoji, count)`` pairs, most frequent first. Keys follow the order in
    which the labels first appear in the column.
    """
    ranking_by_label = {}
    for label_value in df[label_column].unique():
        # Emoji strings belonging to rows that carry this label.
        cells = df.loc[df[label_column] == label_value, emoji_column]
        tally = Counter()
        for cell in cells:
            tally.update(cell.split())
        ranking_by_label[label_value] = tally.most_common(top_n)
    return ranking_by_label

## Main Processing Function

In [4]:
def process_sushil_dataset(file1, file2, output_csv="dataset/Sushil/train_emoji.csv"):
    """Merge the two Sushil Excel sheets, keep emoji tweets, save and summarise.

    Parameters
    ----------
    file1 : str or path-like
        First workbook. It is read without a header row; its two columns are
        named ``tweets`` and ``labels``. HTML entities in the tweets are
        unescaped and the labels ``'normal'`` / ``'hateful'`` become 0 / 1.
    file2 : str or path-like
        Second workbook, read with its own header row. Its ``labels`` values
        0 and 1 are swapped.
        NOTE(review): assumes this sheet already exposes ``tweets`` and
        ``labels`` columns, and that its label polarity is the inverse of the
        first sheet's - confirm against the data.
    output_csv : str or path-like
        Where the filtered frame is written (without the index).

    Returns
    -------
    pandas.DataFrame
        The rows whose tweet contains at least one emoji, with an extra
        ``emoji`` column holding the tweet's emoji separated by spaces.

    Side effects
    ------------
    Writes ``output_csv`` and prints the overall most/least common emoji and
    the top emoji for every label.
    """
    # Load and clean Dataset 1
    df1 = pd.read_excel(file1, header=None, names=["tweets", "labels"])
    # html.unescape only accepts strings; a non-string cell (e.g. the NaN
    # pandas produces for a blank Excel cell) would raise TypeError. Leave
    # such cells untouched - the emoji filter below discards them anyway.
    df1["tweets"] = df1["tweets"].apply(
        lambda tweet: html.unescape(tweet) if isinstance(tweet, str) else tweet
    )
    df1["labels"] = df1["labels"].replace({'normal': 0, 'hateful': 1})

    # Load and relabel Dataset 2
    df2 = pd.read_excel(file2)
    df2["labels"] = df2["labels"].replace({1: 0, 0: 1})

    # Combine
    df = pd.concat([df1, df2], ignore_index=True)

    # Emoji filtering: keep only tweets with emoji, then record which ones.
    df = df[df["tweets"].apply(contains_emoji)].copy()
    df["emoji"] = df["tweets"].apply(extract_emojis).apply(format_emojis)

    # Save cleaned CSV
    df.to_csv(output_csv, index=False)
    print(f"Saved emoji-processed dataset to: {output_csv}")

    # Emoji analysis
    most_common, least_common = get_most_least_common_emojis(df)
    print("Most common emoji:", most_common)
    print("Least common emoji:", least_common)

    emoji_by_label = get_most_common_emojis_by_label(df)
    for label, items in emoji_by_label.items():
        print(f"\nTop emojis for label {label}: {items}")

    return df

## Train and Test Processing

In [6]:
# === Run ===
# Process both raw workbooks and keep the resulting emoji-only frame.
df_emoji = process_sushil_dataset(
    file1="dataset/Sushil/Dataset 1.xlsx",
    file2="dataset/Sushil/Dataset 2.xlsx",
    output_csv="dataset/Sushil/train_emoji.csv",
)

Saved emoji-processed dataset to: dataset/Sushil/train_emoji.csv
Most common emoji: [('😂', 6869)]
Least common emoji: [('▪', 1)]

Top emojis for label 0: [('😭', 3851), ('😂', 3586), ('😍', 1302), ('😊', 1176), ('🔥', 1128), ('😱', 682), ('❤', 598), ('😘', 528), ('😢', 527), ('😩', 502)]

Top emojis for label 1: [('😂', 3283), ('😭', 1298), ('😍', 431), ('😩', 417), ('🔥', 375), ('🙄', 308), ('💀', 279), ('👅', 251), ('💯', 222), ('😡', 203)]
