In [1]:
import pandas as pd

# Load all parts
df1 = pd.read_csv("./Dataset/goemotions1.csv")
df2 = pd.read_csv("./Dataset/goemotions2.csv")
df3 = pd.read_csv("./Dataset/goemotions3.csv")

# Combine them
full_df = pd.concat([df1, df2, df3], ignore_index=True)

# Display dataset size
print(f"Total samples: {len(full_df)}")

Total samples: 211225


In [2]:
print(full_df.head())
print(full_df.columns)  # List all column names

                                                text       id  \
0                                    That game hurt.  eew5j0j   
1   >sexuality shouldn’t be a grouping category I...  eemcysk   
2     You do right, if you don't care then fuck 'em!  ed2mah1   
3                                 Man I love reddit.  eeibobj   
4  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1          TheGreen888     unpopularopinion  t3_ai4q37   t3_ai4q37   
2             Labalool          confessions  t3_abru74  t1_ed2m7g7   
3        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
4  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1                 False           0  ...     0   
1  1.548084e+09        37               

In [3]:
# Keep only relevant columns: 'text' + emotion labels
emotion_labels = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

# Select only 'text' and emotion labels
clean_df = full_df[['text'] + emotion_labels]

# Save the cleaned dataset
clean_df.to_csv("goemotions_cleaned.csv", index=False)

# Display cleaned dataset info
print(clean_df.head())
print(f"Total samples after cleaning: {len(clean_df)}")

                                                text  admiration  amusement  \
0                                    That game hurt.           0          0   
1   >sexuality shouldn’t be a grouping category I...           0          0   
2     You do right, if you don't care then fuck 'em!           0          0   
3                                 Man I love reddit.           0          0   
4  [NAME] was nowhere near them, he was by the Fa...           0          0   

   anger  annoyance  approval  caring  confusion  curiosity  desire  ...  \
0      0          0         0       0          0          0       0  ...   
1      0          0         0       0          0          0       0  ...   
2      0          0         0       0          0          0       0  ...   
3      0          0         0       0          0          0       0  ...   
4      0          0         0       0          0          0       0  ...   

   love  nervousness  optimism  pride  realization  relief  remorse 

In [4]:
emotion_counts = clean_df.iloc[:, 1:].sum(axis=1)

# Find rows where more than 2 emotions are marked
more_than_2_emotions = (emotion_counts > 2).sum()

print(f"Total rows with more than 2 emotions: {more_than_2_emotions}")


Total rows with more than 2 emotions: 4807


In [5]:
# Remove rows where more than 2 emotions are marked
df = clean_df[emotion_counts <= 2].reset_index(drop=True)

# Save the final cleaned dataset
df.to_csv("goemotions_filtered.csv", index=False)

# Print updated dataset info
print(f"Total samples after removing multi-emotion rows: {len(df)}")

Total samples after removing multi-emotion rows: 206418


In [6]:
emotion_counts = df.iloc[:, 1:].sum(axis=1)

# Find rows where more than 2 emotions are marked
more_than_2_emotions = (emotion_counts > 2).sum()

print(f"Total rows with more than 2 emotions: {more_than_2_emotions}")


Total rows with more than 2 emotions: 0


In [7]:
# Convert multi-label emotions into a single "emotion" column
df['emotion'] = df[emotion_labels].idxmax(axis=1)

# Keep only 'text' and 'emotion' columns
df = df[['text', 'emotion']]

# Save the final dataset
df.to_csv("goemotions_final.csv", index=False)

# Print sample rows
print(df.head())

                                                text     emotion
0                                    That game hurt.     sadness
1   >sexuality shouldn’t be a grouping category I...  admiration
2     You do right, if you don't care then fuck 'em!     neutral
3                                 Man I love reddit.        love
4  [NAME] was nowhere near them, he was by the Fa...     neutral


In [8]:
import re
from collections import Counter

# 📌 Step 1: Text Cleaning (Keep Important Punctuation)
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9!?.,\s]', '', text)  # Keep only letters, numbers, ! ? . ,
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text.strip()

# Apply cleaning
df['text'] = df['text'].apply(clean_text)

# 📌 Step 2: Tokenization (Splitting words)
df['tokens'] = df['text'].apply(lambda x: x.split())

# 📌 Step 3: Build Vocabulary
all_words = [word for tokens in df['tokens'] for word in tokens]
word_freq = Counter(all_words)
vocab = {word: idx+1 for idx, (word, _) in enumerate(word_freq.most_common())}

# 📌 Step 4: Convert Tokens to Indexes
df['token_ids'] = df['tokens'].apply(lambda tokens: [vocab[word] for word in tokens if word in vocab])

# Print sample
print(df[['text', 'tokens', 'token_ids']].head())

                                                text  \
0                                    that game hurt.   
1  sexuality shouldnt be a grouping category it m...   
2       you do right, if you dont care then fuck em!   
3                                 man i love reddit.   
4  name was nowhere near them, he was by the falcon.   

                                              tokens  \
0                                [that, game, hurt.]   
1  [sexuality, shouldnt, be, a, grouping, categor...   
2  [you, do, right,, if, you, dont, care, then, f...   
3                            [man, i, love, reddit.]   
4  [name, was, nowhere, near, them,, he, was, by,...   

                                           token_ids  
0                                     [8, 172, 3563]  
1  [5344, 597, 19, 4, 8416, 10137, 10, 138, 6, 33...  
2      [6, 44, 868, 32, 6, 33, 321, 109, 207, 12045]  
3                                 [175, 2, 48, 1216]  
4    [11, 15, 3217, 889, 1106, 31, 15, 85, 1, 31994

In [9]:
import numpy as np

# Compute the 95th percentile sequence length
sequence_lengths = [len(seq) for seq in df['token_ids'].tolist()]
max_length = int(np.percentile(sequence_lengths, 95))

print(f"95th percentile max length: {max_length}")

95th percentile max length: 24


In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Apply padding (truncate longer sequences, pad shorter ones)
tokenized_padded = pad_sequences(df['token_ids'].tolist(), 
                                 maxlen=max_length, 
                                 padding='post',  # Pad at the end
                                 truncating='post')  # Truncate at the end

# Convert labels to NumPy array
labels = np.array(df['emotion'].tolist())