### Importing the dependencies

In [7]:
import os
import requests
import pandas as pd
from rich import print

### Extracting the data

In [5]:
# Public GoEmotions CSV shards to download
urls = [
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv"
]

# Create the target directory if it doesn't exist
download_dir = "data/full_dataset"
os.makedirs(download_dir, exist_ok=True)

# Download files
for url in urls:
    # The local file keeps the last path segment of the URL as its name
    filename = os.path.join(download_dir, url.split("/")[-1])

    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as a .csv
    response.raise_for_status()

    with open(filename, 'wb') as f:
        f.write(response.content)

    print(f"Downloaded: {filename}")


Downloaded: data/full_dataset/goemotions_1.csv
Downloaded: data/full_dataset/goemotions_2.csv
Downloaded: data/full_dataset/goemotions_3.csv


In [14]:
root_path = 'data/full_dataset/'

# Collect one frame per file and concatenate once at the end; calling
# pd.concat inside the loop re-copies every previously loaded row each time.
frames = []

for dirpath, dirnames, filenames in os.walk(root_path):
    # os.walk yields entries in arbitrary order; sorting (in place for the
    # directories) makes the row order of the combined frame reproducible.
    dirnames.sort()
    for filename in sorted(filenames):

        # Only CSV files are data; skip anything else found in the folder
        if not filename.lower().endswith('.csv'):
            continue

        # Creating the path
        file_path = os.path.join(dirpath, filename)

        # Loading the data using pandas
        raw_df = pd.read_csv(file_path)
        print(raw_df.shape)
        frames.append(raw_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# no CSV file was found.
df = pd.concat(frames, ignore_index=True, axis=0) if frames else pd.DataFrame()


In [15]:
# Previewing the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Getting the info about the features: index range, column names,
# non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-null  int64  
 15  

In [17]:
# Listing all column names of the combined frame
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [19]:
# Removing extra features (comment/rater metadata columns).
# The result is reassigned instead of using inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell does not raise a KeyError.
df = df.drop(
    columns=['id', 'author', 'subreddit', 'link_id', 'parent_id',
             'created_utc', 'rater_id', 'example_very_unclear'],
    errors='ignore',
)

In [None]:
# TODO: get the frequency count for emotions (this cell is still empty)



In [20]:
# Displaying the frame; the notebook output shows only a truncated preview
df

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211220,I would cheer hard for a Cyborg win in that to...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,"That's scalie, loli, and gmilf at the same time",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
211222,I have the worst memory ever. Idk if it's from...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211223,So angry. PMS?,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
