### Importing the dependencies

In [18]:
import os
import requests
import pandas as pd
import emoji
from rich import print
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

import ssl
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Turn off HTTPS certificate verification for the default SSL context so the
# NLTK downloads below can proceed.
# NOTE(review): this is a process-wide override, not scoped to NLTK — confirm
# that is acceptable, or fix the local certificate store instead.
ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the NLTK resources used for tokenizing, stop-word removal and lemmatizing
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers reused by the text pre-processing step
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

from spellchecker import SpellChecker
spell = SpellChecker()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yuvrajsingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuvrajsingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yuvrajsingh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Extracting the data

In [57]:
# List of URLs to download (the three shards of the full GoEmotions dataset)
urls = [
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv"
]

# Create the directory if it doesn't exist
os.makedirs("data/full_dataset", exist_ok=True)

# Download files
for url in urls:
    # A timeout keeps a stalled connection from hanging the kernel indefinitely
    response = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as a .csv file
    response.raise_for_status()

    # Name the local file after the last path segment of the URL
    filename = os.path.join("data/full_dataset", url.split("/")[-1])
    with open(filename, 'wb') as f:
        f.write(response.content)

    print(f"Downloaded: {filename}")


In [2]:
root_path = 'data/full_dataset/'

# Collect one frame per file and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
frames = []

for dirpath, dirnames, filenames in os.walk(root_path):
    # os.walk gives no ordering guarantee; sorting makes the row order reproducible
    dirnames.sort()
    for filename in sorted(filenames):

        # Skip anything that is not a CSV (e.g. hidden files created by the OS)
        if not filename.lower().endswith('.csv'):
            continue

        # Creating the path
        file_path = os.path.join(dirpath, filename)

        # Loading the data using pandas
        raw_df = pd.read_csv(file_path)
        print(raw_df.shape)
        frames.append(raw_df)

# pd.concat raises on an empty list, so fall back to an empty frame
# (which is what the directory walk produced before when no files were found)
df = pd.concat(frames, ignore_index=True, axis=0) if frames else pd.DataFrame()


In [6]:
# Saving the combined data
# to_csv does not create missing parent folders, so make sure the target exists first
os.makedirs('data/Combined_data', exist_ok=True)
df.to_csv('data/Combined_data/Raw_Data.csv', index=False)

In [3]:
# Preview the top of the combined frame (head() returns 5 rows by default)
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Getting the info about the features: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-null  int64  
 15  

In [5]:
# Listing every column name of the combined frame
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [6]:
# Removing extra features (comment metadata and rater bookkeeping columns).
# drop() returns a new frame and errors='ignore' skips columns that are already
# gone, so re-running this cell no longer raises KeyError.
df = df.drop(columns=['id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear'], errors='ignore')

In [7]:
# Percentage of missing values in each column
df.isna().mean().mul(100)

text              0.0
admiration        0.0
amusement         0.0
anger             0.0
annoyance         0.0
approval          0.0
caring            0.0
confusion         0.0
curiosity         0.0
desire            0.0
disappointment    0.0
disapproval       0.0
disgust           0.0
embarrassment     0.0
excitement        0.0
fear              0.0
gratitude         0.0
grief             0.0
joy               0.0
love              0.0
nervousness       0.0
optimism          0.0
pride             0.0
realization       0.0
relief            0.0
remorse           0.0
sadness           0.0
surprise          0.0
neutral           0.0
dtype: float64

In [8]:
# Getting the frequency count for emotions



### Text pre-processing

In [22]:
def preprocess_text(text):
    """
    Preprocesses the input text by performing the following operations:
    1. Decoding emojis to their text representation (one word per name part).
    2. Lowercasing the text.
    3. Removing punctuation from the text.
    4. Tokenizing the text into individual words.
    5. Removing stop words from the tokens.
    6. Correcting spelling mistakes using PySpellChecker.
    7. Lemmatizing the corrected tokens.
    8. Joining the lemmatized tokens back into a single string.

    Parameters:
    text (str): The input text to preprocess.

    Returns:
    str: The preprocessed text.
    """
    # Emoji Decoding
    # The default ':red_heart:' form would be fused into 'redheart' by the
    # punctuation removal below (':' and '_' are both in string.punctuation) and
    # then mangled by the spell checker. Use spaces as delimiters and split the
    # name on underscores so each part survives as a normal word.
    # Side effect: underscores in ordinary text now separate words too.
    text = emoji.demojize(text, delimiters=(' ', ' ')).replace('_', ' ')

    # Lowercasing (done after decoding so emoji names are lowercased as well)
    text = text.lower()

    # Punctuation Removal
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokens = word_tokenize(text)

    # Stop Word Removal
    tokens = [word for word in tokens if word not in stop_words]

    # Spelling Correction using PySpellChecker
    # Look up all known words in one call instead of once per token.
    known_words = spell.known(tokens)
    corrections = {}  # per-call cache so a repeated typo is corrected only once
    corrected_tokens = []
    for word in tokens:
        if word in known_words:
            corrected_tokens.append(word)
            continue
        if word not in corrections:
            # correction() may return None when it finds no candidate;
            # keep the original token rather than passing None downstream.
            corrections[word] = spell.correction(word) or word
        corrected_tokens.append(corrections[word])

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in corrected_tokens]

    # Rejoin tokens into a single string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [23]:
# Clean every comment. Series.map applies the function to each value and does
# not depend on the frame having a 0..n-1 integer index, unlike a positional
# df.at loop over range(len(df)).
df['text'] = df['text'].map(preprocess_text)

# to_csv does not create missing parent folders, so make sure the target exists first
os.makedirs('data/Combined_data', exist_ok=True)

# NOTE(review): the row index is written as the first column here (pandas default),
# whereas Raw_Data.csv is saved with index=False — confirm which is intended.
df.to_csv('data/Combined_data/Clean.csv')