# Clean ISEAR Dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install cleantext
# reference: https://pypi.org/project/cleantext/
!pip install nltk



In [None]:
import pandas as pd
import re
from transformers import BertTokenizer, T5Tokenizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import cleantext as clean


In [None]:
%cd /content/drive/MyDrive/NLP Project/Datasets
# you need to create shortcut of our shared folder in "Mydrive". Simply drag our folder and drop into the My drive
isear_data = pd.read_csv('ISEAR.csv')
isear_data.head() # make sure load dataset successfully.
isear_data.columns

/content/drive/.shortcut-targets-by-id/1SRt8-dPa7iCujJFQrsCYNs6JaFOHansl/NLP Project/Datasets


Index(['sentiment', 'content'], dtype='object')

In [None]:
b = isear_data['sentiment'].unique()
b # ISEAR has seven emotions (joy, fear, anger, sadness, disgust, shame, guilt); the raw file also contains the misspelling 'guit'.

array(['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt',
       'guit'], dtype=object)

In [None]:
# Unify the ISEAR labels with the GoEmotions numbering used later in this
# notebook: anger=2, disgust=11, fear=14, joy=17, sadness=25.
# GoEmotions only defines ids 0-27 and has no 'shame' or 'guilt' class, so
# those two ISEAR emotions get the extra ids 28 and 29.
isear_label_mapping = {
    'anger': '2',
    'disgust': '11',
    'fear': '14',
    'joy': '17',
    'sadness': '25',
    'shame': '28',
    'guilt': '29',
}

isear_data.loc[isear_data['sentiment'] == 'guit', 'sentiment'] = 'guilt'  # correct typo

# A single lookup replaces one .loc assignment per emotion; sentiments that are
# not in the mapping end up as NaN, exactly as they did before.
isear_data['label'] = isear_data['sentiment'].map(isear_label_mapping)

isear_data['label'].unique()  # sanity check: only the ids listed above should appear


array(['17', '14', '2', '25', '11', '28', '29'], dtype=object)

In [None]:
# a = isear_data[isear_data['ID'] ==10952]
# a # as you can see there's some text that is not English.


In [None]:
# isear_data = isear_data[['ID', 'content', 'sentiment','label']]
# isear_data.head()

In [None]:
#remove the text that is not english

def remove_non_utf(text):
    """Strip social-media markup and non-ASCII characters from a piece of text.

    Steps, in order: drop @mentions, drop HTML entities such as ``&lt;``, drop
    every non-ASCII character, drop parentheses (keeping their content), drop
    #hashtags, drop whole contractions, drop remaining punctuation except
    apostrophes, and collapse whitespace.

    Args:
        text: The raw text. ``None`` and float NaN (pandas' missing-value
            marker) are treated as empty text; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned text as a single-space-separated string (possibly empty).
    """
    # Missing values used to crash inside re.sub with a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove all mentions (e.g., @username)
    text = re.sub(r'@\w+', '', text)

    # Remove HTML entities (like &lt;, &gt;, etc.)
    text = re.sub(r'&\w+;', '', text)

    # Remove any non-ASCII characters (e.g., emoji or mis-decoded bytes).
    # This also removes typographic apostrophes, so "shouldn’t" becomes
    # "shouldnt" and is not seen as a contraction by the step further down.
    text = text.encode("ascii", "ignore").decode()

    # Remove only the parentheses, but keep the content inside them
    text = text.replace('(', '').replace(')', '')

    # Remove hashtags (e.g., #Scorpio) but keep the rest of the sentence intact
    text = re.sub(r'#\w+', '', text)

    # Remove whole contractions (e.g., I've, you're, it's, can't, don't).
    # NOTE(review): this deletes negations too ("don't care" -> "care"), which
    # can flip the meaning of a sentence — confirm this is intended.
    text = re.sub(r"\b(?:[A-Za-z]+['’][a-z]+)\b", '', text)

    # Remove every character that is not a word character, whitespace or an
    # apostrophe
    text = re.sub(r"[^\w\s']", '', text)

    # Remove extra whitespace and return
    return ' '.join(text.split())



# isear_data['content'] = isear_data['content'].apply(remove_non_utf)

# print(isear_data.head())

In [None]:
# # Function to remove emojis from text
# def remove_emojis(text):
#     # Define the pattern for emojis using regex
#     emoji_pattern = re.compile(
#         "["
#         "\U0001F600-\U0001F64F"  # emoticons
#         "\U0001F300-\U0001F5FF"  # symbols & pictographs
#         "\U0001F680-\U0001F6FF"  # transport & map symbols
#         "\U0001F1E0-\U0001F1FF"  # flags (iOS)
#         "\U00002702-\U000027B0"  # miscellaneous symbols
#         "\U000024C2-\U0001F251"  # enclosed characters
#         "]+", flags=re.UNICODE)

#     # Substitute the emojis in the text
#     return emoji_pattern.sub(r'', text)

In [None]:
# Download NLTK data (stopwords and punkt tokenizer)
nltk.download('stopwords')
nltk.download('punkt')

# Load stopwords (if needed)
# NOTE(review): stop_words is not referenced by clean_dataset below, which asks
# cleantext to drop stop words itself (stopwords=True) — confirm it is still needed.
stop_words = set(stopwords.words('english'))

# Assuming 'isear_data' is your DataFrame containing the dataset

def clean_dataset(dataset):
    """Normalise the ``content`` column of an emotion dataset with cleantext.

    Every row's text is lower-cased and stripped of stop words, digits,
    punctuation and extra whitespace. Rows whose content is missing or blank
    are skipped (and reported), and rows that become empty after cleaning are
    dropped from the result.

    Args:
        dataset: DataFrame with at least the columns ``content``,
            ``sentiment`` and ``label``.

    Returns:
        A new DataFrame with exactly the columns ``content`` (cleaned text),
        ``sentiment`` and ``label``. Other input columns (e.g. ``ID``) are not
        carried over.
    """
    output_columns = ['content', 'sentiment', 'label']
    cleaned_data = []
    count = 0

    # Iterate through each row in the DataFrame
    for index, row in dataset.iterrows():
        raw_content = row['content']
        # str(NaN) / str(None) would turn a missing value into the literal
        # sample text "nan" / "None", so treat missing values as empty instead.
        if pd.api.types.is_scalar(raw_content) and pd.isna(raw_content):
            problem = ''
        else:
            problem = str(raw_content)

        if not problem.strip():
            print(f"Skipping empty content in row {index}")
            count += 1
            continue

        # Clean the text using 'cleantext'
        cleaned_problem = clean.clean(problem,
          clean_all=False,         # Do not run every operation; enable them one by one below
          extra_spaces=True,       # Remove extra white spaces
          stemming=False,          # Stemming disabled (it caused problems)
          stopwords=True,          # Remove stop words
          lowercase=True,          # Convert to lowercase
          numbers=True,            # Remove all digits
          punct=True,              # Remove all punctuation
          stp_lang='english'       # Language for stop words
        )

        cleaned_data.append({
            'content': cleaned_problem,    # Cleaned text
            'sentiment': row['sentiment'], # Keep the original sentiment
            'label': row['label']
        })

    print(f"{count} rows were skipped due to they are empty")

    # Passing the columns explicitly keeps the schema stable even when no row
    # survived; without it the 'content' lookup below raised a KeyError.
    cleaned_df = pd.DataFrame(cleaned_data, columns=output_columns)
    if cleaned_df.empty:
        return cleaned_df

    # Drop rows whose text was reduced to nothing by the cleaning step.
    cleaned_df = cleaned_df[cleaned_df['content'].map(lambda x: len(x) > 0)]
    return cleaned_df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
cleaned_isear = clean_dataset(isear_data)
# Different raw sentences can collapse to the same cleaned text; keep one row
# per distinct cleaned sentence.
cleaned_isear_data = cleaned_isear.drop_duplicates(subset=['content'])
# Use one variable for both writing and reporting so the printed name can no
# longer drift from the file that is actually written.
isear_output_path = 'cleaned_full_isear_data_with_sentiment.csv'
cleaned_isear_data.to_csv(isear_output_path, index=False)
print(f"Cleaned data with sentiment saved to '{isear_output_path}'")

0 rows were skipped due to they are empty
Cleaned data with sentiment saved to 'cleaned_isear_data_with_sentiment.csv'


In [None]:
# Spot-check the first rows of the cleaned, de-duplicated ISEAR data.
print(cleaned_isear_data.head())

                                             content sentiment label
0  days feel close partner friends feel peace als...       joy    17
1  every time imagine someone love could contact ...      fear    14
2  obviously unjustly treated possibility elucida...     anger     2
3  think short time live relate periods life thin...   sadness    25
4  gathering found involuntarily sitting next two...   disgust    11


# Clean GoEmotions

In [None]:
# Read the raw GoEmotions train split (a parquet file) directly from the Hugging Face Hub via its hf:// URL.
goEmo_dataset_full = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/raw/train-00000-of-00001.parquet")
goEmo_dataset_full.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# let's change the dataset format into text, id, and label

# GoEmotions one-hot emotion columns, in the order that defines the label ids.
emotion_columns = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
                   'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
                   'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
                   'relief', 'remorse', 'sadness', 'surprise', 'neutral']


def map_emotion_lable(row, emotions=tuple(emotion_columns)):
    """Collapse a one-hot encoded emotion row into a single emotion name.

    Args:
        row: Mapping (e.g. a DataFrame row) from emotion name to a 0/1 flag.
        emotions: Emotion names to check, in priority order. The default is
            the GoEmotions list captured when this function is defined, so
            rebinding the global ``emotion_columns`` later in the notebook does
            not change what this function checks.

    Returns:
        The first emotion in ``emotions`` whose flag equals 1, or ``'neutral'``
        when none is set. Rows with several flags set keep only the first one.
    """
    for emotion in emotions:
        if row[emotion] == 1:
            return emotion
    return 'neutral'


# Collapse the one-hot emotion columns into a single 'sentiment' column.
goEmo_dataset_full['sentiment'] = goEmo_dataset_full.apply(map_emotion_lable, axis=1)
# .copy() makes goEmo_df an independent frame, so adding or overwriting columns
# on it later does not trigger pandas' SettingWithCopyWarning.
goEmo_df = goEmo_dataset_full[['text', 'id', 'sentiment']].copy()
goEmo_df.head()


Unnamed: 0,text,id,sentiment
0,That game hurt.,eew5j0j,sadness
1,>sexuality shouldn’t be a grouping category I...,eemcysk,neutral
2,"You do right, if you don't care then fuck 'em!",ed2mah1,neutral
3,Man I love reddit.,eeibobj,love
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,neutral


In [None]:
# Define the mapping for emotions to numeric labels in string format
# GoEmotions emotion name -> numeric label id (kept as strings).
emotion_mapping = {
    'admiration': '0',
    'amusement': '1',
    'anger': '2',
    'annoyance': '3',
    'approval': '4',
    'caring': '5',
    'confusion': '6',
    'curiosity': '7',
    'desire': '8',
    'disappointment': '9',
    'disapproval': '10',
    'disgust': '11',
    'embarrassment': '12',
    'excitement': '13',
    'fear': '14',
    'gratitude': '15',
    'grief': '16',
    'joy': '17',
    'love': '18',
    'nervousness': '19',
    'optimism': '20',
    'pride': '21',
    'realization': '22',
    'relief': '23',
    'remorse': '24',
    'sadness': '25',
    'surprise': '26',
    'neutral': '27'
}

# Build new frames instead of assigning through .loc on a column subset of the
# raw frame, which raised pandas' SettingWithCopyWarning.
goEmo_df = goEmo_df.assign(label=goEmo_df['sentiment'].map(emotion_mapping))
goEmo_df = goEmo_df.rename(columns={'text': 'content', 'id': 'ID'})
# The trailing .copy() keeps later in-place column updates on goEmo_df warning-free.
goEmo_df = goEmo_df[['ID', 'content', 'sentiment', 'label']].copy()

print(goEmo_df.head())


        ID                                            content sentiment label
0  eew5j0j                                    That game hurt.   sadness    25
1  eemcysk   >sexuality shouldn’t be a grouping category I...   neutral    27
2  ed2mah1     You do right, if you don't care then fuck 'em!   neutral    27
3  eeibobj                                 Man I love reddit.      love    18
4  eda6yn6  [NAME] was nowhere near them, he was by the Fa...   neutral    27


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goEmo_df.loc[:, 'label'] = goEmo_df['sentiment'].map(emotion_mapping)


In [None]:
# Strip mentions, hashtags, HTML entities, non-ASCII characters and punctuation before the cleantext pass.
# NOTE(review): remove_non_utf also deletes whole contractions, so negations such as "don't" disappear from the text.
goEmo_df['content'] = goEmo_df['content'].apply(remove_non_utf)
print(goEmo_df.head(30))


         ID                                            content  \
0   eew5j0j                                     That game hurt   
1   eemcysk  sexuality shouldnt be a grouping category It m...   
2   ed2mah1             You do right if you care then fuck 'em   
3   eeibobj                                  Man I love reddit   
4   eda6yn6    NAME was nowhere near them he was by the Falcon   
5   eespn2i  Right Considering its such an important docume...   
6   eczuekb  He as big but still quite popular heard the sa...   
7   ed5tx8y  crazy I went to a super RELIGION high school a...   
8   ef961hv                                       adorable asf   
9   edl7cr3  Sponge Blurb Pubs Quaw Haha GURR ha AAa finale...   
10  ed9w1hm  I have and now that you mention it I think wha...   
11  ee52cjs  I wanted to downvote this but not your fault h...   
12  ef7tl7i                                BUT IT'S HER TURN s   
13  ee9vw9t                                        That is odd   
14  edsqvy

In [None]:
# Run the shared cleaning function; it prints the index of every row it skips as empty.
cleaned_goEmo = clean_dataset(goEmo_df)
# NOTE(review): clean_dataset returns only content/sentiment/label, so the 'ID' column is not written to this CSV.
cleaned_goEmo.to_csv('cleaned_goEmotion_with_sentiment.csv', index=False)
print("Cleaned data with sentiment saved to 'cleaned_goEmotion_with_sentiment.csv'")

Skipping empty content in row 503
Skipping empty content in row 584
Skipping empty content in row 1320
Skipping empty content in row 4726
Skipping empty content in row 6016
Skipping empty content in row 6664
Skipping empty content in row 10657
Skipping empty content in row 16910
Skipping empty content in row 18483
Skipping empty content in row 20020
Skipping empty content in row 20785
Skipping empty content in row 21200
Skipping empty content in row 22730
Skipping empty content in row 26790
Skipping empty content in row 27168
Skipping empty content in row 33222
Skipping empty content in row 41516
Skipping empty content in row 45209
Skipping empty content in row 51501
Skipping empty content in row 52978
Skipping empty content in row 54612
Skipping empty content in row 55640
Skipping empty content in row 59690
Skipping empty content in row 63452
Skipping empty content in row 65654
Skipping empty content in row 66808
Skipping empty content in row 68713
Skipping empty content in row 69184


In [None]:
# Number of GoEmotions rows that survived cleaning.
print(len(cleaned_goEmo))

210680


# Clean SemEval-18

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
from datasets import load_dataset

# Load the dataset through the loading script stored in the project folder.
# trust_remote_code=True answers the "run the custom code? [y/N]" prompt up
# front, so the notebook can run unattended. Only do this for a loading script
# you have reviewed, since the script is executed locally.
dataset = load_dataset(
    '/content/drive/MyDrive/NLP Project/Datasets/sem_eval_2018_task_1.py',
    'subtask5.english',
    trust_remote_code=True,
)

# Check the dataset structure
print(dataset['train'][0])  # Print the first sample from the training set


The repository for sem_eval_2018_task_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/sem_eval_2018_task_1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/5.98M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

{'ID': '2017-En-21441', 'Tweet': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", 'anger': False, 'anticipation': True, 'disgust': False, 'fear': False, 'joy': False, 'love': False, 'optimism': True, 'pessimism': False, 'sadness': False, 'surprise': False, 'trust': True}


In [None]:
# Pull the three splits out of the loaded dataset.
train_dataset, test_dataset, validation_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

# Turn every split into its own pandas DataFrame.
train_df, test_df, validation_df = (
    pd.DataFrame(split_data)
    for split_data in (train_dataset, test_dataset, validation_dataset)
)

# Stack train, test and validation (in that order) under a fresh 0..n-1 index.
sem_eval18 = pd.concat([train_df, test_df, validation_df], ignore_index=True)


In [None]:
# Preview the combined SemEval-18 frame: one row per tweet with one boolean column per emotion.
sem_eval18.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,False,True,False,False,False,False,True,False,False,False,True
1,2017-En-31535,Whatever you decide to do make sure it makes y...,False,False,False,False,True,True,True,False,False,False,False
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,True,False,True,False,True,False,True,False,False,False,False
3,2017-En-31436,Accept the challenges so that you can literall...,False,False,False,False,True,False,True,False,False,False,False
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,True,False,True,False,False,False,False,False,False,False,False


In [None]:
# List the emotion columns
emotion_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Create an empty DataFrame to hold the transformed data
rows = []

# Iterate over each row in the dataset
for index, row in sem_eval18.iterrows():
    # For each row, check which emotions are True
    for emotion in emotion_columns:
        if row[emotion]:  # If the emotion is True
            # Create a new row with the ID, content, and the current emotion as label
          rows.append({
                'ID': row['ID'],
                'content': row['Tweet'],
                'sentiment': emotion
            })

# Remove rows where the label is 'trust', 'pessimism', or 'anticipation'
expanded_dataset = pd.DataFrame(rows)
sem_eval18 = expanded_dataset[~expanded_dataset['sentiment'].isin(['trust', 'pessimism', 'anticipation'])]


# Check the new dataset
print(sem_eval18.head())

              ID                                            content sentiment
1  2017-En-21441  “Worry is a down payment on a problem you may ...  optimism
3  2017-En-31535  Whatever you decide to do make sure it makes y...       joy
4  2017-En-31535  Whatever you decide to do make sure it makes y...      love
5  2017-En-31535  Whatever you decide to do make sure it makes y...  optimism
6  2017-En-21068  @Max_Kellerman  it also helps that the majorit...     anger


In [None]:
# Define the mapping for emotions to numeric labels in string format
#for sem eval we have
#['anger', 'disgust', 'fear', 'joy', 'love', 'optimism',  'sadness', 'surprise']

# SemEval-18 emotion name -> label id, reusing the GoEmotions numbering.
# NOTE: this rebinds the global name `emotion_mapping` used by the GoEmotions cells.
emotion_mapping = {
    'anger': '2',
    'disgust': '11',
    'fear': '14',
    'joy': '17',
    'love': '18',
    'optimism': '20',
    'sadness': '25',
    'surprise': '26'
}

# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# by writing through .loc on a filtered slice.
sem_eval18 = sem_eval18.assign(label=sem_eval18['sentiment'].map(emotion_mapping))

print(sem_eval18.head())

              ID                                            content sentiment  \
1  2017-En-21441  “Worry is a down payment on a problem you may ...  optimism   
3  2017-En-31535  Whatever you decide to do make sure it makes y...       joy   
4  2017-En-31535  Whatever you decide to do make sure it makes y...      love   
5  2017-En-31535  Whatever you decide to do make sure it makes y...  optimism   
6  2017-En-21068  @Max_Kellerman  it also helps that the majorit...     anger   

  label  
1    20  
3    17  
4    18  
5    20  
6     2  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sem_eval18.loc[:, 'label'] = sem_eval18['sentiment'].map(emotion_mapping)


In [None]:
cleaned_sem = clean_dataset(sem_eval18)
# One variable for both writing and reporting keeps the message and the file in sync.
sem_output_path = 'cleaned_semEval18_data_with_sentiment.csv'
cleaned_sem.to_csv(sem_output_path, index=False)
# Same wording as the ISEAR / GoEmotions cells; the old message had a stray quote.
print(f"Cleaned data with sentiment saved to '{sem_output_path}'")

0 rows were skipped due to they are empty
cleaned_semEval18_data_with_sentiment.csv'


In [None]:
import pandas as pd

In [None]:
# List the files in the current working directory.
!ls

In [None]:
# Load the cleaned ISEAR CSV for the class-balance check.
# NOTE(review): this absolute path differs from the '/content/drive/MyDrive/NLP Project/Datasets'
# folder used by the earlier cells and looks specific to one Drive layout — confirm it before running.
dataset_path = "/content/drive/MyDrive/Uni/Semesters/2024 Fall/CS 7650 (NLP)/NLP Project/Datasets/cleaned_full_isear_data_with_sentiment.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Display the loaded frame.
df

In [None]:
def check_class_imbalance(df):
    """Print and bar-plot the number of rows per ``sentiment`` value.

    Args:
        df: DataFrame with a ``sentiment`` column.
    """
    counts = df['sentiment'].value_counts()

    # Text report: heading line followed by the per-class counts.
    print("Sentiment Distribution:", counts, sep="\n")

    # Visual report of the same counts.
    counts.plot(kind='bar', title='Sentiment Distribution', ylabel='Count')


# Run the check on the frame loaded above.
check_class_imbalance(df)