<a href="https://colab.research.google.com/github/therisbh/Emotion-Recognition/blob/main/nlp_emotion_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# You can install Packages

! pip install nltk
!pip install -U scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.8.0


In [2]:
# Downloading Corpus from NLTK

import nltk
from nltk.corpus import treebank

# Download the required dataset
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [3]:
# Load the Treebank corpus for training
treebank_sents = treebank.tagged_sents()

# Printing a sample
treebank_sents[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [4]:
# Using nltk tools

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [6]:
#splitting data into train and test
random.seed(1234)
train_dataset, test_dataset = train_test_split(treebank_sents,test_size=0.2)

print(len(train_dataset))
print(len(test_dataset))
print(train_dataset[:5])

3131
783
[[('Could', 'MD'), ('rising', 'VBG'), ('volatility', 'NN'), ('possibly', 'RB'), ('be', 'VB'), ('related', 'VBN'), ('to', 'TO'), ('uncertainty', 'NN'), ('about', 'IN'), ('the', 'DT'), ('economics', 'NNS'), ('of', 'IN'), ('stocks', 'NNS'), (',', ','), ('instead', 'RB'), ('of', 'IN'), ('the', 'DT'), ('evil', 'JJ'), ('deeds', 'NNS'), ('of', 'IN'), ('program-trading', 'NN'), ('goblins', 'NNS'), ('?', '.')], [('Columbia', 'NNP'), ('has', 'VBZ'), ('only', 'RB'), ('about', 'IN'), ('10', 'CD'), ('million', 'CD'), ('common', 'JJ'), ('shares', 'NNS'), ('in', 'IN'), ('public', 'JJ'), ('hands', 'NNS'), ('.', '.')], [('Another', 'DT'), ('OTC', 'NNP'), ('bank', 'NN'), ('stock', 'NN'), ('involved', 'VBN'), ('*', '-NONE-'), ('in', 'IN'), ('a', 'DT'), ('buy-out', 'NN'), ('deal', 'NN'), (',', ','), ('First', 'NNP'), ('Constitution', 'NNP'), ('Financial', 'NNP'), (',', ','), ('was', 'VBD'), ('higher', 'JJR'), ('.', '.')], [('Futures', 'NNS'), ('prices', 'NNS'), ('rose', 'VBD'), (',', ','), ('*-1'

In [7]:
# Extract words and tags from data

def extract_words_tags(data):
    words = []
    tags = []
    for sentence in data:
        for word, tag in sentence:
            words.append(word.lower())  # Convert to lower case
            tags.append(tag)
    return words, tags

In [8]:
train_words, train_tags = extract_words_tags(train_dataset)
test_words, test_tags = extract_words_tags(test_dataset)

In [9]:
from collections import defaultdict, Counter

# Initialize counters
transition_counts = defaultdict(Counter)
emission_counts = defaultdict(Counter)
tag_counts = Counter()

In [10]:
# Populate the counts
for sentence in train_dataset:
    prev_tag = None
    for word, tag in sentence:
        emission_counts[tag][word.lower()] += 1
        tag_counts[tag] += 1
        if prev_tag is not None:
            transition_counts[prev_tag][tag] += 1
        prev_tag = tag

In [11]:
# Convert counts to probabilities
transition_probs = defaultdict(lambda: defaultdict(lambda: 1e-10))  # Small value for unseen transitions
emission_probs = defaultdict(lambda: defaultdict(lambda: 1e-10))   # Small value for unseen emissions

In [12]:
for prev_tag, next_tags in transition_counts.items():
    total_count = sum(next_tags.values())
    for next_tag, count in next_tags.items():
        transition_probs[prev_tag][next_tag] = count / total_count

In [13]:
for tag, words in emission_counts.items():
    total_count = tag_counts[tag]
    for word, count in words.items():
        emission_probs[tag][word] = count / total_count

In [14]:
def viterbi_algorithm(sentence):
    sentence = [word.lower() for word in sentence]  # Convert to lower case
    n = len(sentence)
    tags = list(tag_counts.keys())

    # Initialize the Viterbi and backpointer tables
    V = [{} for _ in range(n)]
    backpointer = [{} for _ in range(n)]

    # Initialization step
    for tag in tags:
        emission_prob = emission_probs[tag].get(sentence[0], 1e-10)
        V[0][tag] = emission_prob
        backpointer[0][tag] = None  # No backpointer for the first word

    # Dynamic programming step
    for t in range(1, n):
        for curr_tag in tags:
            max_prob, best_prev_tag = max(
                (V[t-1][prev_tag] * transition_probs[prev_tag][curr_tag] * emission_probs[curr_tag].get(sentence[t], 1e-10), prev_tag)
                for prev_tag in tags
            )
            V[t][curr_tag] = max_prob
            backpointer[t][curr_tag] = best_prev_tag

    # Backtracking step to find the most likely sequence
    best_last_tag = max(V[-1], key=V[-1].get)
    best_path = [best_last_tag]

    for t in range(n-1, 0, -1):
        best_path.insert(0, backpointer[t][best_path[0]])

    return best_path


In [15]:
# Predict tags for the test dataset
def predict_tags(test_dataset):
    predicted_tags = []
    true_tags = []

    for sentence in test_dataset:
        words, tags = zip(*sentence)
        predicted_sequence = viterbi_algorithm(words)

        predicted_tags.append(predicted_sequence)
        true_tags.append(tags)

    return predicted_tags, true_tags

# Calculate accuracy
def calculate_accuracy(predicted_tags, true_tags):
    correct = 0
    total = 0

    for predicted, true in zip(predicted_tags, true_tags):
        total += len(true)
        correct += sum(p == t for p, t in zip(predicted, true))

    accuracy = correct / total
    return accuracy

# Run the testing and accuracy calculation
predicted_tags, true_tags = predict_tags(test_dataset)
accuracy = calculate_accuracy(predicted_tags, true_tags)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 92.14%


In [16]:
!pip install datasets
from datasets import load_dataset



In [17]:
ds= load_dataset("dair-ai/emotion","split")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md: 0.00B [00:00, ?B/s]

split/train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

split/validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

split/test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [18]:
ds['train'][:10]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy',
  'ive been feeling a little burdened lately wasnt sure why that was',
  'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny',
  'i feel as confused about life as a teenager or as jaded as a year old man',
  'i have been with petronas for years i feel that petronas has performed well and made a huge profit',
  'i feel romantic too'],
 'label': [0, 0, 3, 2, 3, 0, 5, 4, 1, 2]}

In [19]:
#loading train, test and validation data
train_dataset=ds['train']
val_dataset=ds['validation']
test_dataset=ds['test']

In [20]:
#converting datasets to dataframes
train_df = pd.DataFrame(train_dataset)
val_df = pd.DataFrame(val_dataset)
test_df = pd.DataFrame(test_dataset)

In [21]:
#using tfidf for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(stop_words='english', max_features=5500)

# using fit and transform on the training data, and transform validation and test data
X_train = tfidf.fit_transform(train_df['text'])
X_val = tfidf.transform(val_df['text'])
X_test = tfidf.transform(test_df['text'])

y_train = train_df['label']
y_val = val_df['label']
y_test = test_df['label']

In [22]:
#training a classical Classifier (Naive Bayes or SVM from sklearn)
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
# training the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [23]:
# prediction on validation and test datasets
val_predictions = nb_classifier.predict(X_val)
test_predictions = nb_classifier.predict(X_test)

In [24]:
# evaluation of predicted model
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))
print("Test Accuracy:", accuracy_score(y_test, test_predictions))


Validation Accuracy: 0.744
Test Accuracy: 0.7595


In [25]:
#classification report
print("\nValidation Classification Report:\n", classification_report(y_val, val_predictions))
print("\nTest Classification Report:\n", classification_report(y_test, test_predictions))


Validation Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.93      0.81       550
           1       0.71      0.97      0.82       704
           2       1.00      0.19      0.31       178
           3       0.96      0.55      0.70       275
           4       0.89      0.47      0.62       212
           5       0.88      0.09      0.16        81

    accuracy                           0.74      2000
   macro avg       0.86      0.53      0.57      2000
weighted avg       0.79      0.74      0.71      2000


Test Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.92      0.82       581
           1       0.72      0.98      0.83       695
           2       1.00      0.23      0.38       159
           3       0.95      0.55      0.70       275
           4       0.89      0.49      0.63       224
           5       1.00      0.03      0.06        66

    accurac

In [28]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [29]:
# POS Tagging the Emotion Dataset
def pos_tag_sentence(sentence, viterbi_algo):
    return viterbi_algo(sentence)

# Apply POS tagging to the entire dataset
train_df['pos_tags'] = train_df['text'].apply(lambda x: pos_tag_sentence(word_tokenize(x), viterbi_algorithm))
val_df['pos_tags'] = val_df['text'].apply(lambda x: pos_tag_sentence(word_tokenize(x), viterbi_algorithm))
test_df['pos_tags'] = test_df['text'].apply(lambda x: pos_tag_sentence(word_tokenize(x), viterbi_algorithm))

# Creating POS Tag Features
def extract_pos_features(pos_tags, all_tags):
    tag_freq = Counter(pos_tags)
    return [tag_freq[tag] / len(pos_tags) for tag in all_tags]

all_tags = list(tag_counts.keys())
train_pos_features = np.array([extract_pos_features(tags, all_tags) for tags in train_df['pos_tags']])
val_pos_features = np.array([extract_pos_features(tags, all_tags) for tags in val_df['pos_tags']])
test_pos_features = np.array([extract_pos_features(tags, all_tags) for tags in test_df['pos_tags']])

# Integrating POS Tag Features with TF-IDF Embeddings
X_train_combined = np.hstack([X_train.toarray(), train_pos_features])
X_val_combined = np.hstack([X_val.toarray(), val_pos_features])
X_test_combined = np.hstack([X_test.toarray(), test_pos_features])


In [30]:
# Training the Naive Bayes Classifier with Combined Features
nb_classifier.fit(X_train_combined, y_train)

# Prediction on Validation and Test Datasets
val_predictions = nb_classifier.predict(X_val_combined)
test_predictions = nb_classifier.predict(X_test_combined)

# Evaluation of the Combined Model
print("Validation Accuracy with POS Tags:", accuracy_score(y_val, val_predictions))
print("Test Accuracy with POS Tags:", accuracy_score(y_test, test_predictions))

Validation Accuracy with POS Tags: 0.7245
Test Accuracy with POS Tags: 0.7425
