In [36]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical



import re

print("Tensorflow Version",tf.__version__)

Tensorflow Version 2.16.1


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/physics/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [4]:
df_train = pd.read_csv("dataset/train.txt",
                 delimiter=';', header=None, names=['sentence','label'])

df_valid = pd.read_csv("dataset/val.txt",
                 delimiter=';', header=None, names=['sentence','label'])

df_test  = pd.read_csv("dataset/test.txt",
                 delimiter=';', header=None, names=['sentence','label'])

df_train.head()

Unnamed: 0,sentence,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
df_train.label.value_counts()

label
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [7]:
df_train = df_train[~df_train['label'].str.contains('love')]
df_train = df_train[~df_train['label'].str.contains('surprise')]

df_train.label.value_counts()

label
joy        5362
sadness    4666
anger      2159
fear       1937
Name: count, dtype: int64

In [8]:
df_valid = df_valid[~df_valid['label'].str.contains('love')]
df_valid = df_valid[~df_valid['label'].str.contains('surprise')]

df_test = df_test[~df_test['label'].str.contains('love')]
df_test = df_test[~df_test['label'].str.contains('surprise')]

split onto X, y

In [9]:
train_txt = df_train['sentence']
train_label = df_train['label']

valid_txt = df_valid['sentence']
valid_label = df_valid['label']

test_txt = df_test['sentence']
test_label = df_test['label']

In [10]:
train_txt.head()

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
4                                 i am feeling grouchy
5    ive been feeling a little burdened lately wasn...
Name: sentence, dtype: object

In [15]:
def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    return text

def remove_stopwords(text, do_stem=False):
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if do_stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

def preprocess_text(text, do_stem=False):
    text = clean_text(text)
    text = remove_stopwords(text, do_stem)
    return text

train_txt.apply(lambda x: preprocess_text(x, do_stem=True)).head()

0                                    didnt feel humili
1    go feel hopeless damn hope around someon care ...
2                 im grab minut post feel greedi wrong
4                                         feel grouchi
5                ive feel littl burden late wasnt sure
Name: sentence, dtype: object

In [17]:
train_txt = train_txt.apply(lambda x: preprocess_text(x, do_stem=True))
valid_txt = valid_txt.apply(lambda x: preprocess_text(x, do_stem=True))
test_txt  = test_txt.apply(lambda x: preprocess_text(x, do_stem=True))

In [21]:
encoder = LabelEncoder()
train_label = encoder.fit_transform(train_label)
valid_label = encoder.transform(valid_label)
test_label = encoder.transform(test_label)

train_label[:5], valid_label[:5], test_label[:5]

(array([3, 3, 0, 0, 3]), array([3, 3, 0, 2, 2]), array([3, 3, 3, 2, 3]))

bag of words

In [34]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_txt)
tokenizer.word_index

tokenizer.texts_to_matrix(train_txt, mode='count')


3.0

In [37]:
train_X = tokenizer.texts_to_matrix(train_txt, mode='count')
valid_X = tokenizer.texts_to_matrix(valid_txt, mode='count')
test_X  = tokenizer.texts_to_matrix(test_txt, mode='count')

train_y = to_categorical(train_label)
valid_y = to_categorical(valid_label)
test_y = to_categorical(test_label)


In [38]:
train_X.shape, valid_X.shape, test_X.shape

((14124, 9541), (1741, 9541), (1775, 9541))

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the architecture of the DNN
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(train_X.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(4, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_X, train_y, validation_data=(valid_X, valid_y), epochs=10, batch_size=32, callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)])

# Evaluate the model
test_loss, test_acc = model.evaluate(test_X, test_y)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


2024-07-11 01:02:48.174874: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 539028336 exceeds 10% of free system memory.


[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.6007 - loss: 0.9877 - val_accuracy: 0.8817 - val_loss: 0.3411
Epoch 2/10
[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9532 - loss: 0.1487 - val_accuracy: 0.8903 - val_loss: 0.3270
Epoch 3/10
[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9789 - loss: 0.0708 - val_accuracy: 0.8914 - val_loss: 0.3562
Epoch 4/10
[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9865 - loss: 0.0448 - val_accuracy: 0.8914 - val_loss: 0.4201
Epoch 5/10
[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9900 - loss: 0.0298 - val_accuracy: 0.8834 - val_loss: 0.4572
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 839us/step - accuracy: 0.8676 - loss: 0.4768
Test Loss: 0.4580456018447876
Test Accuracy: 0.8805633783340454
