# Importing The Required Libraries

In [154]:
import pandas as pd
import numpy as np
import textblob
import sklearn
from sklearn.metrics import accuracy_score
#!pip install textblob
#!python -m textblob.download_corpora
data = pd.read_csv('F:/Datasets/NLP/Project 2/Text_Emotion.csv')

data.head()

Unnamed: 0,text,emotion
0,carefully word blog posts amount criticism hea...,‚òπÔ∏è
1,cannot remember little mermaid feeling carefre...,üôÇ
2,not feeling super well turns cold knocked next...,üôÇ
3,feel honored part group amazing talents,üôÇ
4,think helping also began feel pretty lonely lo...,‚òπÔ∏è


In [155]:
# Cleaning The Emojis with 'Happy': 1, 'Sad': 0
data['emotion'] = data['emotion'].map({
    '‚òπÔ∏è' : 0,
    'üôÇ' : 1
})

data.head()

Unnamed: 0,text,emotion
0,carefully word blog posts amount criticism hea...,0
1,cannot remember little mermaid feeling carefre...,1
2,not feeling super well turns cold knocked next...,1
3,feel honored part group amazing talents,1
4,think helping also began feel pretty lonely lo...,0


# Preprocessing

In [156]:
# Making all text lower case
data['text'] = data['text'].apply(lambda x: x.lower())

# Removing all punctuations and stop words
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')

#Stop words
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

#Punctuations
from string import punctuation
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in punctuation))

data.head()

Unnamed: 0,text,emotion
0,carefully word blog posts amount criticism hea...,0
1,cannot remember little mermaid feeling carefre...,1
2,feeling super well turns cold knocked next thr...,1
3,feel honored part group amazing talents,1
4,think helping also began feel pretty lonely lo...,0


In [157]:
# Lemmatisation
#nltk.download('wordnet')
#nltk.download('omw-1.4')
from textblob import Word
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Correcting letter repetitions
import re
def no_repeats(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

data['text'] = data['text'].apply(lambda x: " ".join(no_repeats(x) for x in x.split()))

data.head()

Unnamed: 0,text,emotion
0,carefully word blog post amount criticism hear...,0
1,cannot remember little mermaid feeling carefre...,1
2,feeling super well turn cold knocked next thre...,1
3,feel honored part group amazing talent,1
4,think helping also began feel pretty lonely lo...,0


In [158]:
def cleaning (text):

IndentationError: expected an indented block (1931334565.py, line 1)

In [159]:
# Finding the rarest words
freq = pd.Series(' '.join(data['text']).split()).value_counts()[-10000:]

# Removing rarely appearing words
freq = list(freq.index)
data['text'] = data['text'].apply(lambda x : ' '.join(x for x in x.split() if x not in freq))

data.head()

Unnamed: 0,text,emotion
0,carefully word blog post amount criticism hear...,0
1,cannot remember little mermaid feeling carefre...,1
2,feeling super well turn cold knocked next thre...,1
3,feel honored part group amazing talent,1
4,think helping also began feel pretty lonely lo...,0


# Splitting Training and Testing Data

In [166]:
from sklearn.model_selection import train_test_split

labels = data['emotion']
data1 = data['text']

x_train, x_test, y_train, y_test = train_test_split(data1, labels, test_size=0.33, shuffle=True)

x_train.head()

275410                           take tablet feel suffering
212955    feel like life shamble since accutane determin...
210948                       abstain feel dull boring small
69462     dislike used patriarchal system way really obj...
52496     suffering depression miserable job feeling pre...
Name: text, dtype: object

# Extracting The TF-IDF Parameters

In [167]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(x_train)
X_val_tfidf = tfidf.fit_transform(x_test)

# Extracting The Count Vector

In [168]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data1)
X_train_count =  count_vect.transform(x_train)
X_val_count =  count_vect.transform(x_test)

# Trying Naive Bayes Classifier

In [169]:
# With Count Vectors
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_test))

# With TF-IDF
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_test))

naive bayes count vectors accuracy 0.9432884755496507
naive bayes count vectors accuracy 0.5540329147559251


# Trying Linear SVM (Support Vector Machine)

In [170]:
# With Count Vectors
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_test))

# With TF-IDF
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_test))

lsvm using count vectors accuracy 0.9359705138644838
naive bayes count vectors accuracy 0.5737046243517765


# Trying Logistic Regression

In [171]:
# With Count Vectors
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_test))

# With TF-IDF
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


log reg count vectors accuracy 0.9585136930527579
naive bayes count vectors accuracy 0.5678223974628209


# Tokenization and Sequencing

In [175]:
#Tokenization & sequencing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(x_train)

sequence_train = tokenizer.texts_to_sequences(x_train)
sequence_test = tokenizer.texts_to_sequences(x_test)

In [176]:
V = len(tokenizer.index_word)
print("Tokenized %s Unique Tokens" %V)

Tokenized 50333 Unique Tokens


In [177]:
#Padding
data_train = pad_sequences(sequence_train)

T = data_train.shape[1]

data_test = pad_sequences(sequence_test, maxlen=T)

data_train

array([[   0,    0,    0, ..., 5259,    1,  362],
       [   0,    0,    0, ...,    9,    4,  111],
       [   0,    0,    0, ...,  298,  290,  526],
       ...,
       [   0,    0,    0, ...,   99,  114,  428],
       [   0,    0,    0, ..., 2670,   38,  448],
       [   0,    0,    0, ...,  922,  834, 9414]])

In [178]:
#Build Model
from tensorflow.keras.layers import Input, Conv1D, Embedding, MaxPooling1D, GlobalMaxPooling1D, Add, Dense, Dropout, LeakyReLU
from tensorflow.keras.models import Model

D = 20

i = Input(shape=(T,))
x = Embedding(V + 1 , D)(i)
x = Conv1D(32, 3, padding='same',activation=LeakyReLU(alpha=0.2))(x)
x = MaxPooling1D(3)(x)
x = Conv1D(64, 3, padding='same', activation=LeakyReLU(alpha=0.2))(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, padding='same', activation=LeakyReLU(alpha=0.2))(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x)
x = Dense(1, activation='relu')(x)

model = Model(i,x)

In [179]:
from tensorflow.keras.optimizers import Adam

model.compile(
    loss = 'binary_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(
        learning_rate=0.0002
    )
)

In [180]:
r = model.fit(
    data_train,
    y_train,
    validation_data=(data_test, y_test),
    epochs = 15
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [181]:
model.evaluate(
    data_test,
    y_test
)



[0.6289677023887634, 0.8926199078559875]