In [1]:
# REF https://thecleverprogrammer.com/2021/02/19/text-emotions-detection-with-machine-learning/
# REF https://thecleverprogrammer.com/2021/03/21/end-to-end-machine-learning-project/

In [2]:
import re 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
def read_data(file):
    """Load labelled sentences from a bracket-prefixed text file.

    Each non-blank line is expected to look like ``[<label tokens>] <text>``.
    The label is everything between the first character and the first ``]``
    with runs of whitespace collapsed to single spaces; the text is whatever
    follows that ``]``, stripped of surrounding whitespace.

    Args:
        file: Path of the file to read.

    Returns:
        A list of ``[label, text]`` pairs in file order.

    Raises:
        ValueError: If a non-blank line contains no ``]``.
    """
    data = []
    with open(file, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            close = line.find("]")
            if close == -1:
                # Without this guard find() returns -1 and the slices below
                # silently yield a garbage label plus the whole line as text.
                raise ValueError(
                    "line {} of {!r} has no ']' label terminator".format(line_no, file))
            label = ' '.join(line[1:close].split())
            text = line[close + 1:].strip()
            data.append([label, text])
    return data

# Corpus path; being relative, it is resolved against the current working directory.
file = 'text.txt'
# Parse the corpus into a list of [label, text] pairs.
data = read_data(file)
print("Number of instances: {}".format(len(data)))

Number of instances: 7480


In [4]:
def ngram(token, n):
    """Return every run of ``n`` consecutive items of ``token`` joined by spaces.

    ``token`` is a sequence of strings. Windows are listed left to right; for
    a positive ``n`` larger than ``len(token)`` the result is an empty list.
    """
    window_ends = range(n - 1, len(token))
    return [' '.join(token[end - n + 1:end + 1]) for end in window_ends]

In [5]:
def create_feature(text, nrange=(1, 1)):
    """Count n-gram and punctuation features for one piece of text.

    The text is lower-cased first. Characters outside ``[a-z0-9#]`` are
    replaced by spaces and the result is split on whitespace into word tokens;
    for every ``n`` from ``nrange[0]`` through ``nrange[1]`` (inclusive) the
    word n-grams are counted. Separately, characters in ``[a-z0-9]`` are
    blanked out and each remaining whitespace-delimited chunk is counted as a
    feature of its own.

    Returns:
        A ``Counter`` mapping each feature string to its number of occurrences.
    """
    lowered = text.lower()
    words = re.sub('[^a-z0-9#]', ' ', lowered).split()
    symbols = re.sub('[a-z0-9]', ' ', lowered).split()

    counts = Counter()
    for size in range(nrange[0], nrange[1] + 1):
        counts.update(ngram(words, size))
    # The non-alphanumeric chunks are only ever used as unigrams.
    counts.update(ngram(symbols, 1))
    return counts

In [6]:
def convert_label(item, name):
    """Translate a whitespace-separated indicator string into label names.

    ``item`` is split on whitespace and parsed as floats; every position whose
    value equals 1 selects the entry of ``name`` at the same index. The
    selected names are returned joined by single spaces, or an empty string
    when no position is set.
    """
    flags = [float(piece) for piece in item.split()]
    chosen = [name[pos] for pos, flag in enumerate(flags) if flag == 1]
    return " ".join(chosen).strip()

In [7]:
# Emotion names, indexed by position in the one-hot label vector
# (convert_label looks names up by that index).
emotions = [
    "joy",
    "fear",
    "anger",
    "sadness",
    "disgust",
    "shame",
    "guilt",
]

In [8]:
# Feature counters (1- to 4-grams plus punctuation chunks), one per sample.
X_all = [create_feature(sample_text, nrange=(1, 4)) for _, sample_text in data]
# Emotion name for each sample, in the same order as X_all.
y_all = [convert_label(raw_label, emotions) for raw_label, _ in data]

In [9]:
# Hold out a test_size=0.2 share of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 123)

In [10]:
def train_test(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and score it on both splits.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Features and labels used for fitting.
        X_test, y_test: Held-out features and labels.

    Returns:
        ``(train_acc, test_acc)`` as computed by ``accuracy_score``.
    """
    clf.fit(X_train, y_train)
    splits = ((X_train, y_train), (X_test, y_test))
    # Score the training split first, then the held-out split.
    return tuple(
        accuracy_score(truth, clf.predict(features))
        for features, truth in splits
    )

In [11]:
from sklearn.feature_extraction import DictVectorizer

# Turn the per-sample feature counters into a sparse matrix.
vectorizer = DictVectorizer(sparse = True)
# Fit on the training counters only; the test counters are transformed with that
# same fitted vectorizer.
# NOTE(review): X_train / X_test are rebound from counters to matrices here, so
# re-running this cell without re-running the split feeds matrices back in.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [12]:
# Candidate classifiers compared in the training loop.
svc = SVC()
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
# Seeded like the other randomised estimators: an unseeded tree can break
# split ties differently on every run, making its scores non-reproducible.
dtree = DecisionTreeClassifier(random_state=123)

clifs = [svc, lsvc, rforest, dtree]

In [14]:
from tqdm import tqdm

In [15]:
# Train every candidate classifier and print a table of train/test accuracy.
# Header row and separator row of the table:
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
# NOTE: `clf` remains bound to the last entry of `clifs` once the loop finishes.
for clf in tqdm(clifs): 
    clf_name = clf.__class__.__name__
    train_acc, test_acc = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

| Classifier                | Training Accuracy | Test Accuracy |
| ------------------------- | ----------------- | ------------- |


 25%|█████████████████████                                                               | 1/4 [00:35<01:47, 35.91s/it]

| SVC                       |         0.9067513 |     0.4512032 |


 50%|██████████████████████████████████████████                                          | 2/4 [00:39<00:34, 17.05s/it]

| LinearSVC                 |         0.9988302 |     0.5768717 |


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [01:10<00:23, 23.22s/it]

| RandomForestClassifier    |         0.9988302 |     0.5541444 |


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:20<00:00, 20.10s/it]

| DecisionTreeClassifier    |         0.9988302 |     0.4618984 |





In [16]:
# Count how many samples carry each raw one-hot label string.
label_freq = Counter(label for label, _ in data)

# Print the labels and their counts, most frequent first (ties keep the order
# in which the labels first appear in the data).
for raw_label in sorted(label_freq, key=label_freq.get, reverse=True):
    print("{:10}({})  {}".format(convert_label(raw_label, emotions), raw_label, label_freq[raw_label]))

joy       (1. 0. 0. 0. 0. 0. 0.)  1084
anger     (0. 0. 1. 0. 0. 0. 0.)  1080
sadness   (0. 0. 0. 1. 0. 0. 0.)  1079
fear      (0. 1. 0. 0. 0. 0. 0.)  1078
disgust   (0. 0. 0. 0. 1. 0. 0.)  1057
guilt     (0. 0. 0. 0. 0. 0. 1.)  1057
shame     (0. 0. 0. 0. 0. 1. 0.)  1045


In [23]:
# Emoji printed for each predicted emotion ("shame" and "guilt" share a glyph).
emoji_dict = {"joy":"😂", "fear":"😱", "anger":"😠", "sadness":"😢", "disgust":"😒", "shame":"😳", "guilt":"😳"}
t1 = "This looks so impressive"
t2 = "I have a fear of dogs"
t3 = "My dog died yesterday"
t4 = "I don't love you anymore..!"

texts = [t1, t2, t3, t4]

# Bind the model explicitly instead of relying on whatever `clf` the training
# loop happened to leave behind (its last iteration was `dtree`). LinearSVC had
# the best recorded test accuracy, so `lsvc` is the natural alternative.
clf = dtree
for text in texts:
    feature_counts = create_feature(text, nrange=(1, 4))
    # One counter is one sample; wrap it so the vectorizer gets a one-row batch.
    feature_matrix = vectorizer.transform([feature_counts])
    prediction = clf.predict(feature_matrix)[0]
    print(text, emoji_dict[prediction])

This looks so impressive 😳
I have a fear of dogs 😱
My dog died yesterday 😢
I don't love you anymore..! 😂


In [24]:
from sklearn.feature_extraction.text import HashingVectorizer
# Small stand-alone corpus for trying out feature hashing.
corpus = ['This is the first document.', 
          'This document is the second document.',
          'And this is the third one.', 
          'Is this the first document?']
# Use a dedicated name: rebinding `vectorizer` here would replace the fitted
# DictVectorizer that the emotion-prediction cell depends on.
hashing_vectorizer = HashingVectorizer(n_features=2**4)
X = hashing_vectorizer.fit_transform(corpus)
print(X.shape)

(4, 16)


In [25]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
X

<4x16 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [26]:
# print() on the sparse matrix lists every stored (row, column) entry with its value.
print(X)

  (0, 0)	-0.5773502691896258
  (0, 8)	-0.5773502691896258
  (0, 13)	0.5773502691896258
  (0, 14)	0.0
  (1, 0)	-0.8164965809277261
  (1, 11)	0.4082482904638631
  (1, 13)	0.4082482904638631
  (1, 14)	0.0
  (2, 4)	-0.7071067811865475
  (2, 5)	0.7071067811865475
  (2, 13)	0.0
  (2, 14)	0.0
  (3, 0)	-0.5773502691896258
  (3, 8)	-0.5773502691896258
  (3, 13)	0.5773502691896258
  (3, 14)	0.0


In [27]:
from sklearn.cluster import KMeans
import numpy as np

# Six 2-D points: three with x = 1 and three with x = 10.
X = np.array([
    [1, 2], [1, 4], [1, 0],
    [10, 2], [10, 4], [10, 0],
])
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
# Cluster index assigned to each row of X.
kmeans.labels_

array([1, 1, 1, 0, 0, 0])

In [28]:
# Assign two new points to the closest of the fitted clusters.
kmeans.predict([[0, 0], [12, 3]])

array([1, 0])

In [29]:
# Coordinates of the two fitted cluster centres.
kmeans.cluster_centers_

array([[10.,  2.],
       [ 1.,  2.]])

In [30]:
from sklearn.cluster import MiniBatchKMeans
import numpy as np
X = np.array([
    [1, 2], [1, 4], [1, 0],
    [4, 2], [4, 0], [4, 4],
    [4, 5], [0, 1], [2, 2],
    [3, 2], [5, 5], [1, -1],
])
# Manually fit on two batches of six rows each.
kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, batch_size=6)
# partial_fit updates the estimator and returns it, so no reassignment is needed.
kmeans.partial_fit(X[:6, :])
kmeans.partial_fit(X[6:12, :])
kmeans.cluster_centers_

  "MiniBatchKMeans is known to have a memory leak on "


array([[2. , 1. ],
       [3.5, 4.5]])

In [31]:
# Assign two new points using the centres learned from the two manual batches.
kmeans.predict([[0, 0], [4, 4]])

array([0, 1])

In [32]:
# Fit on the whole data in one call; mini-batching is handled by the estimator.
kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, batch_size=6, max_iter=10)
kmeans.fit(X)
kmeans.cluster_centers_

  "MiniBatchKMeans is known to have a memory leak on "


array([[1.19354839, 1.22580645],
       [4.03125   , 2.46875   ]])

In [33]:
# Assign the same two points using the model fitted on the whole array.
kmeans.predict([[0, 0], [4, 4]])

array([0, 1])