In [2]:
import numpy as np
from emo_utils import *
import emoji
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
X_train, Y_train = read_csv('emoji_ukr.csv')

In [4]:
for idx in range(10):
    print(X_train[idx], label_to_emoji(Y_train[idx]))

Французький макарон такий смачний 🍴
робота жахлива 😞
я засмучений 😞
кинути м'яч ⚾
Гарний жарт 😄
яка ваша улюблена гра в бейсбол ⚾
Я готувала м’ясо 🍴
припиніть возитися 😞
Я хочу китайську їжу 🍴
Давайте підемо грати в бейсбол ⚾


In [5]:
Y_oh_train = convert_to_one_hot(Y_train, C = 5)

In [6]:
idx = 50
print(f"Sentence '{X_train[idx]}' has label index {Y_train[idx]}, which is emoji {label_to_emoji(Y_train[idx])}", )
print(f"Label index {Y_train[idx]} in one-hot encoding format is {Y_oh_train[idx]}")

Sentence 'він зробив дивовижну роботу' has label index 2, which is emoji 😄
Label index 2 in one-hot encoding format is [0. 0. 1. 0. 0.]


In [7]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('fiction.lowercased.tokenized.glove.300d')

In [8]:
word = "батьківщина"
idx = 2898
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(idx) + "th word in the vocabulary is", index_to_word[idx])

the index of батьківщина in the vocabulary is 2733
the 2898th word in the vocabulary is безбожно


In [9]:
def sentence_to_avg(sentence, word_to_vec_map):
    words = sentence.lower().split()

    avg = np.zeros(word_to_vec_map['привіт'].shape)
    
    total = 0
    for w in words:
        try:
            total += word_to_vec_map[w]
        except:
            total += avg
    avg = total / len(words)
    
    return avg

In [10]:
avg = sentence_to_avg("Я люблю свою неньку", word_to_vec_map)
print("avg = \n", avg)

avg = 
 [-6.8779000e-02  1.9953000e-02 -2.9009950e-01  8.7509500e-02
  1.4155000e-02  2.7551550e-01  7.7740250e-02  6.6722750e-02
  1.4309525e-01 -1.6189925e-01 -3.7249525e-01 -1.2387125e-01
 -1.7837375e-01  1.0524475e+00  2.8254650e-01 -7.0859250e-02
 -1.0817500e-02 -2.9021650e-01  3.7671000e-02  2.6803300e-01
 -1.4620375e-01  2.6981500e-01 -1.7425450e-01 -1.5196750e-01
 -2.6725900e-01  3.0173000e-02 -3.8939900e-01 -8.9320000e-03
 -4.0813500e-01 -1.4401500e-01  3.5114375e-01 -3.3192025e-01
 -4.5284275e-01  2.2350500e-01  2.5134125e-01  3.1897275e-01
 -1.1154250e-01 -2.9278350e-01  1.1589250e-02  5.4871825e-01
 -6.3967175e-01  1.6656225e-01 -3.6987075e-01  1.6039700e-01
  1.9606150e-01 -3.0023325e-01  7.7144250e-02 -6.4164000e-02
  3.8528000e-02  8.9644750e-02 -1.9821750e-02  2.6080975e-01
 -9.0121000e-02  2.7288600e-01 -6.8264300e-01  1.1704150e-01
  2.0684475e-01 -3.4356425e-01  7.0288750e-02 -5.5021000e-02
 -3.6088750e-02 -6.4270750e-02 -2.2802775e-01  1.6983375e-01
  1.1958700e-01 

In [12]:
def model(X, Y, word_to_vec_map, learning_rate = 0.01, num_iterations = 400):
        
    np.random.seed(1)

    m = Y.shape[0]                          # number of training examples
    n_y = 5                                 # number of classes  
    n_h = 300                                # dimensions of the GloVe vectors 
    
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))
    
    Y_oh = convert_to_one_hot(Y, C = n_y) 
    
    for t in range(num_iterations): 
        for i in range(m):         
            
            avg = sentence_to_avg(X[i], word_to_vec_map)

            z = np.dot(W, avg) + b
            a = softmax(z)

            cost = -np.sum(Y_oh[i] * np.log(a))
            
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))
            db = dz

            W = W - learning_rate * dW
            b = b - learning_rate * db
        
        if t % 100 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(X, Y, W, b, word_to_vec_map) #predict is defined in emo_utils.py

    return pred, W, b

In [13]:
pred, W, b = model(X_train, Y_train, word_to_vec_map)
print(pred)

Epoch: 0 --- cost = 1.1038890515158137
Accuracy: 0.3989071038251366
Epoch: 100 --- cost = 0.40175629021942416
Accuracy: 0.9617486338797814
Epoch: 200 --- cost = 0.2621279296696463
Accuracy: 0.9836065573770492
Epoch: 300 --- cost = 0.18762609102630362
Accuracy: 0.9890710382513661
[[4.]
 [3.]
 [3.]
 [1.]
 [2.]
 [1.]
 [4.]
 [2.]
 [4.]
 [1.]
 [3.]
 [3.]
 [2.]
 [2.]
 [4.]
 [3.]
 [2.]
 [3.]
 [3.]
 [1.]
 [3.]
 [2.]
 [2.]
 [2.]
 [0.]
 [1.]
 [0.]
 [4.]
 [2.]
 [0.]
 [2.]
 [0.]
 [0.]
 [3.]
 [4.]
 [0.]
 [2.]
 [1.]
 [3.]
 [1.]
 [0.]
 [4.]
 [0.]
 [3.]
 [0.]
 [4.]
 [2.]
 [3.]
 [4.]
 [2.]
 [2.]
 [3.]
 [0.]
 [2.]
 [2.]
 [3.]
 [2.]
 [3.]
 [2.]
 [2.]
 [3.]
 [3.]
 [0.]
 [2.]
 [3.]
 [0.]
 [2.]
 [0.]
 [0.]
 [2.]
 [3.]
 [2.]
 [4.]
 [1.]
 [3.]
 [3.]
 [0.]
 [0.]
 [3.]
 [2.]
 [0.]
 [3.]
 [0.]
 [2.]
 [2.]
 [4.]
 [2.]
 [2.]
 [0.]
 [0.]
 [2.]
 [3.]
 [0.]
 [4.]
 [2.]
 [1.]
 [2.]
 [3.]
 [3.]
 [2.]
 [3.]
 [0.]
 [3.]
 [0.]
 [2.]
 [0.]
 [2.]
 [3.]
 [4.]
 [3.]
 [1.]
 [3.]
 [4.]
 [3.]
 [2.]
 [3.]
 [3.]
 [3.]
 [1.]
 [4.]


In [15]:
print("Training set:")
pred_train = predict(X_train, Y_train, W, b, word_to_vec_map)

Training set:
Accuracy: 0.994535519125683


In [36]:
X_my_sentences = np.array(["звучить так смішно", "я хочу їсти", "він кохає її", "оце веселий жарт", "пішли грати футбол", "тепер я сумний", "Тарас смішний хлопчина"])
Y_my_labels = np.array([[0], [0], [2], [1], [4],[3]])

pred = predict(X_my_sentences, Y_my_labels , W, b, word_to_vec_map)
print_predictions(X_my_sentences, pred)

Accuracy: 0.0

звучить так смішно 😄
я хочу їсти 🍴
він кохає її ❤️
оце веселий жарт 😄
пішли грати футбол ⚾
тепер я сумний 😞
Тарас смішний хлопчина 😄


# LSTM

In [34]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`