## Let's import some basic packages

In [4]:
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub

In [5]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"

embed = hub.Module(module_url)

## And here's an basic example of how embeddings work

In [11]:
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embed(messages))

    for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
        print("Message: {}".format(messages[i]))
        print("Embedding size: {}".format(len(message_embedding)))
        message_embedding_snippet = ", ".join(
            (str(x) for x in message_embedding[:3])
        )
        print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

Message: Elephant
Embedding size: 512
Embedding: [0.04498474299907684, -0.05743392929434776, 0.0022114713210612535, ...]

Message: I am a sentence for which I would like to get its embedding.
Embedding size: 512
Embedding: [0.05568016692996025, -0.009607917629182339, 0.006246286444365978, ...]

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [0.038749393075704575, 0.0765201598405838, -0.0007945735123939812, ...]



In [13]:
print(message_embeddings)

print(message_embeddings.shape)

[[ 0.04498474 -0.05743393  0.00221147 ...  0.0654638  -0.00625258
  -0.06391631]
 [ 0.05568017 -0.00960792  0.00624629 ...  0.05877164  0.01460921
  -0.04916354]
 [ 0.03874939  0.07652016 -0.00079457 ... -0.02294365  0.05577222
  -0.03522219]]
(3, 512)


In [15]:
tf.keras.utils.to_categorical([1, 2])

array([[0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

## Process real data

#### get panda table

In [17]:
import pandas as pd

In [18]:
table = pd.read_csv("./data.csv")

In [25]:
table.head()

Unnamed: 0.1,Unnamed: 0,good,bad
0,0,The Adventures of Pinocchio,The Advent
1,1,CHAPTER 1,CHA
2,2,"How it happened that Mastro Cherry, carpenter,...","How it happened that Mastro Cherry, carpenter,..."
3,3,Centuries ago there lived--,Centuries ago there li
4,4,"""A king!"" my little readers will say immediately.","""A king!"""


#### get numpy array

In [67]:
good_list = table['good'].values
bad_list = table['bad'].values

print(good_list[0])
print(len(good_list))

print(bad_list[0])
print(len(bad_list))

The Adventures of Pinocchio
1792
The Advent
1792


In [68]:
sentences = np.append(good_list, bad_list)
print(len(sentences))

3584


In [69]:
print(np.ones(3))

print(np.ones(sentences.shape))

[1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]


In [71]:
labels = np.append(
    np.ones(good_list.shape),
    np.zeros(bad_list.shape)
)
print(len(labels))

print(labels[0])
print(labels[-1])

3584
1.0
0.0


#### shuffle `input array(sentences)` and `output array(labels)` 

In [65]:
from sklearn.utils import shuffle

a = [1, 2, 3]
b = [4, 5, 6]

x, y = shuffle(a, b)

print(x)
print(y)

[3, 2, 1]
[6, 5, 4]


In [73]:
shuffled_sentences, shuffled_labels = shuffle(sentences, labels)

print(shuffled_labels)

[0. 1. 1. ... 0. 1. 1.]


#### get final `output array(labels)` 

In [77]:
sentence_labels = tf.keras.utils.to_categorical(shuffled_labels)

print(final_labels)

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [0. 1.]
 [0. 1.]]


#### get final `input array(sentences)` 

In [75]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    sentence_embeddings = session.run(embed(sentences))

print(sentence_embeddings)

[[ 0.01768074  0.04292876  0.01221054 ...  0.06681651  0.0252389
  -0.03233733]
 [ 0.03537038  0.05040617 -0.05640709 ...  0.01962757 -0.03646285
  -0.03569299]
 [ 0.01679263  0.06597435 -0.04513824 ...  0.10513969  0.0446333
   0.03695172]
 ...
 [ 0.04453792  0.07351119  0.03394969 ...  0.09634716  0.03060382
  -0.03675625]
 [ 0.03939704  0.06095734  0.08138795 ...  0.01159485  0.00930689
  -0.0019066 ]
 [ 0.06539717 -0.04125939  0.01413171 ...  0.06832977 -0.01069579
   0.05199919]]


####  pickling all the data

In [78]:
import pickle

embeddings_file = "embeddings.pickle"
labels_file = "labels.pickle"

pickle.dump(sentence_embeddings, open(embeddings_file, 'wb'))
pickle.dump(sentence_labels, open(labels_file, 'wb'))